Repository: KaidiXu/auto_LiRPA Branch: master Commit: ca767f1d8c0a Files: 249 Total size: 1.5 MB Directory structure: gitextract_yr8n0etx/ ├── .github/ │ └── ISSUE_TEMPLATE/ │ └── bug_report.md ├── .gitignore ├── .readthedocs.yaml ├── CONTRIBUTORS ├── LICENSE ├── README.md ├── auto_LiRPA/ │ ├── __init__.py │ ├── backward_bound.py │ ├── beta_crown.py │ ├── bound_general.py │ ├── bound_multi_gpu.py │ ├── bound_op_map.py │ ├── bound_ops.py │ ├── bounded_tensor.py │ ├── concretize_bounds.py │ ├── concretize_func.py │ ├── cuda/ │ │ ├── cuda_kernels.cu │ │ └── cuda_utils.cpp │ ├── cuda_utils.py │ ├── edit_graph.py │ ├── eps_scheduler.py │ ├── forward_bound.py │ ├── interval_bound.py │ ├── jacobian.py │ ├── linear_bound.py │ ├── operators/ │ │ ├── __init__.py │ │ ├── activation_base.py │ │ ├── activations.py │ │ ├── add_sub.py │ │ ├── base.py │ │ ├── bivariate.py │ │ ├── clampmult.py │ │ ├── constant.py │ │ ├── convex_concave.py │ │ ├── convolution.py │ │ ├── cut_ops.py │ │ ├── dropout.py │ │ ├── dtype.py │ │ ├── gelu.py │ │ ├── indexing.py │ │ ├── jacobian.py │ │ ├── leaf.py │ │ ├── linear.py │ │ ├── logical.py │ │ ├── minmax.py │ │ ├── normalization.py │ │ ├── pooling.py │ │ ├── reduce.py │ │ ├── relu.py │ │ ├── reshape.py │ │ ├── resize.py │ │ ├── rnn.py │ │ ├── s_shaped.py │ │ ├── shape.py │ │ ├── slice_concat.py │ │ ├── softmax.py │ │ ├── solver_utils.py │ │ ├── tile.py │ │ └── trigonometric.py │ ├── opt_pruner.py │ ├── optimize_graph.py │ ├── optimized_bounds.py │ ├── output_constraints.py │ ├── parse_graph.py │ ├── patches.py │ ├── perturbations.py │ ├── solver_module.py │ ├── tools.py │ ├── utils.py │ └── wrapper.py ├── doc/ │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── api.rst │ ├── conf.py │ ├── index.rst │ └── process.py ├── examples/ │ ├── .gitignore │ ├── __init__.py │ ├── language/ │ │ ├── .gitignore │ │ ├── Transformer/ │ │ │ ├── Transformer.py │ │ │ ├── __init__.py │ │ │ ├── modeling.py │ │ │ └── utils.py │ │ ├── data_utils.py │ │ ├── language_utils.py │ │ ├── lstm.py │ │ ├── oracle.py │ │ ├── preprocess/ │ │ │ ├── pre_compute_lm_scores.py │ │ │ └── preprocess_sst.py │ │ └── train.py │ ├── sequence/ │ │ ├── .gitignore │ │ ├── __init__.py │ │ ├── data_utils.py │ │ ├── lstm.py │ │ └── train.py │ ├── simple/ │ │ ├── invprop.py │ │ ├── lp_full.py │ │ ├── mip_lp_solver.py │ │ ├── models/ │ │ │ └── spectral_NOR_MLP_B.pth │ │ └── toy.py │ └── vision/ │ ├── .gitignore │ ├── bound_option.py │ ├── cifar_training.py │ ├── custom_op.py │ ├── data/ │ │ ├── .gitignore │ │ ├── ImageNet64/ │ │ │ └── imagenet_data_loader.py │ │ └── tinyImageNet/ │ │ ├── .gitignore │ │ └── tinyimagenet_download.sh │ ├── datasets.py │ ├── efficient_convolution.py │ ├── imagenet_training.py │ ├── jacobian.py │ ├── models/ │ │ ├── __init__.py │ │ ├── densenet.py │ │ ├── densenet_imagenet.py │ │ ├── densenet_no_bn.py │ │ ├── feedforward.py │ │ ├── mobilenet.py │ │ ├── resnet.py │ │ ├── resnet18.py │ │ ├── resnext.py │ │ ├── resnext_imagenet64.py │ │ ├── vnncomp_resnet.py │ │ ├── wide_resnet_cifar.py │ │ └── wide_resnet_imagenet64.py │ ├── pretrained/ │ │ ├── cifar_2c2f.pth │ │ ├── kw_mnist.pth │ │ ├── mnist_a_adv.pth │ │ ├── mnist_cnn_small.pth │ │ ├── mnist_fc_3layer.pth │ │ └── test_min_max.pth │ ├── save_intermediate_bound.py │ ├── simple_training.py │ ├── simple_verification.py │ ├── tinyimagenet_training.py │ ├── verify_two_node.py │ └── weight_perturbation_training.py ├── setup.py └── tests/ ├── .gitignore ├── data/ │ ├── .gitignore │ ├── avgpool_test_data │ ├── beta_crown_test_data │ ├── bound_ops_data │ 
├── ckpt_lstm │ ├── ckpt_transformer │ ├── constant_test_data │ ├── conv1d_test_data_3-0-2 │ ├── conv1d_test_data_3-0-3 │ ├── conv1d_test_data_3-1-2 │ ├── conv1d_test_data_3-1-3 │ ├── conv1d_test_data_4-0-2 │ ├── conv1d_test_data_4-0-3 │ ├── conv1d_test_data_4-1-2 │ ├── conv1d_test_data_4-1-3 │ ├── distinct_patches_test_data │ ├── invprop/ │ │ ├── ood.onnx │ │ ├── ood_reference │ │ └── simple_reference │ ├── jacobian_test_data │ ├── language_test_data │ ├── maxpool_test_data_3-0-3-0 │ ├── maxpool_test_data_3-0-3-1 │ ├── maxpool_test_data_3-1-3-0 │ ├── maxpool_test_data_3-1-3-1 │ ├── maxpool_test_data_4-0-4-0 │ ├── maxpool_test_data_4-0-4-1 │ ├── maxpool_test_data_4-1-4-0 │ ├── maxpool_test_data_4-1-4-1 │ ├── min_max_test_data │ ├── rectangle_patches_test_data │ ├── resnet_patches_test_data │ ├── s_shape_test_data │ ├── test_constrained_concretize │ ├── test_general_shape_data │ ├── test_perturbation_data │ ├── test_save_data │ ├── vision_clip_test_data │ ├── vision_test_data │ └── weight_perturbation_test_data ├── data_64/ │ ├── avgpool_test_data │ ├── bound_ops_data │ ├── constant_test_data │ ├── conv1d_test_data_3-0-2 │ ├── conv1d_test_data_3-0-3 │ ├── conv1d_test_data_3-1-2 │ ├── conv1d_test_data_3-1-3 │ ├── conv1d_test_data_4-0-2 │ ├── conv1d_test_data_4-0-3 │ ├── conv1d_test_data_4-1-2 │ ├── conv1d_test_data_4-1-3 │ ├── general_shape_data │ ├── invprop/ │ │ ├── ood_reference │ │ └── simple_reference │ ├── jacobian_test_data │ ├── maxpool_test_data_3-0-3-0 │ ├── maxpool_test_data_3-0-3-1 │ ├── maxpool_test_data_3-1-3-0 │ ├── maxpool_test_data_3-1-3-1 │ ├── maxpool_test_data_4-0-4-0 │ ├── maxpool_test_data_4-0-4-1 │ ├── maxpool_test_data_4-1-4-0 │ ├── maxpool_test_data_4-1-4-1 │ ├── min_max_test_data │ ├── rectangle_patches_test_data │ ├── resnet_patches_test_data │ ├── s_shape_test_data │ ├── test_constrained_concretize │ ├── test_general_shape_data │ ├── test_save_data │ ├── vision_clip_test_data │ ├── vision_test_data │ └── weight_perturbation_test_data ├── test_1d_activation.py ├── test_2d_activation.py ├── test_avgpool.py ├── test_bound_ops.py ├── test_branching_heuristics.py ├── test_clip_domains.py ├── test_constant.py ├── test_constrained_concretize.py ├── test_conv.py ├── test_conv1d.py ├── test_distinct_patches.py ├── test_examples.py ├── test_examples_ci.py ├── test_general_nonlinear.py ├── test_general_shape.py ├── test_identity.py ├── test_invprop.py ├── test_jacobian.py ├── test_language_models.py ├── test_linear_cnn_model.py ├── test_linear_model.py ├── test_maxpool.py ├── test_min_max.py ├── test_perturbation.py ├── test_rectangle_patches.py ├── test_resnet_patches.py ├── test_s_shaped.py ├── test_save_intermediate.py ├── test_simple_verification.py ├── test_state_dict_name.py ├── test_tensor_storage.py ├── test_upsample.py ├── test_vision_models.py ├── test_vision_models_hardtanh.py ├── test_weight_perturbation.py └── testcase.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: Bug report about: Create a report to help us improve title: '' labels: '' assignees: '' --- **Describe the bug** A clear and concise description of what the bug is. **To Reproduce** Please provide us with the following to receive timely help: 1. A minimum example to reproduce the bug. Keep your code as short as possible but still directly runnable. 2. 
Model files, especially when the bug is only triggered on specific models. 3. **Complete** outputs of the program when the bug is triggered. Please do **not** just include the last few lines. If it's very long, you can use [PasteBin](https://pastebin.com/) or upload to a file-sharing service. 4. Detailed instructions to reproduce the problem. If you changed part of our tool, please rebase your changes to main branch and push your changes to a fork so we can investigate easier. Without the above information, you might not be able to receive timely help from us. **System configuration:** - OS: [e.g. Ubuntu 22.04. Windows and MacOS are not supported.] - Python version: [e.g., Python 3.8] - Pytorch Version: [e.g., PyTorch 1.12] - Hardware: [e.g., RTX 4090] - Have you tried to reproduce the problem in a cleanly created conda/virtualenv environment using official installation instructions and the latest code on the main branch?: [Yes/No] **Screenshots** If applicable, add screenshots to help explain your problem. **Additional context** Add any other context about the problem here. ================================================ FILE: .gitignore ================================================ tmp build __pycache__ *.egg-info dist *.swp *.swo *.log .trace_graph Verified_ret*.npy Verified-acc*.npy vnn-comp_*.npz *.tar.gz verifier_log_* .vscode/ *.pt .idea *.so release *.compiled .DS_Store *.out *.txt release release_abcrown cachier out.csv results.csv ================================================ FILE: .readthedocs.yaml ================================================ # Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details # Required version: 2 # Set the version of Python and other tools you might need build: os: ubuntu-20.04 tools: python: "3.11" # Build documentation in the docs/ directory with Sphinx sphinx: configuration: doc/conf.py # Optionally declare the Python requirements required to build your docs python: install: - method: pip path: . - requirements: doc/requirements.txt ================================================ FILE: CONTRIBUTORS ================================================ Team leaders: * Faculty: Huan Zhang (huan@huan-zhang.com), UIUC * Student: Xiangru Zhong (xiangru4@illinois.edu), UIUC Current developers (* indicates members of VNN-COMP 2025 team): * \*Duo Zhou (duozhou2@illinois.edu), UIUC * \*Keyi Shen (keyis2@illinois.edu), UIUC (graduated, now at Georgia Tech) * \*Hesun Chen (hesunc2@illinois.edu), UIUC * \*Haoyu Li (haoyuli5@illinois.edu), UIUC * \*Ruize Gao (ruizeg2@illinois.edu), UIUC * \*Hao Cheng (haoc539@illinois.edu), UIUC * Zhouxing Shi (zhouxingshichn@gmail.com), UCLA/UC Riverside * Lei Huang (leih5@illinois.edu), UIUC * Taobo Liao (taobol2@illinois.edu), UIUC * Jorge Chavez (jorgejc2@illinois.edu), UIUC Past developers: * Hongji Xu (hx84@duke.edu), Duke University (intern with Prof. 
Huan Zhang) * Christopher Brix (brix@cs.rwth-aachen.de), RWTH Aachen University * Hao Chen (haoc8@illinois.edu), UIUC * Keyu Lu (keyulu2@illinois.edu), UIUC * Kaidi Xu (kx46@drexel.edu), Drexel University * Sanil Chawla (schawla7@illinois.edu), UIUC * Linyi Li (linyi2@illinois.edu), UIUC * Zhuolin Yang (zhuolin5@illinois.edu), UIUC * Zhuowen Yuan (realzhuowen@gmail.com), UIUC * Qirui Jin (qiruijin@umich.edu), University of Michigan * Shiqi Wang (sw3215@columbia.edu), Columbia University * Yihan Wang (yihanwang@ucla.edu), UCLA * Jinqi (Kathryn) Chen (jinqic@cs.cmu.edu), CMU auto_LiRPA is currently supported in part by the National Science Foundation (NSF; award 2331967, 2525287), the AI2050 program at Schmidt Science, the Virtual Institute for Scientific Software (VISS) at Georgia Tech, the University Research Program at Toyota Research Institute (TRI), and a Mathworks research award. The team acknowledges the financial and advisory support from Prof. Zico Kolter (zkolter@cs.cmu.edu), Prof. Cho-Jui Hsieh (chohsieh@cs.ucla.edu), Prof. Suman Jana (suman@cs.columbia.edu), Prof. Bo Li (lbo@illinois.edu), and Prof. Xue Lin (xue.lin@northeastern.edu) during 2021 - 2023. ================================================ FILE: LICENSE ================================================ Copyright (C) 2021-2025 The α,β-CROWN Team See CONTRIBUTORS for the list of all contributors and their affiliations. Team leaders: Faculty: Huan Zhang (UIUC) Student: Xiangru Zhong (UIUC) Current developers: Duo Zhou (UIUC) Keyi Shen (UIUC/Georgia Tech) Hesun Chen (UIUC) Haoyu Li (UIUC) Ruize Gao (UIUC) Hao Cheng (UIUC) Zhouxing Shi (UCLA/UC Riverside) Lei Huang (UIUC) Taobo Liao (UIUC) Jorge Chavez (UIUC) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
================================================ FILE: README.md ================================================ # auto_LiRPA: Automatic Linear Relaxation based Perturbation Analysis for Neural Networks [![Documentation Status](https://readthedocs.org/projects/auto-lirpa/badge/?version=latest)](https://auto-lirpa.readthedocs.io/en/latest/?badge=latest) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](http://PaperCode.cc/AutoLiRPA-Demo) [![Video Introduction](https://img.shields.io/badge/play-video-red.svg)](http://PaperCode.cc/AutoLiRPA-Video) [![BSD license](https://img.shields.io/badge/License-BSD-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)

## What's New? - [α,β-CROWN](https://github.com/Verified-Intelligence/alpha-beta-CROWN.git) (using `auto_LiRPA` as its core library) is the winner of [VNN-COMP 2025](https://sites.google.com/view/vnn2025) and is **ranked top-1** in all [scored benchmarks](https://github.com/VNN-COMP/vnncomp2025_results/blob/main/SCORING-SMALL-TOL/latex/main.pdf). (08/2025) - Bounding of computation graphs containing Jacobian operators now supports more nonlinear operators (e.g., ```tanh```, ```sigmoid```), enabling verification of [continuous-time Lyapunov stability](https://github.com/Verified-Intelligence/Two-Stage_Neural_Controller_Training). (12/2025) - [α,β-CROWN](https://github.com/Verified-Intelligence/alpha-beta-CROWN.git) (using `auto_LiRPA` as its core library) is the winner of [VNN-COMP 2024](https://sites.google.com/view/vnn2024). Our tool is **ranked top-1** in all benchmarks (including 12 [regular track](https://github.com/ChristopherBrix/vnncomp2024_results/blob/main/SCORING/latex/results_regular_track.pdf) and 9 [extended track](https://github.com/ChristopherBrix/vnncomp2024_results/blob/main/SCORING/latex/results_extended_track.pdf) benchmarks). (08/2024) - The [INVPROP algorithm](https://arxiv.org/pdf/2302.01404.pdf) allows computing overapproximations of preimages (the set of inputs of an NN generating a given output set) and tightening bounds using output constraints. (03/2024) - Branch-and-bound support for non-ReLU and general nonlinearities ([GenBaB](https://arxiv.org/pdf/2405.21063)) with optimizable bounds (α-CROWN) for new nonlinear functions (sin, cos, GeLU). We achieve significant improvements on verifying neural networks with non-ReLU nonlinearities such as Transformers, LSTM, and [ML4ACOPF](https://github.com/AI4OPT/ml4acopf_benchmark). (09/2023) - [α,β-CROWN](https://github.com/Verified-Intelligence/alpha-beta-CROWN.git) ([alpha-beta-CROWN](https://github.com/Verified-Intelligence/alpha-beta-CROWN.git)) (using `auto_LiRPA` as its core library) **won** [VNN-COMP 2023](https://sites.google.com/view/vnn2023). (08/2023) - Bound computation for higher-order computational graphs to support bounding Jacobian, Jacobian-vector products, and [local Lipschitz constants](https://arxiv.org/abs/2210.07394). (11/2022) - Our neural network verification tool [α,β-CROWN](https://github.com/Verified-Intelligence/alpha-beta-CROWN.git) ([alpha-beta-CROWN](https://github.com/Verified-Intelligence/alpha-beta-CROWN.git)) (using `auto_LiRPA` as its core library) **won** [VNN-COMP 2022](https://sites.google.com/view/vnn2022). Our library supports the large CIFAR100, TinyImageNet and ImageNet models in VNN-COMP 2022. (09/2022) - Implementation of **general cutting planes** ([GCP-CROWN](https://arxiv.org/pdf/2208.05740.pdf)), support of more activation functions and improved performance and scalability. (09/2022) - Our neural network verification tool [α,β-CROWN](https://github.com/Verified-Intelligence/alpha-beta-CROWN.git) ([alpha-beta-CROWN](https://github.com/Verified-Intelligence/alpha-beta-CROWN.git)) **won** [VNN-COMP 2021](https://sites.google.com/view/vnn2021) **with the highest total score**, outperforming 11 SOTA verifiers. α,β-CROWN uses the `auto_LiRPA` library as its core bound computation library. (09/2021) - [Optimized CROWN/LiRPA](https://arxiv.org/pdf/2011.13824.pdf) bound (α-CROWN) for ReLU, **sigmoid**, **tanh**, and **maxpool** activation functions, which can significantly outperform regular CROWN bounds.
See [simple_verification.py](examples/vision/simple_verification.py#L59) for an example. (07/31/2021) - Handle split constraints for ReLU neurons ([β-CROWN](https://arxiv.org/pdf/2103.06624.pdf)) for complete verifiers. (07/31/2021) - A memory efficient GPU implementation of backward (CROWN) bounds for convolutional layers. (10/31/2020) - Certified defense models for downscaled ImageNet, TinyImageNet, CIFAR-10, LSTM/Transformer. (08/20/2020) - Adding support to **complex vision models** including DenseNet, ResNeXt and WideResNet. (06/30/2020) - **Loss fusion**, a technique that reduces training cost of tight LiRPA bounds (e.g. CROWN-IBP) to the same asymptotic complexity of IBP, making LiRPA based certified defense scalable to large datasets (e.g., TinyImageNet, downscaled ImageNet). (06/30/2020) - **Multi-GPU** support to scale LiRPA based training to large models and datasets. (06/30/2020) - Initial release. (02/28/2020) ## Introduction `auto_LiRPA` is a library for automatically deriving and computing bounds with linear relaxation based perturbation analysis (LiRPA) (e.g. [CROWN](https://arxiv.org/pdf/1811.00866.pdf) and [DeepPoly](https://files.sri.inf.ethz.ch/website/papers/DeepPoly.pdf)) for neural networks, which is a useful tool for formal robustness verification. We generalize existing LiRPA algorithms for feed-forward neural networks to a graph algorithm on general computational graphs, defined by PyTorch. Additionally, our implementation is also automatically **differentiable**, allowing optimizing network parameters to shape the bounds into certain specifications (e.g., certified defense). You can find [a video ▶️ introduction here](http://PaperCode.cc/AutoLiRPA-Video). Our library supports the following algorithms: * Backward mode LiRPA bound propagation ([CROWN](https://arxiv.org/pdf/1811.00866.pdf)/[DeepPoly](https://files.sri.inf.ethz.ch/website/papers/DeepPoly.pdf)) * Backward mode LiRPA bound propagation with optimized bounds ([α-CROWN](https://arxiv.org/pdf/2011.13824.pdf)) * Backward mode LiRPA bound propagation with split constraints ([β-CROWN](https://arxiv.org/pdf/2103.06624.pdf) for ReLU, and [GenBaB](https://arxiv.org/pdf/2405.21063) for general nonlinear functions) * Generalized backward mode LiRPA bound propagation with general cutting plane constraints ([GCP-CROWN](https://arxiv.org/pdf/2208.05740.pdf)) * Backward mode LiRPA bound propagation with bounds tightened using output constraints ([INVPROP](https://arxiv.org/pdf/2302.01404.pdf)) * Generalized backward mode LiRPA bound propagation for higher-order computational graphs ([Shi et al., 2022](https://arxiv.org/abs/2210.07394)) * Forward mode LiRPA bound propagation ([Xu et al., 2020](https://arxiv.org/pdf/2002.12920)) * Forward mode LiRPA bound propagation with optimized bounds (similar to [α-CROWN](https://arxiv.org/pdf/2011.13824.pdf)) * Interval bound propagation ([IBP](https://arxiv.org/pdf/1810.12715.pdf)) * Hybrid approaches, e.g., Forward+Backward, IBP+Backward ([CROWN-IBP](https://arxiv.org/pdf/1906.06316.pdf)), [α,β-CROWN](https://github.com/Verified-Intelligence/alpha-beta-CROWN.git) ([alpha-beta-CROWN](https://github.com/Verified-Intelligence/alpha-beta-CROWN.git)) * MIP/LP formulation of neural networks Our library allows automatic bound derivation and computation for general computational graphs, in a similar manner that gradients are obtained in modern deep learning frameworks -- users only define the computation in a forward pass, and `auto_LiRPA` traverses through the computational graph and 
derives bounds for any node on the graph. With `auto_LiRPA`, we free users from deriving and implementing LiRPA for most common tasks, and they can simply apply LiRPA as a tool for their own applications. This is especially useful for users who are not experts in LiRPA and cannot derive these bounds manually (LiRPA is significantly more complicated than backpropagation). ## Technical Background in 1 Minute Deep learning frameworks such as PyTorch represent neural networks (NN) as a computational graph, where each mathematical operation is a node and edges define the flow of computation:

Normally, the inputs of a computation graph (which defines a NN) are data and model weights, and PyTorch goes through the graph and produces model prediction (a bunch of numbers):

Our `auto_LiRPA` library conducts perturbation analysis on a computational graph, where the input data and model weights are defined within some user-defined ranges. We get guaranteed output ranges (bounds):

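For a concrete sense of what a guaranteed output range means, here is a minimal hand-worked sketch (plain PyTorch, independent of `auto_LiRPA`; the layer weights and input interval are arbitrary choices for illustration) that propagates an input interval through one linear layer followed by a ReLU. `auto_LiRPA` automates this kind of analysis on arbitrary computational graphs, and its linear-relaxation-based methods give much tighter bounds than this simple interval arithmetic:

```python
import torch

# Toy layer y = ReLU(W x + b); the input x is only known to lie in [x_L, x_U].
W = torch.tensor([[1.0, -2.0], [0.5, 1.0]])
b = torch.tensor([0.1, -0.3])
x_L = torch.tensor([-1.0, 0.0])  # elementwise lower bound of the input
x_U = torch.tensor([1.0, 0.5])   # elementwise upper bound of the input

# Interval arithmetic: positive weights take the input bound from the same side,
# negative weights take the bound from the opposite side.
W_pos, W_neg = W.clamp(min=0), W.clamp(max=0)
pre_L = W_pos @ x_L + W_neg @ x_U + b
pre_U = W_pos @ x_U + W_neg @ x_L + b

# ReLU is monotone, so the bounds pass through it directly.
y_L, y_U = pre_L.clamp(min=0), pre_U.clamp(min=0)
print(y_L, y_U)  # every x in [x_L, x_U] is guaranteed to give y in [y_L, y_U]
```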
## Installation Python 3.11+ and PyTorch 2.0+ are required. It is highly recommended to have a pre-installed PyTorch that matches your system and our version requirement (see [PyTorch Get Started](https://pytorch.org/get-started)). Then you can install `auto_LiRPA` via:
```bash
git clone https://github.com/Verified-Intelligence/auto_LiRPA
cd auto_LiRPA
pip install .
```
If you intend to modify this library, use `pip install -e .` instead. ## Quick Start First define your computation as a `nn.Module` and wrap it using `auto_LiRPA.BoundedModule()`. Then, you can call the `compute_bounds` function to obtain certified lower and upper bounds under input perturbations:
```python
from auto_LiRPA import BoundedModule, BoundedTensor, PerturbationLpNorm

# Define computation as a nn.Module.
class MyModel(nn.Module):
    def forward(self, x):
        # Define your computation here.

model = MyModel()
my_input = load_a_batch_of_data()
# Wrap the model with auto_LiRPA.
model = BoundedModule(model, my_input)
# Define perturbation. Here we add Linf perturbation to input data.
ptb = PerturbationLpNorm(norm=np.inf, eps=0.1)
# Make the input a BoundedTensor with the pre-defined perturbation.
my_input = BoundedTensor(my_input, ptb)
# Regular forward propagation using BoundedTensor works as usual.
prediction = model(my_input)
# Compute LiRPA bounds using the backward mode bound propagation (CROWN).
lb, ub = model.compute_bounds(x=(my_input,), method="backward")
```
Check out [examples/vision/simple_verification.py](examples/vision/simple_verification.py) for a complete but very basic example. We also provide a [Google Colab Demo](http://PaperCode.cc/AutoLiRPA-Demo) including an example of computing verification bounds for an 18-layer ResNet model on the CIFAR-10 dataset. Once the ResNet model is defined as usual in PyTorch, obtaining provable output bounds is as easy as obtaining gradients through autodiff. Bounds are efficiently computed on GPUs.
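The snippet above is only a template (`MyModel.forward` and `load_a_batch_of_data()` are placeholders). One way to expand it into a self-contained toy example is sketched below; the two-layer network, the random batch, and the `eps=0.1` radius are arbitrary illustrative choices, not part of the original example:

```python
import numpy as np
import torch
import torch.nn as nn
from auto_LiRPA import BoundedModule, BoundedTensor, PerturbationLpNorm

# A small fully-connected network, chosen only for illustration.
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 2))

    def forward(self, x):
        return self.net(x)

model = MyModel()
my_input = torch.randn(8, 4)  # a batch of 8 random inputs stands in for real data
# Wrap the model with auto_LiRPA, tracing it with a concrete input.
model = BoundedModule(model, my_input)
# Linf perturbation of radius 0.1 around each input element.
ptb = PerturbationLpNorm(norm=np.inf, eps=0.1)
my_input = BoundedTensor(my_input, ptb)
prediction = model(my_input)  # regular forward propagation still works
# Backward mode bound propagation (CROWN).
lb, ub = model.compute_bounds(x=(my_input,), method="backward")
print(lb.shape, ub.shape)  # both (8, 2): per-example lower/upper bounds on each output
```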
## More Working Examples We provide [a wide range of examples](doc/src/examples.md) of using `auto_LiRPA`: * [Basic Bound Computation on a Toy Neural Network (simplest example)](examples/simple/toy.py) * [Basic Bound Computation with **Robustness Verification** of Neural Networks as an example](doc/src/examples.md#basic-bound-computation-and-robustness-verification-of-neural-networks) * [MIP/LP Formulation of Neural Networks](examples/simple/mip_lp_solver.py) * [Basic **Certified Adversarial Defense** Training](doc/src/examples.md#basic-certified-adversarial-defense-training) * [Large-scale Certified Defense Training on **ImageNet**](doc/src/examples.md#certified-adversarial-defense-on-downscaled-imagenet-and-tinyimagenet-with-loss-fusion) * [Certified Adversarial Defense Training on Sequence Data with **LSTM**](doc/src/examples.md#certified-adversarial-defense-training-for-lstm-on-mnist) * [Certifiably Robust Language Classifier using **Transformers**](doc/src/examples.md#certifiably-robust-language-classifier-with-transformer-and-lstm) * [Certified Robustness against **Model Weight Perturbations**](doc/src/examples.md#certified-robustness-against-model-weight-perturbations-and-certified-defense) * [Bounding **Jacobian** and **local Lipschitz constants**](examples/vision/jacobian.py) * [Compute an Overapproximation of a Neural Network **Preimage**](examples/simple/invprop.py) `auto_LiRPA` has also been used in the following works: * [**α,β-CROWN for complete neural network verification**](https://github.com/Verified-Intelligence/alpha-beta-CROWN) * [**Fast certified robust training**](https://github.com/shizhouxing/Fast-Certified-Robust-Training) * [**Computing local Lipschitz constants**](https://github.com/shizhouxing/Local-Lipschitz-Constants) ## Full Documentation For more documentation, please refer to: * [Documentation homepage](https://auto-lirpa.readthedocs.io) * [API documentation](https://auto-lirpa.readthedocs.io/en/latest/api.html) * [Adding custom operators](https://auto-lirpa.readthedocs.io/en/latest/custom_op.html) * [Guide](https://auto-lirpa.readthedocs.io/en/latest/paper.html) for reproducing [our NeurIPS 2020 paper](https://arxiv.org/abs/2002.12920) ## Publications Please kindly cite our papers if you use the `auto_LiRPA` library. Full [BibTeX entries](doc/src/examples.md#bibtex-entries) can be found [here](doc/src/examples.md#bibtex-entries). The general LiRPA based bound propagation algorithm was originally proposed in our paper: * [Automatic Perturbation Analysis for Scalable Certified Robustness and Beyond](https://arxiv.org/pdf/2002.12920). NeurIPS 2020. Kaidi Xu\*, Zhouxing Shi\*, Huan Zhang\*, Yihan Wang, Kai-Wei Chang, Minlie Huang, Bhavya Kailkhura, Xue Lin, Cho-Jui Hsieh (\* Equal contribution) The `auto_LiRPA` library is further extended to support: * Optimized bounds (α-CROWN): [Fast and Complete: Enabling Complete Neural Network Verification with Rapid and Massively Parallel Incomplete Verifiers](https://arxiv.org/pdf/2011.13824.pdf). ICLR 2021. Kaidi Xu\*, Huan Zhang\*, Shiqi Wang, Yihan Wang, Suman Jana, Xue Lin and Cho-Jui Hsieh (\* Equal contribution). * Split constraints (β-CROWN): [Beta-CROWN: Efficient Bound Propagation with Per-neuron Split Constraints for Complete and Incomplete Neural Network Verification](https://arxiv.org/pdf/2103.06624.pdf). NeurIPS 2021. Shiqi Wang\*, Huan Zhang\*, Kaidi Xu\*, Suman Jana, Xue Lin, Cho-Jui Hsieh and Zico Kolter (\* Equal contribution).
* General constraints (GCP-CROWN): [GCP-CROWN: General Cutting Planes for Bound-Propagation-Based Neural Network Verification](https://arxiv.org/abs/2208.05740). Huan Zhang\*, Shiqi Wang\*, Kaidi Xu\*, Linyi Li, Bo Li, Suman Jana, Cho-Jui Hsieh and Zico Kolter (\* Equal contribution). * Higher-order computational graphs (Lipschitz constants and Jacobian): [Efficiently Computing Local Lipschitz Constants of Neural Networks via Bound Propagation](https://arxiv.org/abs/2210.07394). NeurIPS 2022. Zhouxing Shi, Yihan Wang, Huan Zhang, Zico Kolter, Cho-Jui Hsieh. * Branch-and-bound for non-ReLU and general nonlinear functions (GenBaB): [Neural Network Verification with Branch-and-Bound for General Nonlinearities](https://arxiv.org/pdf/2405.21063). TACAS 2025. Zhouxing Shi\*, Qirui Jin\*, Zico Kolter, Suman Jana, Cho-Jui Hsieh, Huan Zhang (\* Equal contribution). * Tightening of bounds and preimage computation using the INVPROP algorithm: [Provably Bounding Neural Network Preimages](https://arxiv.org/pdf/2302.01404.pdf). NeurIPS 2023. Suhas Kotha\*, Christopher Brix\*, Zico Kolter, Krishnamurthy (Dj) Dvijotham\*\*, Huan Zhang\*\* (\* Equal contribution; \*\* Equal advising). Certified training (verification-aware training by optimizing bounds) using `auto_LiRPA` is improved with: * Much shorter warmup schedule and faster training: [Fast Certified Robust Training with Short Warmup](https://arxiv.org/pdf/2103.17268.pdf). NeurIPS 2021. Zhouxing Shi\*, Yihan Wang\*, Huan Zhang, Jinfeng Yi and Cho-Jui Hsieh (\* Equal contribution). * Training-time branch-and-bound: [Certified Training with Branch-and-Bound: A Case Study on Lyapunov-stable Neural Control](https://arxiv.org/abs/2411.18235). Zhouxing Shi, Cho-Jui Hsieh, and Huan Zhang. ## Developers and Copyright Team leaders: * Faculty: Huan Zhang (huan@huan-zhang.com), UIUC * Student: Xiangru Zhong (xiangru4@illinois.edu), UIUC Current developers (* indicates members of VNN-COMP 2025 team): * \*Duo Zhou (duozhou2@illinois.edu), UIUC * \*Keyi Shen (keyis2@illinois.edu), UIUC (graduated, now at Georgia Tech) * \*Hesun Chen (hesunc2@illinois.edu), UIUC * \*Haoyu Li (haoyuli5@illinois.edu), UIUC * \*Ruize Gao (ruizeg2@illinois.edu), UIUC * \*Hao Cheng (haoc539@illinois.edu), UIUC * Zhouxing Shi (zhouxingshichn@gmail.com), UCLA/UC Riverside * Lei Huang (leih5@illinois.edu), UIUC * Taobo Liao (taobol2@illinois.edu), UIUC * Jorge Chavez (jorgejc2@illinois.edu), UIUC Past developers: * Hongji Xu (hx84@duke.edu), Duke University (intern with Prof. Huan Zhang) * Christopher Brix (brix@cs.rwth-aachen.de), RWTH Aachen University * Hao Chen (haoc8@illinois.edu), UIUC * Keyu Lu (keyulu2@illinois.edu), UIUC * Kaidi Xu (kx46@drexel.edu), Drexel University * Sanil Chawla (schawla7@illinois.edu), UIUC * Linyi Li (linyi2@illinois.edu), UIUC * Zhuolin Yang (zhuolin5@illinois.edu), UIUC * Zhuowen Yuan (realzhuowen@gmail.com), UIUC * Qirui Jin (qiruijin@umich.edu), University of Michigan * Shiqi Wang (sw3215@columbia.edu), Columbia University * Yihan Wang (yihanwang@ucla.edu), UCLA * Jinqi (Kathryn) Chen (jinqic@cs.cmu.edu), CMU `auto_LiRPA` is currently supported in part by the National Science Foundation (NSF; award 2331967, 2525287), the AI2050 program at Schmidt Science, the Virtual Institute for Scientific Software (VISS) at Georgia Tech, the University Research Program at Toyota Research Institute (TRI), and a Mathworks research award. 
We thank the [commits](https://github.com/Verified-Intelligence/auto_LiRPA/commits) and [pull requests](https://github.com/Verified-Intelligence/auto_LiRPA/pulls) from community contributors. Our library is released under the BSD 3-Clause license. ================================================ FILE: auto_LiRPA/__init__.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### from .bound_general import BoundedModule from .bound_multi_gpu import BoundDataParallel from .bounded_tensor import BoundedTensor, BoundedParameter from .perturbations import PerturbationLpNorm, PerturbationSynonym, PerturbationLinear from .wrapper import CrossEntropyWrapper, CrossEntropyWrapperMultiInput from .bound_op_map import register_custom_op, unregister_custom_op __version__ = '0.7.0' ================================================ FILE: auto_LiRPA/backward_bound.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### import os import torch from torch import Tensor from collections import deque from tqdm import tqdm from .patches import Patches from .utils import * from .bound_ops import * import warnings from typing import TYPE_CHECKING, List if TYPE_CHECKING: from .bound_general import BoundedModule def batched_backward(self: 'BoundedModule', node, C, unstable_idx, batch_size, bound_lower=True, bound_upper=True, return_A=None): if return_A is None: return_A = self.return_A output_shape = node.output_shape[1:] dim = int(prod(output_shape)) if unstable_idx is None: unstable_idx = torch.arange(dim, device=self.device) dense = True else: dense = False unstable_size = get_unstable_size(unstable_idx) print(f'Batched CROWN: node {node}, unstable size {unstable_size}') crown_batch_size = self.bound_opts['crown_batch_size'] auto_batch_size = AutoBatchSize(self.bound_opts['crown_batch_size'], self.device, vram_ratio=self.bound_opts['batched_crown_max_vram_ratio']) ret = [] ret_A = {} # if return_A, we will store A here i = 0 torch.cuda.empty_cache() with tqdm(total=unstable_size) as pbar: while i < unstable_size: crown_batch_size = auto_batch_size.batch_size if isinstance(unstable_idx, tuple): unstable_idx_batch = tuple( u[i : i + crown_batch_size] for u in unstable_idx ) unstable_size_batch = len(unstable_idx_batch[0]) else: unstable_idx_batch = unstable_idx[i : i + crown_batch_size] unstable_size_batch = len(unstable_idx_batch) auto_batch_size.record_actual_batch_size(unstable_size_batch) if node.patches_start and node.mode == "patches": assert C is None or C.type == 'Patches' C_batch = Patches(shape=[ unstable_size_batch, batch_size, *node.output_shape[1:-2], 1, 1], identity=1, unstable_idx=unstable_idx_batch, output_shape=[batch_size, *node.output_shape[1:]]) elif C.type == 'OneHot': assert isinstance(node, (BoundLinear, BoundMatMul)) C_batch = OneHotC( [batch_size, unstable_size_batch, *node.output_shape[1:]], self.device, unstable_idx_batch, None) else: assert C is None or C.type == 'eye' C_batch = torch.zeros([1, unstable_size_batch, dim], device=self.device) C_batch[0, torch.arange(unstable_size_batch), unstable_idx_batch] = 1.0 C_batch = C_batch.expand(batch_size, -1, -1).view( batch_size, unstable_size_batch, *output_shape) # overwrite return_A options to run backward general ori_return_A_option = self.return_A self.return_A = return_A batch_ret = self.backward_general( node, C_batch, bound_lower=bound_lower, bound_upper=bound_upper, average_A=False, need_A_only=False, unstable_idx=unstable_idx_batch) ret.append(batch_ret[:2]) if len(batch_ret) > 2: # A found, we merge A batch_A = batch_ret[2] ret_A = merge_A(node, batch_A, ret_A) # restore return_A options self.return_A = ori_return_A_option pbar.update(unstable_size_batch) i += unstable_size_batch auto_batch_size.update() if bound_lower: lb = torch.cat([item[0].view(batch_size, -1) for item in ret], dim=1) if dense: # In this case, restore_sparse_bounds will not be called. # And thus we restore the shape here. lb = lb.reshape(batch_size, *output_shape) else: lb = None if bound_upper: ub = torch.cat([item[1].view(batch_size, -1) for item in ret], dim=1) if dense: # In this case, restore_sparse_bounds will not be called. # And thus we restore the shape here. 
ub = ub.reshape(batch_size, *output_shape) else: ub = None if return_A: return lb, ub, ret_A else: return lb, ub def backward_general( self: 'BoundedModule', bound_node, C, start_backpropagation_at_node = None, bound_lower=True, bound_upper=True, average_A=False, need_A_only=False, unstable_idx=None, update_mask=None, apply_output_constraints_to: Optional[List[str]] = None, initial_As: Optional[dict] = None, initial_lb: Optional[torch.tensor] = None, initial_ub: Optional[torch.tensor] = None, ): use_beta_crown = self.bound_opts['optimize_bound_args']['enable_beta_crown'] tighten_input_bounds = ( self.bound_opts['optimize_bound_args']['tighten_input_bounds'] ) if self.invprop_enabled(): self.invprop_init_infeasible_bounds(bound_node, C) if bound_node.are_output_constraints_activated_for_layer(apply_output_constraints_to): return self.backward_general_invprop( initial_As=initial_As, initial_lb=initial_lb, initial_ub=initial_ub, bound_node=bound_node, C=C, start_backpropagation_at_node=start_backpropagation_at_node, bound_lower=bound_lower, bound_upper=bound_upper, average_A=average_A, need_A_only=need_A_only, unstable_idx=unstable_idx, update_mask=update_mask ) roots = self.roots() if start_backpropagation_at_node is None: # When output constraints are used, backward_general_with_output_constraint() # adds additional layers at the end, performs the backpropagation through these, # and then calls backward_general() on the output layer. # In this case, the layer we start from (start_backpropagation_at_node) differs # from the layer that should be bounded (bound_node) # When output constraints are not used, the bounded node is the one where # backpropagation starts. start_backpropagation_at_node = bound_node if self.verbose: logger.debug(f'Bound backward from {start_backpropagation_at_node.__class__.__name__}({start_backpropagation_at_node.name}) ' f'to bound {bound_node.__class__.__name__}({bound_node.name})') if isinstance(C, BatchedCrownC): logger.debug(f' C: {C}') elif C is not None: logger.debug(f' C: shape {C.shape}, type {type(C)}') _print_time = bool(os.environ.get('AUTOLIRPA_PRINT_TIME', 0)) if isinstance(C, BatchedCrownC): # If C is a str, use batched CROWN. If batched CROWN is not intended to # be enabled, C must be a explicitly provided non-str object for this function. if need_A_only or average_A: raise ValueError( 'Batched CROWN is not compatible with ' f'need_A_only={need_A_only}, average_A={average_A}') ret = self.batched_backward( bound_node, C, unstable_idx, batch_size=roots[0].value.shape[0], bound_lower=bound_lower, bound_upper=bound_upper, ) bound_node.lower, bound_node.upper = ret[:2] return ret for n in self.nodes(): n.lA = n.uA = None degree_out = get_degrees(start_backpropagation_at_node) C, batch_size, output_dim, output_shape = self._preprocess_C(C, bound_node) if initial_As is None: start_backpropagation_at_node.lA = C if bound_lower else None start_backpropagation_at_node.uA = C if bound_upper else None else: for layer_name, (lA, uA) in initial_As.items(): self[layer_name].lA = lA self[layer_name].uA = uA assert start_backpropagation_at_node.lA is not None or start_backpropagation_at_node.uA is not None if initial_lb is None: lb = torch.tensor(0., device=self.device) else: lb = initial_lb if initial_ub is None: ub = torch.tensor(0., device=self.device) else: ub = initial_ub # Save intermediate layer A matrices when required. 
A_record = {} queue = deque([start_backpropagation_at_node]) while len(queue) > 0: l = queue.popleft() # backward from l if l.name in self.root_names: continue # if all the succeeds are done, then we can turn to this node in the # next iteration. for l_pre in l.inputs: degree_out[l_pre.name] -= 1 if degree_out[l_pre.name] == 0: queue.append(l_pre) # Initially, l.lA or l.uA will be set to C for this node. if l.lA is not None or l.uA is not None: if self.verbose: logger.debug(f' Bound backward to {l} (out shape {l.output_shape})') if l.lA is not None: logger.debug(' lA type %s shape %s', type(l.lA), list(l.lA.shape)) if l.uA is not None: logger.debug(' uA type %s shape %s', type(l.uA), list(l.uA.shape)) if _print_time: start_time = time.time() self.backward_from[l.name].append(bound_node) if not l.perturbed: if not hasattr(l, 'forward_value'): self.get_forward_value(l) lb, ub = add_constant_node(lb, ub, l) continue if l.zero_uA_mtx and l.zero_lA_mtx: # A matrices are all zero, no need to propagate. continue lA, uA = l.lA, l.uA if (l.name != start_backpropagation_at_node.name and use_beta_crown and getattr(l, 'sparse_betas', None)): lA, uA, lbias, ubias = self.beta_crown_backward_bound( l, lA, uA, start_node=start_backpropagation_at_node) lb = lb + lbias ub = ub + ubias if isinstance(l, BoundOptimizableActivation): # For other optimizable activation functions (TODO: unify with ReLU). if bound_node.name != self.final_node_name: start_shape = bound_node.output_shape[1:] else: start_shape = C.shape[0] l.preserve_mask = update_mask else: start_shape = None A, lower_b, upper_b = l.bound_backward( lA, uA, *l.inputs, start_node=bound_node, unstable_idx=unstable_idx, start_shape=start_shape) # After propagation through this node, we delete its lA, uA variables. if bound_node.name != self.final_name: del l.lA, l.uA if _print_time: torch.cuda.synchronize() time_elapsed = time.time() - start_time if time_elapsed > 5e-3: print(l, time_elapsed) if lb.ndim > 0 and type(lower_b) == Tensor and self.conv_mode == 'patches': lb, ub, lower_b, upper_b = check_patch_biases(lb, ub, lower_b, upper_b) lb = lb + lower_b ub = ub + upper_b if self.return_A and self.needed_A_dict and bound_node.name in self.needed_A_dict: # FIXME remove [0][0] and [0][1]? if len(self.needed_A_dict[bound_node.name]) == 0 or l.name in self.needed_A_dict[bound_node.name]: # A could be either patches (in this case we cannot transpose so directly return) # or matrix (in this case we transpose) A_record.update({ l.name: { "lA": ( A[0][0].detach() if isinstance(A[0][0], Patches) else A[0][0].transpose(0, 1).detach() ) if A[0][0] is not None else None, "uA": ( A[0][1].detach() if isinstance(A[0][1], Patches) else A[0][1].transpose(0, 1).detach() ) if A[0][1] is not None else None, # When not used, lb or ub is tensor(0). "lbias": lb.transpose(0, 1).detach() if lb.ndim > 1 else None, "ubias": ub.transpose(0, 1).detach() if ub.ndim > 1 else None, "unstable_idx": unstable_idx }}) # FIXME: solve conflict with the following case self.A_dict.update({bound_node.name: A_record}) if need_A_only and set(self.needed_A_dict[bound_node.name]) == set(A_record.keys()): # We have collected all A matrices we need. We can return now! self.A_dict.update({bound_node.name: A_record}) # Do not concretize to save time. We just need the A matrices. 
# return A matrix as a dict: {node_start.name: [A_lower, A_upper]} return None, None, self.A_dict for i, l_pre in enumerate(l.inputs): add_bound(l, l_pre, lA=A[i][0], uA=A[i][1]) if lb.ndim >= 2: lb = lb.transpose(0, 1) if ub.ndim >= 2: ub = ub.transpose(0, 1) # TODO merge into `concretize` if (self.cut_used and getattr(self, 'cut_module', None) is not None and self.cut_module.x_coeffs is not None): # propagate input neuron in cut constraints roots[0].lA, roots[0].uA = self.cut_module.input_cut( bound_node, roots[0].lA, roots[0].uA, roots[0].lower.size()[1:], unstable_idx, batch_mask=update_mask) lb, ub = self.concretize_bounds( node=bound_node, lower=lb, upper=ub, concretize_mode='backward', batch_size=batch_size, output_dim=output_dim, average_A=average_A, clip_neuron_selection_value=self.clip_neuron_selection_value, clip_neuron_selection_type=self.clip_neuron_selection_type ) if self.return_A and self.needed_A_dict and bound_node.name in self.needed_A_dict: save_root_A( bound_node, A_record, self.A_dict, roots, self.needed_A_dict[bound_node.name], lb=lb, ub=ub, unstable_idx=unstable_idx) for root in self.roots(): # These are saved for `save_root_A`. We do not need them afterwards. root.lb = root.ub = None if tighten_input_bounds and isinstance(bound_node, BoundInput): shape = bound_node.perturbation.x_L.shape lb_reshaped = lb.reshape(shape) bound_node.perturbation.x_L = lb_reshaped - lb_reshaped.detach() + torch.max(bound_node.perturbation.x_L.detach(), lb_reshaped.detach()) ub_reshaped = ub.reshape(shape) bound_node.perturbation.x_U = ub_reshaped - ub_reshaped.detach() + torch.min(bound_node.perturbation.x_U.detach(), ub_reshaped.detach()) lb = lb.view(batch_size, *output_shape) if bound_lower else None ub = ub.view(batch_size, *output_shape) if bound_upper else None # TODO merge into `concretize` if (self.cut_used and getattr(self, "cut_module", None) is not None and self.cut_module.cut_bias is not None): # propagate cut bias in cut constraints lb, ub = self.cut_module.bias_cut(bound_node, lb, ub, unstable_idx, batch_mask=update_mask) if lb is not None and ub is not None and ((lb-ub)>0).sum().item() > 0: # make sure there is no bug for cut constraints propagation print(f"Warning: lb is larger than ub with diff: {(lb-ub)[(lb-ub)>0].max().item()}") if self.verbose: logger.debug('') if self.invprop_enabled(): lb, ub = self.invprop_check_infeasible_bounds(lb, ub) if self.return_A: if self.bound_opts['clip_in_alpha_crown'] and self.final_name in self.A_dict.keys(): for v in self.A_dict[self.final_name].values(): if v["lA"] is not None: self.constraints_optimized = (v["lA"], v["lbias"]) return lb, ub, self.A_dict else: return lb, ub def get_unstable_size(unstable_idx): if isinstance(unstable_idx, tuple): return unstable_idx[0].numel() else: return unstable_idx.numel() def check_optimized_variable_sparsity(self: 'BoundedModule', node): alpha_sparsity = None # unknown, optimizable variables are not created for this node. for relu in self.relus: # FIXME: this hardcoded for ReLUs. Need to support other optimized nonlinear functions. # alpha_lookup_idx is only created for sparse-spec alphas. if relu.alpha_lookup_idx is not None and node.name in relu.alpha_lookup_idx: if relu.alpha_lookup_idx[node.name] is not None: # This node was created with sparse alpha alpha_sparsity = True elif self.bound_opts['optimize_bound_args']['use_shared_alpha']: # Shared alpha, the spec dimension is 1, and sparsity can be supported. 
alpha_sparsity = True else: alpha_sparsity = False break return alpha_sparsity def get_sparse_C(self: 'BoundedModule', node, ref_intermediate): (sparse_intermediate_bounds, ref_intermediate_lb, ref_intermediate_ub) = ref_intermediate sparse_conv_intermediate_bounds = self.bound_opts.get('sparse_conv_intermediate_bounds', False) minimum_sparsity = self.bound_opts.get('minimum_sparsity', 0.9) crown_batch_size = self.bound_opts.get('crown_batch_size', 1e9) dim = int(prod(node.output_shape[1:])) batch_size = self.batch_size reduced_dim = False # Only partial neurons (unstable neurons) are bounded. unstable_idx = None unstable_size = np.inf newC = None alpha_is_sparse = self.check_optimized_variable_sparsity(node) # NOTE: batched CROWN is so far only supported for some of the cases below # FIXME: C matrix shape incorrect for BoundParams. if (isinstance(node, BoundLinear) or isinstance(node, BoundMatMul)) and int( os.environ.get('AUTOLIRPA_USE_FULL_C', 0)) == 0: if sparse_intermediate_bounds: # If we are doing bound refinement and reference bounds are given, # we only refine unstable neurons. # Also, if we are checking against LP solver we will refine all # neurons and do not use this optimization. # For each batch element, we find the unstable neurons. unstable_idx, unstable_size = self.get_unstable_locations( ref_intermediate_lb, ref_intermediate_ub) if unstable_size == 0: # Do nothing, no bounds will be computed. reduced_dim = True unstable_idx = [] elif unstable_size > crown_batch_size: # Create C in batched CROWN newC = BatchedCrownC('OneHot') reduced_dim = True elif (((0 < unstable_size <= minimum_sparsity * dim and alpha_is_sparse is None) or alpha_is_sparse) and len(node.output_shape) <= 2): # When we already have sparse alpha for this layer, we always # use sparse C. Otherwise we determine it by sparsity. # Create an abstract C matrix, the unstable_idx are the non-zero # elements in specifications for all batches. # Shouldn't use OneHotC if the output is not a 1-d tensor. newC = OneHotC( [batch_size, unstable_size, *node.output_shape[1:]], self.device, unstable_idx, None) reduced_dim = True else: unstable_idx = None del ref_intermediate_lb, ref_intermediate_ub if not reduced_dim: if dim > crown_batch_size: newC = BatchedCrownC('eye') else: newC = eyeC([batch_size, dim, *node.output_shape[1:]], self.device) elif node.patches_start and node.mode == "patches": if sparse_intermediate_bounds: unstable_idx, unstable_size = self.get_unstable_locations( ref_intermediate_lb, ref_intermediate_ub, conv=True) if unstable_size == 0: # Do nothing, no bounds will be computed. reduced_dim = True unstable_idx = [] elif unstable_size > crown_batch_size: # Create C in batched CROWN newC = BatchedCrownC('Patches') reduced_dim = True # We sum over the channel direction, so need to multiply that. elif (sparse_conv_intermediate_bounds and unstable_size <= minimum_sparsity * dim and alpha_is_sparse is None) or alpha_is_sparse: # When we already have sparse alpha for this layer, we always # use sparse C. Otherwise we determine it by sparsity. # Create an abstract C matrix, the unstable_idx are the non-zero # elements in specifications for all batches. # The shape of patches is [unstable_size, batch, C, H, W]. 
newC = Patches( shape=[unstable_size, batch_size, *node.output_shape[1:-2], 1, 1], identity=1, unstable_idx=unstable_idx, output_shape=[batch_size, *node.output_shape[1:]]) reduced_dim = True else: unstable_idx = None del ref_intermediate_lb, ref_intermediate_ub # Here we create an Identity Patches object if not reduced_dim: newC = Patches( None, 1, 0, [node.output_shape[1], batch_size, *node.output_shape[2:], *node.output_shape[1:-2], 1, 1], 1, output_shape=[batch_size, *node.output_shape[1:]]) elif (isinstance(node, (BoundAdd, BoundSub)) and node.mode == "patches" and len(node.output_shape) >= 4): # FIXME: BoundAdd does not always have patches. Need to use a better way # to determine patches mode. # FIXME: We should not hardcode BoundAdd here! if sparse_intermediate_bounds: if crown_batch_size < 1e9: warnings.warn('Batched CROWN is not supported in this case') unstable_idx, unstable_size = self.get_unstable_locations( ref_intermediate_lb, ref_intermediate_ub, conv=True) if unstable_size == 0: # Do nothing, no bounds will be computed. reduced_dim = True unstable_idx = [] elif (sparse_conv_intermediate_bounds and unstable_size <= minimum_sparsity * dim and alpha_is_sparse is None) or alpha_is_sparse: # When we already have sparse alpha for this layer, we always # use sparse C. Otherwise we determine it by sparsity. num_channel = node.output_shape[-3] # Identity patch size: (ouc_c, 1, 1, 1, out_c, 1, 1). patches = ( torch.eye(num_channel, device=self.device, dtype=list(self.parameters())[0].dtype)).view( num_channel, 1, 1, 1, num_channel, 1, 1) # Expand to (out_c, 1, unstable_size, out_c, 1, 1). patches = patches.expand(-1, 1, node.output_shape[-2], node.output_shape[-1], -1, 1, 1) patches = patches[unstable_idx[0], :, unstable_idx[1], unstable_idx[2]] # Expand with the batch dimension. Final shape # (unstable_size, batch_size, out_c, 1, 1). patches = patches.expand(-1, batch_size, -1, -1, -1) newC = Patches( patches, 1, 0, patches.shape, unstable_idx=unstable_idx, output_shape=[batch_size, *node.output_shape[1:]]) reduced_dim = True else: unstable_idx = None del ref_intermediate_lb, ref_intermediate_ub if not reduced_dim: num_channel = node.output_shape[-3] # Identity patch size: (ouc_c, 1, 1, 1, out_c, 1, 1). patches = ( torch.eye(num_channel, device=self.device, dtype=list(self.parameters())[0].dtype)).view( num_channel, 1, 1, 1, num_channel, 1, 1) # Expand to (out_c, batch, out_h, out_w, out_c, 1, 1). patches = patches.expand(-1, batch_size, node.output_shape[-2], node.output_shape[-1], -1, 1, 1) newC = Patches(patches, 1, 0, patches.shape, output_shape=[ batch_size, *node.output_shape[1:]]) else: if sparse_intermediate_bounds: unstable_idx, unstable_size = self.get_unstable_locations( ref_intermediate_lb, ref_intermediate_ub) if unstable_size == 0: # Do nothing, no bounds will be computed. 
reduced_dim = True unstable_idx = [] elif unstable_size > crown_batch_size: # Create in C in batched CROWN newC = BatchedCrownC('eye') reduced_dim = True elif (unstable_size <= minimum_sparsity * dim and alpha_is_sparse is None) or alpha_is_sparse: newC = torch.zeros([1, unstable_size, dim], device=self.device) # Fill the corresponding elements to 1.0 newC[0, torch.arange(unstable_size), unstable_idx] = 1.0 newC = newC.expand(batch_size, -1, -1).view( batch_size, unstable_size, *node.output_shape[1:]) reduced_dim = True else: unstable_idx = None del ref_intermediate_lb, ref_intermediate_ub if not reduced_dim: if dim > 1000: warnings.warn( f"Creating an identity matrix with size {dim}x{dim} for node {node}. " "This may indicate poor performance for bound computation. " "If you see this message on a small network please submit " "a bug report.", stacklevel=2) if dim > crown_batch_size: newC = BatchedCrownC('eye') else: newC = torch.eye(dim, device=self.device).unsqueeze(0).expand( batch_size, -1, -1 ).view(batch_size, dim, *node.output_shape[1:]) return newC, reduced_dim, unstable_idx, unstable_size def restore_sparse_bounds(self: 'BoundedModule', node, unstable_idx, unstable_size, ref_intermediate, new_lower=None, new_upper=None): ref_intermediate_lb, ref_intermediate_ub = ref_intermediate[1:] batch_size = self.batch_size if unstable_size == 0: # No unstable neurons. Skip the update. node.lower = ref_intermediate_lb.detach().clone() node.upper = ref_intermediate_ub.detach().clone() else: if new_lower is None: new_lower = node.lower if new_upper is None: new_upper = node.upper # If we only calculated unstable neurons, we need to scatter the results back based on reference bounds. if isinstance(unstable_idx, tuple): lower = ref_intermediate_lb.detach().clone() upper = ref_intermediate_ub.detach().clone() # Conv layer with patches, the unstable_idx is a 3-element tuple for 3 indices (C, H,W) of unstable neurons. if len(unstable_idx) == 3: lower[:, unstable_idx[0], unstable_idx[1], unstable_idx[2]] = new_lower upper[:, unstable_idx[0], unstable_idx[1], unstable_idx[2]] = new_upper elif len(unstable_idx) == 4: lower[:, unstable_idx[0], unstable_idx[1], unstable_idx[2], unstable_idx[3]] = new_lower upper[:, unstable_idx[0], unstable_idx[1], unstable_idx[2], unstable_idx[3]] = new_upper else: # Other layers. lower = ref_intermediate_lb.detach().clone().reshape(batch_size, -1) upper = ref_intermediate_ub.detach().clone().reshape(batch_size, -1) lower[:, unstable_idx] = new_lower.view(batch_size, -1) upper[:, unstable_idx] = new_upper.view(batch_size, -1) node.lower = lower.view(batch_size, *node.output_shape[1:]) node.upper = upper.view(batch_size, *node.output_shape[1:]) def get_degrees(node_start): if not isinstance(node_start, list): node_start = [node_start] degrees = {} added = {} queue = deque() for node in node_start: queue.append(node) added[node.name] = True while len(queue) > 0: l = queue.popleft() for l_pre in l.inputs: degrees[l_pre.name] = degrees.get(l_pre.name, 0) + 1 if not added.get(l_pre.name, False): queue.append(l_pre) added[l_pre.name] = True return degrees def _preprocess_C(self: 'BoundedModule', C, node): if isinstance(C, Patches): if C.unstable_idx is None: # Patches have size (out_c, batch, out_h, out_w, c, h, w). 
if len(C.shape) == 7: out_c, batch_size, out_h, out_w = C.shape[:4] output_dim = out_c * out_h * out_w else: out_dim, batch_size, out_c, out_h, out_w = C.shape[:5] output_dim = out_dim * out_c * out_h * out_w else: # Patches have size (unstable_size, batch, c, h, w). output_dim, batch_size = C.shape[:2] else: batch_size, output_dim = C.shape[:2] # The C matrix specified by the user has shape (batch, spec) # but internally we have (spec, batch) format. if not isinstance(C, (eyeC, Patches, OneHotC)): C = C.transpose(0, 1).reshape( output_dim, batch_size, *node.output_shape[1:]) elif isinstance(C, eyeC): C = C._replace(shape=(C.shape[1], C.shape[0], *C.shape[2:])) elif isinstance(C, OneHotC): C = C._replace( shape=(C.shape[1], C.shape[0], *C.shape[2:]), index=C.index.transpose(0,-1), coeffs=None if C.coeffs is None else C.coeffs.transpose(0,-1)) if isinstance(C, Patches) and C.unstable_idx is not None: # Sparse patches; the output shape is (unstable_size, ). output_shape = [C.shape[0]] elif prod(node.output_shape[1:]) != output_dim and not isinstance(C, Patches): # For the output node, the shape of the bound follows C # instead of the original output shape # # TODO Maybe don't set node.lower and node.upper in this case? # Currently some codes still depend on node.lower and node.upper output_shape = [-1] else: # Generally, the shape of the bounds match the output shape of the node output_shape = node.output_shape[1:] return C, batch_size, output_dim, output_shape def addA(A1, A2): """ Add two A (each of them is either Tensor or Patches) """ if type(A1) == type(A2): return A1 + A2 elif type(A1) == Patches: return A1 + A2 elif type(A2) == Patches: return A2 + A1 else: raise NotImplementedError(f'Unsupported types for A1 ({type(A1)}) and A2 ({type(A2)}') def add_bound(node, node_pre, lA=None, uA=None): """ Propagate lA and uA to a preceding node. @param node: The current bounded node @param node_pre: An input of the current bounded node that needs lA, lbias ,etc. back propagated to it @param lA: lA matrix associated with the current bounded node @param uA: uA matrix associated with the current bounded node @return: """ if lA is not None: if node_pre.lA is None: # First A added to this node. node_pre.zero_lA_mtx = node.zero_backward_coeffs_l node_pre.lA = lA else: node_pre.zero_lA_mtx = node_pre.zero_lA_mtx and node.zero_backward_coeffs_l new_node_lA = addA(node_pre.lA, lA) node_pre.lA = new_node_lA if uA is not None: if node_pre.uA is None: # First A added to this node. node_pre.zero_uA_mtx = node_pre.zero_backward_coeffs_u node_pre.uA = uA else: node_pre.zero_uA_mtx = node_pre.zero_uA_mtx and node.zero_backward_coeffs_u node_pre.uA = addA(node_pre.uA, uA) def add_constant_node(lb, ub, node): new_lb = node.get_bias(node.lA, node.forward_value) new_ub = node.get_bias(node.uA, node.forward_value) if isinstance(lb, Tensor) and isinstance(new_lb, Tensor) and lb.ndim > 0 and lb.ndim != new_lb.ndim: new_lb = new_lb.reshape(lb.shape) if isinstance(ub, Tensor) and isinstance(new_ub, Tensor) and ub.ndim > 0 and ub.ndim != new_ub.ndim: new_ub = new_ub.reshape(ub.shape) lb = lb + new_lb # FIXME (09/16): shape for the bias of BoundConstant. 
ub = ub + new_ub return lb, ub def save_root_A(node, A_record, A_dict, roots, needed_A_dict, lb, ub, unstable_idx): root_A_record = {} for i in range(len(roots)): if roots[i].lA is None and roots[i].uA is None: continue if roots[i].name in needed_A_dict: if roots[i].lA is not None: if isinstance(roots[i].lA, Patches): _lA = roots[i].lA.detach() else: _lA = roots[i].lA.transpose(0, 1).detach() else: _lA = None if roots[i].uA is not None: if isinstance(roots[i].uA, Patches): _uA = roots[i].uA.detach() else: _uA = roots[i].uA.transpose(0, 1).detach() else: _uA = None # Include all the bias terms except the one concretized from the # current root node. lb_ = lb - roots[i].lb if (roots[i].lb is not None) else lb ub_ = ub - roots[i].ub if (roots[i].ub is not None) else ub root_A_record.update({roots[i].name: { "lA": _lA, "uA": _uA, # When not used, lb or ub is tensor(0). They have been transposed above. "lbias": lb_.detach() if lb_.ndim > 1 else None, "ubias": ub_.detach() if ub_.ndim > 1 else None, "unstable_idx": unstable_idx }}) root_A_record.update(A_record) # merge to existing A_record A_dict.update({node.name: root_A_record}) def select_unstable_idx(ref_intermediate_lb, ref_intermediate_ub, unstable_locs, max_crown_size): """When there are too many unstable neurons, only bound those with the loosest reference bounds.""" gap = ( ref_intermediate_ub[:, unstable_locs] - ref_intermediate_lb[:, unstable_locs]).sum(dim=0) indices = torch.argsort(gap, descending=True) indices_selected = indices[:max_crown_size] indices_selected, _ = torch.sort(indices_selected) print(f'{len(indices_selected)}/{len(indices)} unstable neurons selected for CROWN') return indices_selected def get_unstable_locations(self: 'BoundedModule', ref_intermediate_lb, ref_intermediate_ub, conv=False, channel_only=False): # FIXME (2023): This function should be a member class of the Bound object, since the # definition of unstable neurons depends on the activation function. max_crown_size = self.bound_opts.get('max_crown_size', int(1e9)) # For conv layer we only check the case where all neurons are active/inactive. unstable_masks = torch.logical_and(ref_intermediate_lb < 0, ref_intermediate_ub > 0) # For simplicity, merge unstable locations for all elements in this batch. TODO: use individual unstable mask. # It has shape (H, W) indicating if a neuron is unstable/stable. # TODO: so far we merge over the batch dimension to allow easier implementation. if channel_only: # Only keep channels with unstable neurons. Used for initializing alpha. unstable_locs = unstable_masks.sum(dim=(0,2,3)).bool() # Shape is consistent with linear layers: a list of unstable neuron channels (no batch dim). unstable_idx = unstable_locs.nonzero().squeeze(1) else: if not conv and unstable_masks.ndim > 2: # Flatten the conv layer shape. unstable_masks = unstable_masks.reshape(unstable_masks.size(0), -1) ref_intermediate_lb = ref_intermediate_lb.reshape(ref_intermediate_lb.size(0), -1) ref_intermediate_ub = ref_intermediate_ub.reshape(ref_intermediate_ub.size(0), -1) unstable_locs = unstable_masks.sum(dim=0).bool() if conv: # Now converting it to indices for these unstable nuerons. # These are locations (i,j) of unstable neurons. 
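# --- Illustrative sketch (not part of the library source) ---
# select_unstable_idx above keeps only the unstable neurons with the widest
# (loosest) reference bounds when their total count exceeds max_crown_size.
# Toy values with max_crown_size = 2:
import torch

lb = torch.tensor([[-1.0, -0.5, 0.2, -2.0]])
ub = torch.tensor([[0.5, 2.0, 0.8, 0.1]])
unstable_locs = torch.logical_and(lb < 0, ub > 0).sum(dim=0).bool()
gap = (ub[:, unstable_locs] - lb[:, unstable_locs]).sum(dim=0)
order = torch.argsort(gap, descending=True)
selected = torch.sort(order[:2]).values  # indices into the unstable subset
# --- End of sketch ---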
unstable_idx = unstable_locs.nonzero(as_tuple=True) else: unstable_idx = unstable_locs.nonzero().squeeze(1) unstable_size = get_unstable_size(unstable_idx) if unstable_size > max_crown_size: indices_seleted = select_unstable_idx( ref_intermediate_lb, ref_intermediate_ub, unstable_locs, max_crown_size) if isinstance(unstable_idx, tuple): unstable_idx = tuple(u[indices_seleted] for u in unstable_idx) else: unstable_idx = unstable_idx[indices_seleted] unstable_size = get_unstable_size(unstable_idx) return unstable_idx, unstable_size def get_alpha_crown_start_nodes( self: 'BoundedModule', node, c=None, share_alphas=False, final_node_name=None, ): """ Given a layer "node", return a list of following nodes after this node whose bounds will propagate through this node. Each element in the list is a tuple with 3 elements: (following_node_name, following_node_shape, unstable_idx) """ # When use_full_conv_alpha is True, conv layers do not share alpha. sparse_intermediate_bounds = self.bound_opts.get('sparse_intermediate_bounds', False) use_full_conv_alpha_thresh = self.bound_opts.get('use_full_conv_alpha_thresh', 512) start_nodes = [] for nj in self.backward_from[node.name]: # Pre-activation layers. unstable_idx = None use_sparse_conv = None # Whether a sparse-spec alpha is used for a conv output node. None for non-conv output node. use_full_conv_alpha = self.bound_opts.get('use_full_conv_alpha', False) # Find the indices of unstable neuron, used for create sparse-feature alpha. if (sparse_intermediate_bounds and isinstance(node, BoundOptimizableActivation) and nj.name != final_node_name and not share_alphas): # Create sparse optimization variables for intermediate neurons. # These are called "sparse-spec" alpha because we only create alpha only for # the intermediate of final output nodes whose bounds are needed. # "sparse-spec" alpha makes sense only for piece-wise linear functions. # For other intermediate nodes, there is no "unstable" or "stable" neuron. # FIXME: whether an layer has unstable/stable neurons should be in Bound obj. # FIXME: get_unstable_locations should be a member class of ReLU. if len(nj.output_name) == 1 and isinstance(self[nj.output_name[0]], (BoundRelu, BoundSignMerge, BoundMaxPool)): if ((isinstance(nj, (BoundLinear, BoundMatMul))) and int(os.environ.get('AUTOLIRPA_USE_FULL_C', 0)) == 0): # unstable_idx has shape [neuron_size_of_nj]. Batch dimension is reduced. unstable_idx, _ = self.get_unstable_locations(nj.lower, nj.upper) elif isinstance(nj, (BoundConv, BoundAdd, BoundSub, BoundBatchNormalization)) and nj.mode == 'patches': if nj.name in node.patch_size: # unstable_idx has shape [channel_size_of_nj]. Batch and spatial dimensions are reduced. unstable_idx, _ = self.get_unstable_locations( nj.lower, nj.upper, channel_only=not use_full_conv_alpha, conv=True) use_sparse_conv = False # alpha is shared among channels. Sparse-spec alpha in hw dimension not used. if use_full_conv_alpha and unstable_idx[0].size(0) > use_full_conv_alpha_thresh: # Too many unstable neurons. Using shared alpha per channel. unstable_idx, _ = self.get_unstable_locations( nj.lower, nj.upper, channel_only=True, conv=True) use_full_conv_alpha = False else: # Matrix mode for conv layers. Although the bound propagation started with patches mode, # when A matrix is propagated to this layer, it might become a dense matrix since patches # can be come very large after many layers. In this case, # unstable_idx has shape [c_out * h_out * w_out]. Batch dimension is reduced. 
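# --- Illustrative sketch (not part of the library source) ---
# For conv layers, get_unstable_locations above returns either per-neuron
# indices as a tuple of (C, H, W) index tensors, or, with channel_only=True,
# just the channels containing any unstable neuron (used when alpha is shared
# per channel). Toy shapes:
import torch

lb = -torch.rand(2, 3, 4, 4)            # (batch, C, H, W) reference lower bounds
ub = lb + 2 * torch.rand_like(lb)       # reference upper bounds
masks = torch.logical_and(lb < 0, ub > 0)
per_neuron = masks.sum(dim=0).bool().nonzero(as_tuple=True)    # (C_idx, H_idx, W_idx)
per_channel = masks.sum(dim=(0, 2, 3)).bool().nonzero().squeeze(1)
# --- End of sketch ---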
unstable_idx, _ = self.get_unstable_locations(nj.lower, nj.upper) use_sparse_conv = True # alpha is not shared among channels, and is sparse in spec dimension. else: # FIXME: we should not check for fixed names here. Need to enable patches mode more generally. if isinstance(nj, (BoundConv, BoundAdd, BoundSub, BoundBatchNormalization)) and nj.mode == 'patches': use_sparse_conv = False # Sparse-spec alpha can never be used, because it is not a ReLU activation. if nj.name == final_node_name: # Final layer, always the number of specs as the shape. size_final = self[final_node_name].output_shape[1:] if c is None else c.size(1) # The 4-th element indicates that this start node is the final node, # which may be utilized by operators that do not know the name of # the final node. start_nodes.append((final_node_name, size_final, None, True)) continue if share_alphas: # all intermediate neurons from the same layer share the same set of alphas. output_shape = 1 elif isinstance(node, BoundOptimizableActivation) and node.patch_size and nj.name in node.patch_size: # Patches mode. Use output channel size as the spec size. This still shares some alpha, but better than no sharing. if use_full_conv_alpha: # alphas not shared among channels, so the spec dim shape is c,h,w # The patch size is [out_ch, batch, out_h, out_w, in_ch, H, W]. We use out_ch as the output shape. output_shape = node.patch_size[nj.name][0], node.patch_size[nj.name][2], node.patch_size[nj.name][3] else: # The spec dim is c only, and is shared among h, w. output_shape = node.patch_size[nj.name][0] assert not sparse_intermediate_bounds or use_sparse_conv is False # Double check our assumption holds. If this fails, then we created wrong shapes for alpha. else: # Output is linear layer (use_sparse_conv = None), or patch converted to matrix (use_sparse_conv = True). assert not sparse_intermediate_bounds or use_sparse_conv is not False # Double check our assumption holds. If this fails, then we created wrong shapes for alpha. output_shape = nj.lower.shape[1:] # FIXME: for non-relu activations it's still expecting a prod. 
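# --- Illustrative sketch (not part of the library source) ---
# Each entry appended just below is a 4-tuple (node_name, spec_shape,
# unstable_idx, is_final_node); the 4th element is the final-node flag
# mentioned in the comments above. The names, shapes and indices here are
# purely hypothetical:
import torch

start_nodes_example = [
    ('/input.4', torch.Size([64]), torch.tensor([3, 10, 17]), False),  # sparse-spec alpha
    ('/output', 10, None, True),                                       # final node: spec count
]
# --- End of sketch ---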
start_nodes.append((nj.name, output_shape, unstable_idx, False)) return start_nodes def merge_A(node, batch_A, ret_A): for key0 in batch_A: if key0 not in ret_A: ret_A[key0] = {} for key1 in batch_A[key0]: value = batch_A[key0][key1] if key1 not in ret_A[key0]: # create: ret_A[key0].update({ key1: { "lA": value["lA"], "uA": value["uA"], "lbias": value["lbias"], "ubias": value["ubias"], "unstable_idx": value["unstable_idx"] } }) elif key0 == node.name: # merge: # the batch splitting only happens for current node, i.e., # for other nodes the returned lA should be the same across different batches # so no need to repeatly merge them exist = ret_A[key0][key1] if exist["unstable_idx"] is not None: if isinstance(exist["unstable_idx"], torch.Tensor): merged_unstable = torch.cat([ exist["unstable_idx"], value['unstable_idx']], dim=0) elif isinstance(exist["unstable_idx"], tuple): if exist["unstable_idx"]: merged_unstable = tuple([ torch.cat([exist["unstable_idx"][idx], value['unstable_idx'][idx]], dim=0) for idx in range(len(exist['unstable_idx']))] ) else: merged_unstable = None else: raise NotImplementedError( f'Unsupported type {type(exist["unstable_idx"])}') else: merged_unstable = None merge_dict = {"unstable_idx": merged_unstable} for name in ["lA", "uA"]: if exist[name] is not None: if isinstance(exist[name], torch.Tensor): # for matrix the spec dim is 1 merge_dict[name] = torch.cat([exist[name], value[name]], dim=1) else: assert isinstance(exist[name], Patches) # for patches the spec dim`is 0 merge_dict[name] = exist[name].create_similar( torch.cat([exist[name].patches, value[name].patches], dim=0), unstable_idx=merged_unstable ) else: merge_dict[name] = None for name in ["lbias", "ubias"]: if exist[name] is not None: # for bias the spec dim in 1 merge_dict[name] = torch.cat([exist[name], value[name]], dim=1) else: merge_dict[name] = None ret_A[key0][key1] = merge_dict return ret_A ================================================ FILE: auto_LiRPA/beta_crown.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### from collections import OrderedDict import numpy as np import torch from torch import Tensor from .patches import Patches, inplace_unfold from typing import TYPE_CHECKING if TYPE_CHECKING: from .bound_general import BoundedModule class SparseBeta: def __init__(self, shape, bias=False, betas=None, device='cpu'): self.device = device self.val = torch.zeros(shape) self.loc = torch.zeros(shape, dtype=torch.long, device=device) self.sign = torch.zeros(shape, device=device) self.bias = torch.zeros(shape, device=device) if bias else None if betas: for bi in range(len(betas)): if betas[bi] is not None: self.val[bi, :len(betas[bi])] = betas[bi] self.val = self.val.detach().to( device, non_blocking=True).requires_grad_() def apply_splits(self, history, key): loc_numpy = np.zeros(self.loc.shape, dtype=np.int32) sign_numpy = np.zeros(self.sign.shape) if self.bias is not None: bias_numpy = np.zeros(self.bias.shape) for bi in range(len(history)): # Add history splits. (layer, neuron) is the current decision. split_locs, split_coeffs = history[bi][key][:2] split_len = len(split_locs) if split_len > 0: sign_numpy[bi, :split_len] = split_coeffs loc_numpy[bi, :split_len] = split_locs if self.bias is not None: split_bias = history[bi][key][2] bias_numpy[bi, :split_len] = split_bias self.loc.copy_(torch.from_numpy(loc_numpy), non_blocking=True) self.sign.copy_(torch.from_numpy(sign_numpy), non_blocking=True) if self.bias is not None: self.bias.copy_(torch.from_numpy(bias_numpy), non_blocking=True) def get_split_nodes(self: 'BoundedModule'): self.split_nodes = [] self.split_activations = {} splittable_activations = self.get_splittable_activations() self._set_used_nodes(self[self.final_name]) for layer in self.layers_requiring_bounds: split_activations_ = [] for activation_name in layer.output_name: activation = self[activation_name] if activation in splittable_activations: split_activations_.append( (activation, activation.inputs.index(layer))) if split_activations_: if layer.lower is None and layer.upper is None: continue self.split_nodes.append(layer) self.split_activations[layer.name] = split_activations_ return self.split_nodes, self.split_activations def set_beta(self: 'BoundedModule', enable_opt_interm_bounds, parameters, lr_beta, lr_cut_beta, cutter, dense_coeffs_mask): """ Set betas, best_betas, coeffs, dense_coeffs_mask, best_coeffs, biases and best_biases. 
""" coeffs = None betas = [] best_betas = OrderedDict() # TODO compute only once self.nodes_with_beta = [] for node in self.split_nodes: if not hasattr(node, 'sparse_betas'): continue self.nodes_with_beta.append(node) if enable_opt_interm_bounds: for sparse_beta in node.sparse_betas.values(): if sparse_beta is not None: betas.append(sparse_beta.val) best_betas[node.name] = { beta_m: sparse_beta.val.detach().clone() for beta_m, sparse_beta in node.sparse_betas.items() } else: betas.append(node.sparse_betas[0].val) best_betas[node.name] = node.sparse_betas[0].val.detach().clone() # Beta has shape (batch, max_splits_per_layer) parameters.append({ 'params': [item for item in betas if item.numel() > 0], 'lr': lr_beta, 'batch_dim': 0}) if self.cut_used: self.set_beta_cuts(parameters, lr_cut_beta, betas, best_betas, cutter) return betas, best_betas, coeffs, dense_coeffs_mask def set_beta_cuts(self: 'BoundedModule', parameters, lr_cut_beta, betas, best_betas, cutter): # also need to optimize cut betas parameters.append({'params': self.cut_beta_params, 'lr': lr_cut_beta, 'batch_dim': 0}) betas += self.cut_beta_params best_betas['cut'] = [beta.detach().clone() for beta in self.cut_beta_params] if getattr(cutter, 'opt', False): parameters.append(cutter.get_parameters()) def reset_beta(self: 'BoundedModule', node, shape, betas, bias=False, start_nodes=None): # Create only the non-zero beta. For each layer, it is padded to maximal length. # We create tensors on CPU first, and they will be transferred to GPU after initialized. if self.bound_opts.get('enable_opt_interm_bounds', False): node.sparse_betas = { key: SparseBeta( shape, betas=[(betas[j][i] if betas[j] is not None else None) for j in range(len(betas))], device=self.device, bias=bias, ) for i, key in enumerate(start_nodes) } else: node.sparse_betas = [SparseBeta( shape, betas=betas, device=self.device, bias=bias)] def beta_crown_backward_bound(self: 'BoundedModule', node, lA, uA, start_node=None): """Update A and bias with Beta-CROWN. Must be explicitly called at the end of "bound_backward". """ # Regular Beta CROWN with single neuron split # Each split constraint only has single neuron (e.g., second ReLU neuron > 0). A = lA if lA is not None else uA lbias = ubias = 0 def _bias_unsupported(): raise NotImplementedError('Bias for beta not supported in this case.') if type(A) is Patches: if not self.bound_opts.get('enable_opt_interm_bounds', False): raise NotImplementedError('Sparse beta not supported in the patches mode') if node.sparse_betas[start_node.name].bias is not None: _bias_unsupported() # expand sparse_beta to full beta beta_values = (node.sparse_betas[start_node.name].val * node.sparse_betas[start_node.name].sign) beta_indices = node.sparse_betas[start_node.name].loc node.masked_beta = torch.zeros(2, *node.shape).reshape(2, -1).to(A.patches.dtype) node.non_deter_scatter_add( node.masked_beta, dim=1, index=beta_indices, src=beta_values.to(node.masked_beta.dtype)) node.masked_beta = node.masked_beta.reshape(2, *node.shape) # unfold the beta as patches, size (batch, out_h, out_w, in_c, H, W) A_patches = A.patches masked_beta_unfolded = inplace_unfold( node.masked_beta, kernel_size=A_patches.shape[-2:], padding=A.padding, stride=A.stride, inserted_zeros=A.inserted_zeros, output_padding=A.output_padding) if A.unstable_idx is not None: masked_beta_unfolded = masked_beta_unfolded.permute(1, 2, 0, 3, 4, 5) # After selection, the shape is (unstable_size, batch, in_c, H, W). 
masked_beta_unfolded = masked_beta_unfolded[A.unstable_idx[1], A.unstable_idx[2]] else: # Add the spec (out_c) dimension. masked_beta_unfolded = masked_beta_unfolded.unsqueeze(0) if node.alpha_beta_update_mask is not None: masked_beta_unfolded = masked_beta_unfolded[node.alpha_beta_update_mask] if uA is not None: uA = uA.create_similar(uA.patches + masked_beta_unfolded) if lA is not None: lA = lA.create_similar(lA.patches - masked_beta_unfolded) elif type(A) is Tensor: if self.bound_opts.get('enable_opt_interm_bounds', False): if node.sparse_betas[start_node.name].bias is not None: _bias_unsupported() # For matrix mode, beta is sparse. beta_values = ( node.sparse_betas[start_node.name].val * node.sparse_betas[start_node.name].sign ).expand(A.size(0), -1, -1) # node.single_beta_loc has shape [batch, max_single_split]. # Need to expand at the specs dimension. beta_indices = (node.sparse_betas[start_node.name].loc .unsqueeze(0).expand(A.size(0), -1, -1)) beta_bias = node.sparse_betas[start_node.name].bias else: # For matrix mode, beta is sparse. beta_values = ( node.sparse_betas[0].val * node.sparse_betas[0].sign ).expand(A.size(0), -1, -1) # self.single_beta_loc has shape [batch, max_single_split]. # Need to expand at the specs dimension. beta_indices = node.sparse_betas[0].loc.unsqueeze(0).expand(A.size(0), -1, -1) beta_bias = node.sparse_betas[0].bias # For conv layer, the last dimension is flattened in indices. beta_values = beta_values.to(A.dtype) if beta_bias is not None: beta_bias = beta_bias.expand(A.size(0), -1, -1) if node.alpha_beta_update_mask is not None: beta_indices = beta_indices[:, node.alpha_beta_update_mask] beta_values = beta_values[:, node.alpha_beta_update_mask] if beta_bias is not None: beta_bias = beta_bias[:, node.alpha_beta_update_mask] if uA is not None: uA = node.non_deter_scatter_add( uA.reshape(uA.size(0), uA.size(1), -1), dim=2, index=beta_indices, src=beta_values).view(uA.size()) if lA is not None: lA = node.non_deter_scatter_add( lA.reshape(lA.size(0), lA.size(1), -1), dim=2, index=beta_indices, src=beta_values.neg()).view(lA.size()) if beta_bias is not None: bias = (beta_values * beta_bias).sum(dim=-1) lbias = bias ubias = -bias else: raise RuntimeError(f"Unknown type {type(A)} for A") return lA, uA, lbias, ubias def print_optimized_beta(acts): masked_betas = [] for model in acts: masked_betas.append(model.masked_beta) if model.history_beta_used: print(f'{model.name} history beta', model.new_history_beta.squeeze()) if model.split_beta_used: print(f'{model.name} split beta:', model.split_beta.view(-1)) print(f'{model.name} bias:', model.split_bias) ================================================ FILE: auto_LiRPA/bound_general.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### import copy from typing import List import numpy as np import warnings from collections import OrderedDict, deque import torch from torch.nn import Parameter from .bound_op_map import bound_op_map from .bound_ops import * from .bounded_tensor import BoundedTensor, BoundedParameter from .parse_graph import parse_module from .perturbations import * from .utils import * from .patches import Patches from .optimized_bounds import default_optimize_bound_args warnings.simplefilter('once') class BoundedModule(nn.Module): """Bounded module with support for automatically computing bounds. Args: model (nn.Module): The original model to be wrapped by BoundedModule. global_input (tuple): A dummy input to the original model. The shape of the dummy input should be consistent with the actual input to the model except for the batch dimension. bound_opts (dict): Options for bounds. See `Bound Options `_. device (str or torch.device): Device of the bounded module. If 'auto', the device will be automatically inferred from the device of parameters in the original model or the dummy input. custom_ops (dict): A dictionary of custom operators. The dictionary maps operator names to their corresponding bound classes (subclasses of `Bound`). """ def __init__(self, model, global_input, bound_opts=None, device='auto', verbose=False, custom_ops=None): super().__init__() if isinstance(model, BoundedModule): for key in model.__dict__.keys(): setattr(self, key, getattr(model, key)) return self.ori_training = model.training if bound_opts is None: bound_opts = {} # Default options. default_bound_opts = { 'conv_mode': 'patches', 'sparse_intermediate_bounds': True, 'sparse_conv_intermediate_bounds': True, 'sparse_intermediate_bounds_with_ibp': True, 'sparse_features_alpha': True, 'sparse_spec_alpha': True, 'minimum_sparsity': 0.9, 'enable_opt_interm_bounds': False, 'crown_batch_size': np.inf, 'forward_refinement': False, 'forward_max_dim': int(1e9), # Do not share alpha for conv layers. 'use_full_conv_alpha': True, 'disabled_optimization': [], # Threshold for number of unstable neurons for each layer to disable # use_full_conv_alpha. 'use_full_conv_alpha_thresh': 512, 'verbosity': 1 if verbose else 0, 'optimize_graph': {'optimizer': None}, 'compare_crown_with_ibp': False, # Whether run an additional forward pass before computing bounds. 'forward_before_compute_bounds': False, 'clip_in_alpha_crown': False, # Whether to compute bounds for every node in the graph. # (rather than only the nodes whose intermediate bounds are needed.) 'bound_every_node': False, } default_bound_opts.update(bound_opts) self.bound_opts = default_bound_opts optimize_bound_args = copy.deepcopy(default_optimize_bound_args) optimize_bound_args.update( self.bound_opts.get('optimize_bound_args', {})) self.bound_opts.update({'optimize_bound_args': optimize_bound_args}) self.verbose = verbose self.custom_ops = custom_ops if custom_ops is not None else {} if device == 'auto': try: self.device = next(model.parameters()).device except StopIteration: # Model has no parameters. We use the device of input tensor. if isinstance(global_input, torch.Tensor): self.device = global_input.device elif isinstance(global_input, tuple): self.device = global_input[0].device else: raise NotImplementedError( # pylint: disable=raise-missing-from 'Unable to decide the device. 
Consider providing a ' '`device` argument to `BoundedModule` explicitly.') else: self.device = device self.global_input = tuple(unpack_inputs(global_input, device=self.device)) self.check_incompatible_nodes(model) self.conv_mode = self.bound_opts.get('conv_mode', 'patches') # Cached IBP results which may be reused self.ibp_lower, self.ibp_upper = None, None self.optimizable_activations = [] self.relus = [] # save relu layers for convenience self.layers_with_constraint = [] state_dict_copy = copy.deepcopy(model.state_dict()) object.__setattr__(self, 'ori_state_dict', state_dict_copy) model.to(self.device) output = model(*self.global_input) if not isinstance(output, torch.Tensor): raise TypeError( 'Output of the model is expected to be a single torch.Tensor. ' f'Actual type: {type(output)}') self.final_shape = output.shape self.bound_opts.update({'final_shape': self.final_shape}) self._convert(model, self.global_input) self._optimize_graph() # Compute forward_value and mark perturbed nodes self.forward(*self.global_input) self._expand_jacobian() self._check_patches_mode() self.next_split_hint = [] # Split hints, used in beta optimization. # Beta values for all intermediate bounds. # Set to None (not used) by default. self.best_intermediate_betas = None # Initialization value for intermediate betas. self.init_intermediate_betas = None # whether using cut self.cut_used = False # a placeholder for cut timestamp, which would be a non-positive int self.cut_timestamp = -1 # a placeholder to save the latest samplewise mask for # pruning-in-iteration optimization self.last_update_preserve_mask = None # If output constraints are used, it is possible that none of the possible # inputs satisfy them. In this case, the lower bounds will be set to +inf, # and the upper bounds to -inf. self.infeasible_bounds = None self.solver_model = None # Needed for output constraints - the output layer should not use them self.final_node().is_final_node = True self.dynamic = False # This is the topk ratio for half-naive, half-constrained concretization. # Please check for concretize_bounds.py for more details. self.clip_neuron_selection_type = 'ratio' self.clip_neuron_selection_value = -1.0 # A boolean tensor with shape (batchsize, ). It indicates if a batch is # infeasible when concretizing with constraints. # Always call `init_infeasible_bounds_constraints` function to initialize it. self.infeasible_bounds_constraints = None # This is designed for clipping during alpha-CROWN. # For each alpha-CROWN optimization iteration, the lA and lbias of the final layer # will be set as `constraints_optimized` for the next iteration. # Please check backward_bound.py and optimized_bound for more info. 
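# --- Illustrative sketch (not part of the library source) ---
# Minimal usage of the constructor documented above: wrap an ordinary nn.Module
# with a dummy input whose shape (apart from the batch dimension) matches real
# inputs. The tiny model and the options chosen here are arbitrary.
import torch
import torch.nn as nn
from auto_LiRPA import BoundedModule

model = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 2))
dummy_input = torch.zeros(1, 4)
lirpa_model = BoundedModule(model, dummy_input, bound_opts={'conv_mode': 'matrix'}, device='cpu')
# --- End of sketch ---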
self.constraints_optimized = None def nodes(self) -> List[Bound]: return self._modules.values() def get_enabled_opt_act(self): # Optimizable activations that are actually used and perturbed return [ n for n in self.optimizable_activations if n.used and n.perturbed and not getattr(n, 'is_linear_op', False) ] def get_optimizable_activations(self): for node in self.nodes(): if (isinstance(node, BoundOptimizableActivation) and node.optimizable and len(getattr(node, 'requires_input_bounds', [])) > 0 and node not in self.optimizable_activations): disabled = False for item in self.bound_opts.get('disable_optimization', []): if item.lower() in str(type(node)).lower(): disabled = True if disabled: logging.debug('Disabled optimization for %s', node) continue if node not in self.optimizable_activations: self.optimizable_activations.append(node) def get_perturbed_optimizable_activations(self): return [n for n in self.optimizable_activations if n.perturbed] def get_splittable_activations(self): """Activation functions that can be split during branch and bound.""" return [n for n in self.nodes() if n.perturbed and n.splittable and n.used] def get_layers_requiring_bounds(self): """Layer names whose intermediate layer bounds are required.""" intermediate_layers = [] tighten_input_bounds = ( self.bound_opts['optimize_bound_args']['tighten_input_bounds'] ) directly_optimize_layer_names = ( self.bound_opts['optimize_bound_args']['directly_optimize'] ) for node in self.nodes(): if node.name in directly_optimize_layer_names: intermediate_layers.append(node) if not node.used or not node.perturbed: continue for i in getattr(node, 'requires_input_bounds', []): input_node = node.inputs[i] if (input_node not in intermediate_layers and input_node.perturbed): # If not perturbed, it may not have the batch dimension. # So we do not include it, and it is unnecessary. intermediate_layers.append(input_node) if ( node.name in self.layers_with_constraint or (isinstance(node, BoundInput) and tighten_input_bounds) ): if node not in intermediate_layers: intermediate_layers.append(node) return intermediate_layers def check_incompatible_nodes(self, model): """Check whether the model has incompatible nodes that the conversion may be inaccurate""" node_types = [type(m) for m in list(model.modules())] if (torch.nn.Dropout in node_types and torch.nn.BatchNorm1d in node_types and any(input.shape[0] == 1 for input in self.global_input)): # In fact, we just need the input that is involved in the # dropout layer to have batch size larger than 1, but we don't know # which of them is, so we just check all of them. print('We cannot support torch.nn.Dropout and torch.nn.BatchNorm1d ' 'at the same time!') print('Suggest to use another dummy input which has batch size ' 'larger than 1 and set model to train() mode.') return if not self.ori_training and torch.nn.Dropout in node_types: print('Dropout operation CANNOT be parsed during conversion when ' 'the model is in eval() mode!') print('Set model to train() mode!') self.ori_training = True if self.ori_training and torch.nn.BatchNorm1d in node_types: print('BatchNorm1d may raise error during conversion when the model' ' is in train() mode!') print('Set model to eval() mode!') self.ori_training = False def non_deter_wrapper(self, op, *args, **kwargs): """Some operations are non-deterministic and deterministic mode will fail. 
So we temporary disable it.""" if self.bound_opts.get('deterministic', False): torch.use_deterministic_algorithms(False) ret = op(*args, **kwargs) if self.bound_opts.get('deterministic', False): torch.use_deterministic_algorithms(True) return ret def non_deter_scatter_add(self, *args, **kwargs): return self.non_deter_wrapper(torch.scatter_add, *args, **kwargs) def non_deter_index_select(self, *args, **kwargs): return self.non_deter_wrapper(torch.index_select, *args, **kwargs) def set_bound_opts(self, new_opts): for k, v in new_opts.items(): # assert v is not dict, 'only support change optimize_bound_args' if type(v) == dict: self.bound_opts[k].update(v) else: self.bound_opts[k] = v def set_gcp_relu_indicators(self, relu_layer_name, relu_indicators): """ Sets the GCP (Generalized Cutting Plane) relu indicators for the specified ReLU layer by name. Args: relu_layer_name (str): The name of the ReLU layer to update. relu_indicators (torch.Tensor): A tensor containing unstable relu indices or masks. """ # Search for the layer by name for m in self.relus: if m.name == relu_layer_name: # Set the indicators for the found ReLU layer m.gcp_unstable_relu_indicators = relu_indicators return # If not found, raise an error raise ValueError(f'No ReLU layer found with name {relu_layer_name}') @staticmethod def _get_A_norm(A): if not isinstance(A, (list, tuple)): A = (A, ) norms = [] for aa in A: if aa is not None: if isinstance(aa, Patches): aa = aa.patches norms.append(aa.abs().sum().item()) else: norms.append(None) return norms def __call__(self, *input, **kwargs): if 'method_opt' in kwargs: opt = kwargs['method_opt'] kwargs.pop('method_opt') else: opt = 'forward' for kwarg in [ 'disable_multi_gpu', 'no_replicas', 'get_property', 'node_class', 'att_name']: if kwarg in kwargs: kwargs.pop(kwarg) if opt == 'compute_bounds': return self.compute_bounds(**kwargs) else: return self.forward(*input, **kwargs) def register_parameter(self, name, param): r"""Adds a parameter to the module. The parameter can be accessed as an attribute using given name. Args: name (string): name of the parameter. The parameter can be accessed from this module using the given name param (Parameter): parameter to be added to the module. """ if '_parameters' not in self.__dict__: raise AttributeError( 'cannot assign parameter before Module.__init__() call') elif not isinstance(name, str): raise TypeError('parameter name should be a string. ' f'Got {torch.typename(name)}') elif name == '': raise KeyError('parameter name can\'t be empty string') elif hasattr(self, name) and name not in self._parameters: raise KeyError(f'attribute "{name}" already exists') if param is None: self._parameters[name] = None elif not isinstance(param, Parameter): raise TypeError( f'cannot assign "{torch.typename(param)}" object to ' f'parameter "{name}" ' '(torch.nn.Parameter or None required)') elif param.grad_fn: raise ValueError( f'Cannot assign non-leaf Tensor to parameter "{name}". Model ' 'parameters must be created explicitly. 
To express "{name}" ' 'as a function of another Tensor, compute the value in ' 'the forward() method.') else: self._parameters[name] = param def _named_members(self, get_members_fn, prefix='', recurse=True, remove_duplicate: bool = True, **kwargs): # pylint: disable=unused-argument r"""Helper method for yielding various names + members of modules.""" memo = set() modules = self.named_modules(prefix=prefix) if recurse else [ (prefix, self)] for module_prefix, module in modules: members = get_members_fn(module) for k, v in members: if v is None or v in memo: continue if remove_duplicate: memo.add(v) name = module_prefix + ('.' if module_prefix else '') + k # translate name to ori_name if name in self.node_name_map: name = self.node_name_map[name] yield name, v def train(self, mode=True): super().train(mode) for node in self.nodes(): node.train(mode=mode) def eval(self): super().eval() for node in self.nodes(): node.eval() def to(self, *args, **kwargs): # Moves and/or casts some attributes except pytorch will do by default. for node in self.nodes(): for attr in ['lower', 'upper', 'forward_value', 'd', 'lA',]: if hasattr(node, attr): this_attr = getattr(node, attr) if isinstance(this_attr, torch.Tensor): this_attr = this_attr.to(*args, **kwargs) setattr(node, attr, this_attr) if hasattr(node, 'interval'): # construct new interval this_attr = getattr(node, 'interval') setattr(node, 'interval', (this_attr[0].to( *args, **kwargs), this_attr[1].to(*args, **kwargs))) return super().to(*args, **kwargs) def __getitem__(self, name): module = self._modules[name] # We never create modules that are None, the assert fixes type hints assert module is not None return module def roots(self): return [self[name] for name in self.root_names] def final_node(self): return self[self.final_name] def get_forward_value(self, node): """ Recursively get `forward_value` for `node` and its parent nodes""" if getattr(node, 'forward_value', None) is not None: return node.forward_value inputs = [self.get_forward_value(inp) for inp in node.inputs] for inp in node.inputs: node.from_input = node.from_input or inp.from_input node.input_shape = inputs[0].shape if len(inputs) > 0 else None fv = node.forward(*inputs) if isinstance(fv, (torch.Size, tuple)): fv = torch.tensor(fv, device=self.device) node.forward_value = fv node.output_shape = fv.shape # In most cases, the batch dimension is just the first dimension # if the node depends on input. Otherwise if the node doesn't # depend on input, there is no batch dimension (default is -1). node.batch_dim = 0 if node.from_input else node.batch_dim # Unperturbed node but it is not a root node. # Save forward_value to value. (Can be used in forward bounds.) if not node.from_input and len(node.inputs) > 0: node.value = node.forward_value return fv def forward(self, *x, final_node_name=None, interm_bounds=None, clear_forward_only=False, reset_perturbed_nodes=True, cache_bounds=False): r"""Standard forward computation for the network. Args: x (tuple or None): Input to the model. final_node_name (str, optional): The name of the final node in the model. The value on the corresponding node will be returned. clear_forward_only (bool, default `False`): Whether only standard forward values stored on the nodes should be cleared. If `True`, only standard forward values stored on the nodes will be cleared. Otherwise, bound information on the nodes will also be cleared. reset_perturbed_nodes (bool, default `True`): Mark all perturbed nodes with input perturbations. 
When set to `True`, it may accidentally clear all .perturbed properties for intermediate nodes. Returns: output: The output of the model, or if `final_node_name` is not `None`, return the value on the corresponding node instead. """ self.set_input(*x, interm_bounds=interm_bounds, clear_forward_only=clear_forward_only, reset_perturbed_nodes=reset_perturbed_nodes, cache_bounds=cache_bounds) if final_node_name is None: final_node_name = self.output_name[0] return self.get_forward_value(self[final_node_name]) def _mark_perturbed_nodes(self, input): """Mark the graph nodes and determine which nodes need perturbation.""" # Set some of the input as perturbed if they are bounded objects any_perturbed = False for name, index in zip(self.input_name, self.input_index): if index is None: continue if isinstance(input[index], (BoundedTensor, BoundedParameter)): self[name].perturbed = True any_perturbed = True # If none of the inputs is a bounded object, set all of them as perturbed if not any_perturbed: for name, index in zip(self.input_name, self.input_index): if index is not None: self[name].perturbed = True degree_in = {} queue = deque() relus = [] # Initially the queue contains all "root" nodes. for key in self._modules.keys(): l = self[key] degree_in[l.name] = len(l.inputs) if degree_in[l.name] == 0: queue.append(l) # in_degree ==0 -> root node while len(queue) > 0: node = queue.popleft() # We set the relu here to ensure the list is sorted according to topological order. if isinstance(node, BoundRelu): relus.append(node) # Obtain all output node, and add the output nodes to the queue if # all its input nodes have been visited. # The initial "perturbed" property is set in BoundInput or # BoundParams object, depending on ptb. for name_next in node.output_name: node_next = self[name_next] if not node_next.never_perturbed: # The next node is perturbed if it is already perturbed, # or this node is perturbed. node_next.perturbed = node_next.perturbed or node.perturbed degree_in[name_next] -= 1 # all inputs of this node have been visited, # now put it in queue. if degree_in[name_next] == 0: queue.append(node_next) node.update_requires_input_bounds() self.relus = relus self.get_optimizable_activations() self.splittable_activations = self.get_splittable_activations() self.perturbed_optimizable_activations = ( self.get_perturbed_optimizable_activations()) return def _check_patches_mode(self): """Disable patches mode if there is no Conv node. This is a workaround (before a more general patches mode is implemented) to avoid issues relevant to the patches node, for complicated models without any Conv. 
""" has_conv = False for node in self.nodes(): if isinstance(node, (BoundConv, BoundConvTranspose, BoundConv2dGrad)): has_conv = True if not has_conv and self.conv_mode == 'patches': self.conv_mode = 'matrix' for node in self.nodes(): if getattr(node, 'mode', None) == 'patches': node.mode = 'matrix' def _clear_and_set_new( self, interm_bounds, clear_forward_only=False, reset_perturbed_nodes=True, cache_bounds=False, ): for l in self.nodes(): if hasattr(l, 'linear'): if isinstance(l.linear, tuple): for item in l.linear: del item delattr(l, 'linear') if hasattr(l, 'patch_size'): l.patch_size = {} if clear_forward_only: if hasattr(l, 'forward_value'): delattr(l, 'forward_value') else: for attr in ['interval', 'forward_value', 'd', 'lA', 'lower_d', 'upper_k']: if hasattr(l, attr): delattr(l, attr) if cache_bounds: l.move_lower_and_upper_bounds_to_cache() else: l.delete_lower_and_upper_bounds() for attr in ['zero_backward_coeffs_l', 'zero_backward_coeffs_u', 'zero_lA_mtx', 'zero_uA_mtx']: setattr(l, attr, False) # Given an interval here to make IBP/CROWN start from this node if interm_bounds is not None and l.name in interm_bounds.keys(): l.interval = tuple(interm_bounds[l.name][:2]) l.lower = interm_bounds[l.name][0] l.upper = interm_bounds[l.name][1] if l.lower is not None: l.lower = l.lower.detach().requires_grad_(False) if l.upper is not None: l.upper = l.upper.detach().requires_grad_(False) # Mark all nodes as non-perturbed except for weights. if reset_perturbed_nodes: if not hasattr(l, 'perturbation') or l.perturbation is None: l.perturbed = False # Clear operator-specific attributes l.clear() def set_input( self, *x, interm_bounds=None, clear_forward_only=False, reset_perturbed_nodes=True, cache_bounds=False, ): self._clear_and_set_new( interm_bounds=interm_bounds, clear_forward_only=clear_forward_only, reset_perturbed_nodes=reset_perturbed_nodes, cache_bounds=cache_bounds, ) inputs_unpacked = unpack_inputs(x) for name, index in zip(self.input_name, self.input_index): if index is None: continue node = self[name] node.value = inputs_unpacked[index] if isinstance(node.value, (BoundedTensor, BoundedParameter)): node.perturbation = node.value.ptb else: node.perturbation = None # Mark all perturbed nodes. if reset_perturbed_nodes: self._mark_perturbed_nodes(inputs_unpacked) def _get_node_input(self, nodesOP, nodesIn, node): ret = [] for i in range(len(node.inputs)): for op in nodesOP: if op.name == node.inputs[i]: ret.append(op.bound_node) break if len(ret) == i + 1: continue for io in nodesIn: if io.name == node.inputs[i]: ret.append(io.bound_node) break if len(ret) <= i: raise ValueError(f'cannot find inputs of node: {node.name}') return ret def _to(self, obj, dest, inplace=False): """ Move all tensors in the object to a specified dest (device or dtype). 
The inplace=True option is available for dict.""" if obj is None: return obj elif isinstance(obj, torch.Tensor): return obj.to(dest) elif isinstance(obj, Patches): return obj.patches.to(dest) elif isinstance(obj, tuple): return tuple([self._to(item, dest) for item in obj]) elif isinstance(obj, list): return list([self._to(item, dest) for item in obj]) elif isinstance(obj, dict): if inplace: for k, v in obj.items(): obj[k] = self._to(v, dest, inplace=True) return obj else: return {k: self._to(v, dest) for k, v in obj.items()} else: raise NotImplementedError(type(obj)) def _convert_nodes(self, model, global_input): r""" Returns: nodesOP (list): List of operator nodes nodesIn (list): List of input nodes nodesOut (list): List of output nodes template (object): Template to specify the output format """ global_input_cpu = self._to(global_input, 'cpu') if self.ori_training: model.train() else: model.eval() model.to('cpu') nodesOP, nodesIn, nodesOut, template = parse_module( model, global_input_cpu) model.to(self.device) for i in range(0, len(nodesIn)): if nodesIn[i].param is not None: nodesIn[i] = nodesIn[i]._replace( param=nodesIn[i].param.to(self.device)) # Convert input nodes and parameters. attr = {'device': self.device} for i, n in enumerate(nodesIn): if n.input_index is not None: nodesIn[i] = nodesIn[i]._replace(bound_node=BoundInput( ori_name=nodesIn[i].ori_name, value=global_input[nodesIn[i].input_index], perturbation=nodesIn[i].perturbation, input_index=n.input_index, options=self.bound_opts, attr=attr)) else: bound_class = BoundParams if isinstance( nodesIn[i].param, nn.Parameter) else BoundBuffers nodesIn[i] = nodesIn[i]._replace(bound_node=bound_class( ori_name=nodesIn[i].ori_name, value=nodesIn[i].param, perturbation=nodesIn[i].perturbation, options=self.bound_opts, attr=attr)) unsupported_ops = [] # Convert other operation nodes. 
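# --- Illustrative sketch (not part of the library source) ---
# The dispatch loop below resolves each parsed operator: the user-provided
# custom_ops dictionary first, then the built-in bound_op_map, then a
# name-based lookup for aten::/onnx:: ops. The mapping shown here is
# hypothetical and only illustrates its shape (op name string -> Bound subclass).
from auto_LiRPA.bound_ops import BoundRelu

custom_ops_example = {'onnx::Relu': BoundRelu}
# BoundedModule(model, dummy_input, custom_ops=custom_ops_example)
# --- End of sketch ---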
for n in range(len(nodesOP)): attr = nodesOP[n].attr inputs = self._get_node_input(nodesOP, nodesIn, nodesOP[n]) try: if nodesOP[n].op in self.custom_ops: op = self.custom_ops[nodesOP[n].op] elif nodesOP[n].op in bound_op_map: op = bound_op_map[nodesOP[n].op] elif nodesOP[n].op.startswith('aten::ATen'): op = globals()[f'BoundATen{attr["operator"].capitalize()}'] elif nodesOP[n].op.startswith('onnx::'): op = globals()[f'Bound{nodesOP[n].op[6:]}'] else: raise KeyError except (NameError, KeyError): unsupported_ops.append(nodesOP[n]) logger.error('The node has an unsupported operation: %s', nodesOP[n]) continue attr['device'] = self.device # FIXME generalize if (nodesOP[n].op == 'onnx::BatchNormalization' or getattr(op, 'TRAINING_FLAG', False)): # BatchNormalization node needs model.training flag to set # running mean and vars set training=False to avoid wrongly # updating running mean/vars during bound wrapper nodesOP[n] = nodesOP[n]._replace(bound_node=op( attr, inputs, nodesOP[n].output_index, self.bound_opts, False)) else: nodesOP[n] = nodesOP[n]._replace(bound_node=op( attr, inputs, nodesOP[n].output_index, self.bound_opts)) if unsupported_ops: logger.error('Unsupported operations:') for n in unsupported_ops: logger.error(f'Name: {n.op}, Attr: {n.attr}') raise NotImplementedError('There are unsupported operations') for node in nodesIn + nodesOP: node.bound_node.name = node.name nodes_dict = {} for node in nodesOP + nodesIn: nodes_dict[node.name] = node.bound_node nodesOP = [n.bound_node for n in nodesOP] nodesIn = [n.bound_node for n in nodesIn] nodesOut = [nodes_dict[n] for n in nodesOut] return nodesOP, nodesIn, nodesOut, template def _build_graph(self, nodesOP, nodesIn, nodesOut, template): # We were assuming that the original model had only one output node. assert len(nodesOut) == 1 self.final_name = nodesOut[0].name self.input_name, self.input_index, self.root_names = [], [], [] self.output_name = [n.name for n in nodesOut] self.output_template = template self._modules.clear() for node in nodesIn: self.add_input_node(node, index=node.input_index) self.add_nodes(nodesOP) if self.conv_mode == 'patches': self.root_names: List[str] = [node.name for node in nodesIn] def rename_nodes(self, nodesOP, nodesIn, rename_dict): def rename(node): node.name = rename_dict[node.name] return node for i in range(len(nodesOP)): nodesOP[i] = rename(nodesOP[i]) for i in range(len(nodesIn)): nodesIn[i] = rename(nodesIn[i]) def _split_complex(self, nodesOP, nodesIn): finished = True for n in range(len(nodesOP)): if hasattr(nodesOP[n], 'complex') and nodesOP[n].complex: complex_node = nodesOP[n] finished = False _nodesOP, _nodesIn, _nodesOut, _ = self._convert_nodes( nodesOP[n].model, nodesOP[n].input) # assuming each supported complex operation only has one output assert len(_nodesOut) == 1 name_base = nodesOP[n].name + '/split' rename_dict = {} for node in _nodesOP + _nodesIn: rename_dict[node.name] = name_base + node.name num_inputs = len(nodesOP[n].inputs) for i in range(num_inputs): rename_dict[_nodesIn[i].name] = nodesOP[n].input_name[i] rename_dict[_nodesOP[-1].name] = nodesOP[n].name self.rename_nodes(_nodesOP, _nodesIn, rename_dict) output_name = _nodesOP[-1].name # Any input node of some node within the complex node should be # replaced with the complex node's corresponding input node. 
for node in _nodesOP: for i in range(len(node.inputs)): if node.input_name[i] in nodesOP[n].input_name: index = nodesOP[n].input_name.index( node.input_name[i]) node.inputs[i] = nodesOP[n].inputs[index] # For any output node of this complex node, # modify its input node. for node in nodesOP: if output_name in node.input_name: index = node.input_name.index(output_name) node.inputs[index] = _nodesOP[-1] # Mark where the nodes come from for node in _nodesOP: node.from_complex_node = type(complex_node).__name__ nodesOP = nodesOP[:n] + _nodesOP + nodesOP[(n + 1):] nodesIn = nodesIn + _nodesIn[num_inputs:] break return nodesOP, nodesIn, finished def _get_node_name_map(self): """Build a dict with {ori_name: name, name: ori_name}""" self.node_name_map = {} for node in self.nodes(): if isinstance(node, (BoundInput, BoundParams)): for p in list(node.named_parameters()): if node.ori_name not in self.node_name_map: name = f'{node.name}.{p[0]}' self.node_name_map[node.ori_name] = name self.node_name_map[name] = node.ori_name for p in list(node.named_buffers()): if node.ori_name not in self.node_name_map: name = f'{node.name}.{p[0]}' self.node_name_map[node.ori_name] = name self.node_name_map[name] = node.ori_name # convert a Pytorch model to a model with bounds def _convert(self, model, global_input): if self.verbose: logger.info('Converting the model...') self.num_global_inputs = len(global_input) nodesOP, nodesIn, nodesOut, template = self._convert_nodes( model, global_input) global_input = self._to(global_input, self.device) while True: self._build_graph(nodesOP, nodesIn, nodesOut, template) self.forward(*global_input) # running means/vars changed nodesOP, nodesIn, finished = self._split_complex(nodesOP, nodesIn) if finished: break self._get_node_name_map() ori_state_dict_mapped = OrderedDict() for k, v in self.ori_state_dict.items(): if k in self.node_name_map: ori_state_dict_mapped[self.node_name_map[k]] = v self.load_state_dict(ori_state_dict_mapped) if self.ori_training: model.load_state_dict(self.ori_state_dict) delattr(self, 'ori_state_dict') # The name of the final node used in the last call to `compute_bounds` self.last_final_node_name = None if self.verbose: logger.info('Model converted to support bounds') def check_prior_bounds(self, node, C=None): if node.prior_checked or not (node.used and node.perturbed): return if C is not None and isinstance(node, BoundConcat): # If the last node is a BoundConcat, it's possible that only some of # the input nodes of the BoundConcat are needed in the specification. # In this case, we only check the bounds of the input nodes that are # actually used in the specification. All other branches are # considered as not used, and their bounds are not checked. # FIXME: In this case, node.used of some nodes may be incorrect. 
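# --- Illustrative sketch (not part of the library source) ---
# The BoundConcat case handled just below: |C| is reduced over its first two
# dimensions and sliced per concatenated input, so inputs whose coefficients
# are all zero do not need prior bounds. Toy sizes:
import torch

C = torch.zeros(2, 1, 10)      # first two dims are reduced away; last is concat output size
C[..., 7] = 1.0                # only neurons of the second input are referenced
input_sizes = [6, 4]           # flattened sizes of the two concat inputs
reduced = C.abs().sum(dim=[0, 1])
offset, needed = 0, []
for size in input_sizes:
    needed.append(bool(reduced[offset:offset + size].sum() != 0))
    offset += size
# needed == [False, True]
# --- End of sketch ---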
offset = 0 assert isinstance(C, torch.Tensor) and C.ndim == 3 C = C.abs().sum(dim=[0, 1]) for node_input in node.inputs: size = prod(node_input.output_shape[1:]) C_s = C[offset:offset+size].sum() if (C_s != 0).any(): self.check_prior_bounds(node_input) offset += size else: for n in node.inputs: self.check_prior_bounds(n) tighten_input_bounds = ( self.bound_opts['optimize_bound_args']['tighten_input_bounds'] ) directly_optimize_layer_names = ( self.bound_opts['optimize_bound_args']['directly_optimize'] ) bound_every_node = ( self.bound_opts['bound_every_node'] ) for i in range(len(node.inputs)): if ( i in node.requires_input_bounds or not node.inputs[i].perturbed or node.inputs[i].name in self.layers_with_constraint # allows to tighten input bounds or (isinstance(node.inputs[i], BoundInput) and tighten_input_bounds) # layers whos optimization is forced # (for consecutive layers introduced as part of invprop) or node.inputs[i].name in directly_optimize_layer_names or bound_every_node ): self.compute_intermediate_bounds( node.inputs[i], prior_checked=True) node.prior_checked = True def compute_intermediate_bounds(self, node: Bound, prior_checked=False): tighten_input_bounds = ( self.bound_opts['optimize_bound_args']['tighten_input_bounds'] ) directly_optimize_layer_names = ( self.bound_opts['optimize_bound_args']['directly_optimize'] ) best_of_oc_and_no_oc = ( self.bound_opts['optimize_bound_args']['best_of_oc_and_no_oc'] ) if ( node.is_lower_bound_current() and not ( isinstance(node, BoundInput) and tighten_input_bounds or node.name in directly_optimize_layer_names ) ): if node.name in self.layers_with_constraint: node.clamp_interim_bounds() return logger.debug(f'Getting the bounds of {node}') if not prior_checked: self.check_prior_bounds(node) if not node.perturbed: fv = self.get_forward_value(node) node.interval = node.lower, node.upper = fv, fv return # FIXME check that weight perturbation is not affected # (from_input=True should be set for weights) if not node.from_input and hasattr(node, 'forward_value'): node.lower = node.upper = self.get_forward_value(node) return reference_bounds = self.reference_bounds if self.use_forward: # forward node.lower, node.upper = self.forward_general( node=node, concretize=True) else: # backward if self.check_IBP_intermediate(node): # Intermediate bounds for some operators are directly # computed from their input nodes by IBP # (such as BoundRelu, BoundNeg) logger.debug('IBP propagation for intermediate bounds on %s', node) # For the first linear layer, IBP can give the same tightness as CROWN. 
elif not self.check_IBP_first_linear(node): ref_intermediate = self.get_ref_intermediate_bounds(node) sparse_C = self.get_sparse_C(node, ref_intermediate) newC, reduced_dim, unstable_idx, unstable_size = sparse_C # Special case for BoundRelu when sparse intermediate bounds are disabled # Currently sparse intermediate bounds are restricted to ReLU models only skip = False if unstable_idx is None: if (len(node.output_name) == 1 and isinstance(self[node.output_name[0]], BoundTwoPieceLinear) and node.name in self.reference_bounds): lower, upper = self.reference_bounds[node.name] fully_stable = torch.logical_or(lower>=0, upper<=0).all() if fully_stable: node.lower, node.upper = lower, upper skip = True elif unstable_size == 0: skip = True if not skip: apply_output_constraints_to = self.bound_opts[ 'optimize_bound_args']['apply_output_constraints_to'] if self.return_A: node.lower, node.upper, _ = self.backward_general( node, newC, unstable_idx=unstable_idx, apply_output_constraints_to=apply_output_constraints_to) else: # Compute backward bounds only when there are unstable # neurons, or when we don't know which neurons are unstable. node.lower, node.upper = self.backward_general( node, newC, unstable_idx=unstable_idx, apply_output_constraints_to=apply_output_constraints_to) if torch.any((node.upper - node.lower).abs() > 1e10): if len(apply_output_constraints_to) > 0 and not best_of_oc_and_no_oc: warnings.warn('Very weak bounds detected. This can potentially be ' 'fixed by setting best_of_oc_and_no_oc=True.') if reduced_dim: self.restore_sparse_bounds( node, unstable_idx, unstable_size, ref_intermediate) if self.bound_opts['compare_crown_with_ibp']: node.lower, node.upper = self.compare_with_IBP(node, node.lower, node.upper) # node.lower and node.upper (intermediate bounds) are computed in # the above function. If we have bound references, we set them here # to always obtain a better set of bounds. if node.name in reference_bounds: ref_bounds = reference_bounds[node.name] # Initially, the reference bound and the computed bound can be # exactly the same when intermediate layer beta is 0. This will # prevent gradients flow. So we need a small guard here. # Set the intermediate layer bounds using reference bounds, # always choosing the tighter one. # Assert no NaNs in reference bounds before using them assert not torch.isnan(ref_bounds[0]).any(), ( f'NaN detected in reference lower bound of layer {node.name}') node.lower = (torch.max(ref_bounds[0], node.lower).detach() - node.lower.detach() + node.lower) assert not torch.isnan(ref_bounds[1]).any(), ( f'NaN detected in reference upper bound of layer {node.name}') node.upper = (node.upper - (node.upper.detach() - torch.min(ref_bounds[1], node.upper).detach())) # Also update bounds in node.linear (if exist) if hasattr(node, 'linear'): node.linear.lower = node.lower node.linear.upper = node.upper # Otherwise, we only use reference bounds to check which neurons # are unstable. # prior constraint bounds if node.name in self.layers_with_constraint: node.clamp_interim_bounds() # FIXME (12/28): we should be consistent, and only use # node.interval, do not use node.lower or node.upper! node.interval = (node.lower, node.upper) def get_ref_intermediate_bounds(self, node): sparse_intermediate_bounds_with_ibp = self.bound_opts.get( 'sparse_intermediate_bounds_with_ibp', True) # Sparse intermediate bounds can be enabled # if aux_reference_bounds are given. # (this is enabled for ReLU only, and not for other activations.) 
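# --- Illustrative sketch (not part of the library source) ---
# The reference-bound update in compute_intermediate_bounds above takes the
# elementwise tighter bound while keeping gradients attached to the freshly
# computed bound (the detach trick mentioned in its comments). Standalone check:
import torch

computed = torch.tensor([-1.0, -0.5], requires_grad=True)
reference = torch.tensor([-0.8, -0.9])
tightened = torch.max(reference, computed).detach() - computed.detach() + computed
# Forward value equals max(reference, computed); gradient w.r.t. `computed` is 1.
tightened.sum().backward()
assert torch.allclose(computed.grad, torch.ones(2))
assert torch.allclose(tightened.detach(), torch.tensor([-0.8, -0.5]))
# --- End of sketch ---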
sparse_intermediate_bounds = (self.bound_opts.get( 'sparse_intermediate_bounds', False) and isinstance(self[node.output_name[0]], BoundRelu)) ref_intermediate_lb, ref_intermediate_ub = None, None if sparse_intermediate_bounds: if node.name not in self.aux_reference_bounds: # If aux_reference_bounds are not available, # we can use IBP to compute these bounds. if sparse_intermediate_bounds_with_ibp: with torch.no_grad(): # Get IBP bounds for this layer; # we set delete_bounds_after_use=True which does # not save extra intermediate bound tensors. ret_ibp = self.IBP_general( node=node, delete_bounds_after_use=True) ref_intermediate_lb = ret_ibp[0] ref_intermediate_ub = ret_ibp[1] else: sparse_intermediate_bounds = False else: aux_bounds = self.aux_reference_bounds[node.name] ref_intermediate_lb, ref_intermediate_ub = aux_bounds return sparse_intermediate_bounds, ref_intermediate_lb, ref_intermediate_ub def merge_A_dict(self, lA_dict, uA_dict): merged_A = {} for output_node_name in lA_dict: merged_A[output_node_name] = {} lA_dict_ = lA_dict[output_node_name] uA_dict_ = uA_dict[output_node_name] for input_node_name in lA_dict_: merged_A[output_node_name][input_node_name] = { 'lA': lA_dict_[input_node_name]['lA'], 'uA': uA_dict_[input_node_name]['uA'], 'lbias': lA_dict_[input_node_name]['lbias'], 'ubias': uA_dict_[input_node_name]['ubias'], } return merged_A def compute_bounds( self, x=None, aux=None, C=None, method='backward', IBP=False, forward=False, bound_lower=True, bound_upper=True, reuse_ibp=False, reuse_alpha=False, return_A=False, needed_A_dict=None, final_node_name=None, average_A=False, interm_bounds=None, reference_bounds=None, intermediate_constr=None, alpha_idx=None, aux_reference_bounds=None, need_A_only=False, cutter=None, decision_thresh=None, update_mask=None, ibp_nodes=None, cache_bounds=False): r"""Main function for computing bounds. Args: x (tuple or None): Input to the model. If it is None, the input from the last `forward` or `compute_bounds` call is reused. Otherwise: the number of elements in the tuple should be equal to the number of input nodes in the model, and each element in the tuple corresponds to the value for each input node respectively. It should look similar as the `global_input` argument when used for creating a `BoundedModule`. aux (object, optional): Auxliary information that can be passed to `Perturbation` classes for initializing and concretizing bounds, e.g., additional information for supporting synonym word subsitution perturbaiton. C (Tensor): The specification matrix that can map the output of the model with an additional linear layer. This is usually used for maping the logits output of the model to classification margins. method (str): The main method for bound computation. Choices: * `IBP`: purely use Interval Bound Propagation (IBP) bounds. * `CROWN-IBP`: use IBP to compute intermediate bounds, but use CROWN (backward mode LiRPA) to compute the bounds of the final node. * `CROWN`: purely use CROWN to compute bounds for intermediate nodes and the final node. * `Forward`: purely use forward mode LiRPA. * `Forward+Backward`: use forward mode LiRPA for intermediate nodes, but further use CROWN for the final node. * `CROWN-Optimized` or `alpha-CROWN`: use CROWN, and also optimize the linear relaxation parameters for activations. * `forward-optimized`: use forward bounds with optimized linear relaxation. * `dynamic-forward`: use dynamic forward bound propagation where new input variables may be dynamically introduced for nonlinearities. 
* `dynamic-forward+backward`: use dynamic forward mode for intermediate nodes, but use CROWN for the final node. IBP (bool, optional): If `True`, use IBP to compute the bounds of intermediate nodes. It can be automatically set according to `method`. forward (bool, optional): If `True`, use the forward mode bound propagation to compute the bounds of intermediate nodes. It can be automatically set according to `method`. bound_lower (bool, default `True`): If `True`, the lower bounds of the output needs to be computed. bound_upper (bool, default `True`): If `True`, the upper bounds of the output needs to be computed. reuse_ibp (bool, optional): If `True` and `method` is None, reuse the previously saved IBP bounds. final_node_name (str, optional): Set the final node in the computational graph for bound computation. By default, the final node of the originally built computational graph is used. return_A (bool, optional): If `True`, return linear coefficients in bound propagation (`A` tensors) with `needed_A_dict` set. needed_A_dict (dict, optional): A dictionary specifying linear coefficients (`A` tensors) that are needed and should be returned. Each key in the dictionary is the name of a starting node in backward bound propagation, with a list as the value for the key, which specifies the names of the ending nodes in backward bound propagation, and the linear coefficients of the starting node w.r.t. the specified ending nodes are returned. By default, it is empty. reuse_alpha (bool, optional): If `True`, reuse previously saved alpha values when they are not being optimized. decision_thresh (float, optional): In CROWN-optimized mode, we will use this decision_thresh to dynamically optimize those domains that <= the threshold. interm_bounds: A dictionary of 2-element tuple/list containing lower and upper bounds for intermediate layers. The dictionary keys should include the names of the layers whose bounds should be set without recomputation. The layer names can be viewed by setting environment variable AUTOLIRPA_DEBUG=1. The values of each dictionary elements are (lower_bounds, upper_bounds) where "lower_bounds" and "upper_bounds" are two tensors with the same shape as the output shape of this layer. If you only need to set intermediate layer bounds for certain layers, then just include these layers' names in the dictionary. reference_bounds: Format is similar to "interm_bounds". However, these bounds are only used as a reference, and the bounds for intermediate layers will still be computed (e.g., using CROWN, IBP or other specified methods). The computed bounds will be compared to "reference_bounds" and the tighter one between the two will be used. aux_reference_bounds: Format is similar to intermediate layer bounds. However, these bounds are only used for determine which neurons are stable and which neurons are unstable for ReLU networks. Unstable neurons' intermediate layer bounds will be recomputed. cache_bounds: If `True`, the currently set lower and upper bounds will not be deleted, but cached for use by the INVPROP algorithm. This should not be set by the user, but only in `_get_optimized_bounds`. Returns: bound (tuple): When `return_A` is `False`, return a tuple of the computed lower bound and upper bound. When `return_A` is `True`, return a tuple of lower bound, upper bound, and `A` dictionary. """ # This method only prepares everything by setting all required parameters. # The main logic is located in `_compute_bounds_main`. It may be called # repeatedly for CROWN optimizations. 
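        # Illustrative usage (a minimal sketch; `net` and `x` below are placeholder
        # names, not defined in this file, and the exact call pattern may vary):
        #     from auto_LiRPA import BoundedModule, BoundedTensor
        #     from auto_LiRPA.perturbations import PerturbationLpNorm
        #     lirpa_model = BoundedModule(net, torch.empty_like(x))
        #     x_bounded = BoundedTensor(x, PerturbationLpNorm(norm=float('inf'), eps=0.03))
        #     lb, ub = lirpa_model.compute_bounds(x=(x_bounded,), method='CROWN')
        #     # With A matrices returned for the final node w.r.t. the input node:
        #     lb, ub, A_dict = lirpa_model.compute_bounds(
        #         x=(x_bounded,), method='CROWN', return_A=True,
        #         needed_A_dict={lirpa_model.output_name[0]: [lirpa_model.input_name[0]]})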
logger.debug(f'Compute bounds with {method}') if needed_A_dict is None: needed_A_dict = {} if not bound_lower and not bound_upper: raise ValueError( 'At least one of bound_lower and bound_upper must be True') # Several shortcuts. compute_optimized = False method = method.lower() if method is not None else method if method == 'ibp': # Pure IBP bounds. method, IBP = None, True elif method in ['ibp+backward', 'ibp+crown', 'crown-ibp']: method, IBP = 'backward', True elif method == 'crown': method = 'backward' elif method == 'forward': forward = True self.dynamic = False elif method == 'dynamic-forward': forward = True self.dynamic = True elif method == 'forward+backward' or method == 'forward+crown': method, forward = 'backward', True elif method == 'dynamic-forward+backward' or method == 'dynamic-forward+crown': self.dynamic = True method, forward = 'backward', True elif method in ['crown-optimized', 'alpha-crown', 'forward-optimized']: # Lower and upper bounds need two separate rounds of optimization. if method == 'forward-optimized': method = 'forward' else: method = 'backward' compute_optimized = True if reference_bounds is None: reference_bounds = {} if aux_reference_bounds is None: aux_reference_bounds = {} # If y in self.backward_node_pairs[x], then node y is visited when # doing backward bound propagation starting from node x. self.backward_from = dict([(node, []) for node in self._modules]) if not bound_lower and not bound_upper: raise ValueError( 'At least one of bound_lower and bound_upper in compute_bounds ' 'should be True') A_dict = {} if return_A else None if x is not None: if isinstance(x, torch.Tensor): x = (x,) if self.bound_opts['forward_before_compute_bounds']: self.forward(*x, interm_bounds=interm_bounds, cache_bounds=cache_bounds) else: self.set_input(*x, interm_bounds=interm_bounds, cache_bounds=cache_bounds) roots = self.roots() batch_size = roots[0].value.shape[0] dim_in = 0 for i in range(len(roots)): value = roots[i].forward() if getattr(roots[i], 'perturbation', None) is not None: ret_init = roots[i].perturbation.init( value, aux=aux, forward=forward) roots[i].linear, roots[i].center, roots[i].aux = ret_init # This input/parameter has perturbation. # Create an interval object. roots[i].interval = Interval( roots[i].linear.lower, roots[i].linear.upper, ptb=roots[i].perturbation) if forward: roots[i].dim = roots[i].linear.lw.shape[1] dim_in += roots[i].dim else: # This input/parameter does not has perturbation. # Use plain tuple defaulting to Linf perturbation. roots[i].interval = (value, value) roots[i].forward_value = roots[i].value = value roots[i].center = roots[i].lower = roots[i].upper = value roots[i].lower, roots[i].upper = roots[i].interval if forward: self.init_forward(roots, dim_in) for n in self.nodes(): if isinstance(n, BoundRelu): for node in n.inputs: if hasattr(node, 'relu_followed'): node.relu_followed = True # Inject update mask inside the activations # update_mask: None or bool tensor([batch_size]) # If set to a tensor, only update the alpha and beta of selected # element (with element=1). 
n.alpha_beta_update_mask = update_mask final = (self.final_node() if final_node_name is None else self[final_node_name]) # BFS to find out whether each node is used given the current final node self._set_used_nodes(final) self.use_forward = forward self.batch_size = batch_size self.dim_in = dim_in self.return_A = return_A self.A_dict = A_dict self.needed_A_dict = needed_A_dict self.intermediate_constr = intermediate_constr self.reference_bounds = reference_bounds self.aux_reference_bounds = aux_reference_bounds self.final_node_name = final.name self.ibp_nodes = ibp_nodes if compute_optimized: kwargs = dict(x=x, C=C, method=method, interm_bounds=interm_bounds, reference_bounds=reference_bounds, return_A=return_A, aux_reference_bounds=aux_reference_bounds, needed_A_dict=needed_A_dict, final_node_name=final_node_name, cutter=cutter, decision_thresh=decision_thresh) if bound_upper: ret2 = self._get_optimized_bounds(bound_side='upper', **kwargs) else: ret2 = None if bound_lower: ret1 = self._get_optimized_bounds(bound_side='lower', **kwargs) else: ret1 = None if bound_lower and bound_upper: if return_A: # Needs to merge the A dictionary. return ret1[0], ret2[1], self.merge_A_dict(ret1[2], ret2[2]) else: return ret1[0], ret2[1] elif bound_lower: return ret1 # ret1[1] is None. elif bound_upper: return ret2 # ret2[0] is None. return self._compute_bounds_main(C=C, method=method, IBP=IBP, bound_lower=bound_lower, bound_upper=bound_upper, reuse_ibp=reuse_ibp, reuse_alpha=reuse_alpha, average_A=average_A, alpha_idx=alpha_idx, need_A_only=need_A_only, update_mask=update_mask) def save_intermediate(self, save_path=None): r"""A function for saving intermediate bounds. Please call this function after `compute_bounds`, or it will output IBP bounds by default. Args: save_path (str, default `None`): If `None`, the intermediate bounds will not be saved, or it will be saved at the designated path. Returns: save_dict (dict): Return a dictionary of lower and upper bounds, with the key being the name of the layer. """ save_dict = OrderedDict() for node in self.nodes(): if node.used and node.perturbed: if not hasattr(node, 'interval'): ibp_lower, ibp_upper = self.IBP_general(node, delete_bounds_after_use=True) dim_output = int(prod(node.output_shape[1:])) C = torch.eye(dim_output, device=self.device).expand( self.batch_size, dim_output, dim_output) crown_lower, crown_upper = self.backward_general(node, C=C) save_dict[node.name] = ( torch.max(crown_lower, ibp_lower), torch.min(crown_upper, ibp_upper)) else: save_dict[node.name] = (node.lower, node.upper) if save_path is not None: torch.save(save_dict, save_path) return save_dict def _compute_bounds_main(self, C=None, method='backward', IBP=False, bound_lower=True, bound_upper=True, reuse_ibp=False, reuse_alpha=False, average_A=False, alpha_idx=None, need_A_only=False, update_mask=None): """The core implementation of compute_bounds. Seperated because compute_bounds may call _get_optimized_bounds which repeatedly calls this method. Otherwise, the preprocessing done in compute_bounds would be executed for each iteration. 
""" final = (self.final_node() if self.final_node_name is None else self[self.final_node_name]) logger.debug(f'Final node {final.__class__.__name__}({final.name})') if IBP and method is None and reuse_ibp: # directly return the previously saved ibp bounds return self.ibp_lower, self.ibp_upper if IBP: self.ibp_lower, self.ibp_upper = self.IBP_general(node=final, C=C) if method is None: return self.ibp_lower, self.ibp_upper # TODO: if compute_bounds is called with a method that causes alphas to be # optimized, C will be allocated in each iteration. We could allocate it once # in compute_bounds, but e.g. `IBP_general` and code in `_get_optimized_bounds` # relies on the fact that it can be None if C is None: # C is an identity matrix by default if final.output_shape is None: raise ValueError( f'C is not missing while node {final} has no default shape') dim_output = int(prod(final.output_shape[1:])) # TODO: use an eyeC object here. C = torch.eye(dim_output, device=self.device).expand( self.batch_size, dim_output, dim_output) # Reuse previously saved alpha values, # even if they are not optimized now # This must be done here instead of `compute_bounds`, as other code might change # it (e.g. `_get_optimized_bounds`) if reuse_alpha: self.opt_reuse() else: self.opt_no_reuse() for node in self.nodes(): # All nodes may need to be recomputed node.prior_checked = False self.check_prior_bounds(final, C=C) if method == 'backward': apply_output_constraints_to = ( self.bound_opts['optimize_bound_args']['apply_output_constraints_to'] ) # This is for the final output bound. # No need to pass in intermediate layer beta constraints. ret = self.backward_general( final, C, bound_lower=bound_lower, bound_upper=bound_upper, average_A=average_A, need_A_only=need_A_only, unstable_idx=alpha_idx, update_mask=update_mask, apply_output_constraints_to=apply_output_constraints_to) if self.bound_opts['compare_crown_with_ibp']: new_lower, new_upper = self.compare_with_IBP(final, lower=ret[0], upper=ret[1], C=C) ret = (new_lower, new_upper) + ret[2:] # FIXME when C is specified, lower and upper should not be saved to # final.lower and final.upper, because they are not the bounds for # the node. final.lower, final.upper = ret[0], ret[1] return ret elif method == 'forward' or method == 'dynamic-forward': return self.forward_general(C=C, node=final, concretize=True) else: raise NotImplementedError def _set_used_nodes(self, final): # By default, all *.used are initialized to False. # We set the used nodes by BFS from the final node. if final.name != self.last_final_node_name: self.last_final_node_name = final.name final.used = True queue = deque([final]) while len(queue) > 0: n = queue.popleft() for n_pre in n.inputs: if not n_pre.used: n_pre.used = True queue.append(n_pre) # Based on "used" and "perturbed" properties, find out which # layer requires intermediate layer bounds. 
            self.layers_requiring_bounds = self.get_layers_requiring_bounds()

    def init_infeasible_bounds_constraints(self, batchsize, device):
        '''Simply initialize the infeasible bound record.'''
        self.infeasible_bounds_constraints = torch.full((batchsize,), False, device=device)

    from .interval_bound import (
        IBP_general, _IBP_loss_fusion, check_IBP_intermediate,
        check_IBP_first_linear, compare_with_IBP)
    from .forward_bound import (
        forward_general, forward_general_dynamic, forward_refinement, init_forward)
    from .backward_bound import (
        backward_general, get_sparse_C, check_optimized_variable_sparsity,
        restore_sparse_bounds, get_alpha_crown_start_nodes,
        get_unstable_locations, batched_backward, _preprocess_C)
    from .output_constraints import (
        backward_general_with_output_constraint, invprop_enabled,
        backward_general_invprop, invprop_init_infeasible_bounds,
        invprop_check_infeasible_bounds)
    from .optimized_bounds import (
        _get_optimized_bounds, init_alpha, update_best_beta, opt_reuse,
        opt_no_reuse, _to_float64, _to_default_dtype)
    from .beta_crown import (beta_crown_backward_bound, reset_beta, set_beta,
                             set_beta_cuts, get_split_nodes)
    from .jacobian import (compute_jacobian_bounds, _expand_jacobian)
    from .optimize_graph import _optimize_graph
    from .edit_graph import add_nodes, add_input_node, delete_node, replace_node
    from .tools import visualize
    from .concretize_bounds import (
        concretize_bounds, concretize_root, backward_concretize, forward_concretize)
    from .solver_module import (
        build_solver_module, _build_solver_input, _build_solver_general,
        _reset_solver_vars, _reset_solver_model)


================================================
FILE: auto_LiRPA/bound_multi_gpu.py
================================================
#########################################################################
## This file is part of the auto_LiRPA library, a core part of the    ##
## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed     ##
## by the α,β-CROWN Team                                               ##
##                                                                     ##
## Copyright (C) 2020-2025 The α,β-CROWN Team                          ##
## Team leaders:                                                       ##
##   Faculty: Huan Zhang (UIUC)                                        ##
##   Student: Xiangru Zhong (UIUC)                                     ##
##                                                                     ##
## See CONTRIBUTORS for all current and past developers in the team.  ##
##                                                                     ##
## This program is licensed under the BSD 3-Clause License,            ##
## contained in the LICENCE file in this directory.
## ## ## ######################################################################### from torch.nn import DataParallel from .perturbations import * from .bounded_tensor import BoundedTensor from itertools import chain class BoundDataParallel(DataParallel): # https://github.com/huanzhang12/CROWN-IBP/blob/master/bound_layers.py # This is a customized DataParallel class for our project def __init__(self, *inputs, **kwargs): super(BoundDataParallel, self).__init__(*inputs, **kwargs) self._replicas = None # Overide the forward method def forward(self, *inputs, **kwargs): disable_multi_gpu = False # forward by single GPU no_replicas = False # forward by multi GPUs but without replicate if "disable_multi_gpu" in kwargs: disable_multi_gpu = kwargs["disable_multi_gpu"] kwargs.pop("disable_multi_gpu") if "no_replicas" in kwargs: no_replicas = kwargs["no_replicas"] kwargs.pop("no_replicas") if not self.device_ids or disable_multi_gpu: if kwargs.pop("get_property", False): return self.get_property(self, *inputs, **kwargs) return self.module(*inputs, **kwargs) if kwargs.pop("get_property", False): if self._replicas is None: assert 0, 'please call IBP/CROWN before get_property' if len(self.device_ids) == 1: return self.get_property(self.module, **kwargs) inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) kwargs = list(kwargs) for i in range(len(kwargs)): kwargs[i]['model'] = self._replicas[i] outputs = self.parallel_apply([self.get_property] * len(kwargs), inputs, kwargs) return self.gather(outputs, self.output_device) # Only replicate during forward/IBP propagation. Not during interval bounds # and CROWN-IBP bounds, since weights have not been updated. This saves 2/3 # of communication cost. if not no_replicas: if self._replicas is None: # first time self._replicas = self.replicate(self.module, self.device_ids) elif kwargs.get("method_opt", "forward") == "forward": self._replicas = self.replicate(self.module, self.device_ids) elif kwargs.get("x") is not None and kwargs.get("IBP") is True: # self._replicas = self.replicate(self.module, self.device_ids) # Update the input nodes to the ones within each replica respectively for bounded_module in self._replicas: for node in bounded_module._modules.values(): node.inputs = [bounded_module[name] for name in node.input_name] for t in chain(self.module.parameters(), self.module.buffers()): if t.device != self.src_device_obj: raise RuntimeError("module must have its parameters and buffers " "on device {} (device_ids[0]) but found one of " "them on device: {}".format(self.src_device_obj, t.device)) # TODO: can be done in parallel, only support same ptb for all inputs per forward/IBP propagation if len(inputs) > 0 and hasattr(inputs[0], 'ptb') and inputs[0].ptb is not None: # compute bounds without x # inputs_scatter is a normal tensor, we need to assign ptb to it if inputs is a BoundedTensor inputs_scatter, kwargs = self.scatter((inputs, inputs[0].ptb.x_L, inputs[0].ptb.x_U), kwargs, self.device_ids) # inputs_scatter = inputs_scatter[0] bounded_inputs = [] for input_s in inputs_scatter: # GPU numbers # FIXME other perturbations are not supported yet assert isinstance(inputs[0].ptb, PerturbationLpNorm) ptb = PerturbationLpNorm(norm=inputs[0].ptb.norm, eps=inputs[0].ptb.eps, x_L=input_s[1], x_U=input_s[2]) input_s = list(input_s[0]) input_s[0] = BoundedTensor(input_s[0], ptb) input_s = tuple(input_s) bounded_inputs.append(input_s) # bounded_inputs = tuple(bounded_inputs) elif kwargs.get("x") is not None and hasattr(kwargs.get("x")[0], 'ptb') and 
kwargs.get("x")[0].ptb is not None: # compute bounds with x # kwargs['x'] is a normal tensor, we need to assign ptb to it x = kwargs.get("x")[0] bounded_inputs = [] inputs_scatter, kwargs = self.scatter((inputs, x.ptb.x_L, x.ptb.x_U), kwargs, self.device_ids) for input_s, kw_s in zip(inputs_scatter, kwargs): # GPU numbers # FIXME other perturbations are not supported yet assert isinstance(x.ptb, PerturbationLpNorm) ptb = PerturbationLpNorm(norm=x.ptb.norm, eps=x.ptb.eps, x_L=input_s[1], x_U=input_s[2]) kw_s['x'] = list(kw_s['x']) kw_s['x'][0] = BoundedTensor(kw_s['x'][0], ptb) kw_s['x'] = (kw_s['x']) bounded_inputs.append(tuple(input_s[0], )) else: # normal forward inputs_scatter, kwargs = self.scatter(inputs, kwargs, self.device_ids) bounded_inputs = inputs_scatter if len(self.device_ids) == 1: return self.module(*bounded_inputs[0], **kwargs[0]) outputs = self.parallel_apply(self._replicas[:len(bounded_inputs)], bounded_inputs, kwargs) return self.gather(outputs, self.output_device) @staticmethod def get_property(model, node_class=None, att_name=None, node_name=None): if node_name: # Find node by name # FIXME If we use `model.named_modules()`, the nodes have the # `BoundedModule` type rather than bound nodes. for node in model._modules.values(): if node.name == node_name: return getattr(node, att_name) else: # Find node by class for _, node in model.named_modules(): # Find the Exp neuron in computational graph if isinstance(node, node_class): return getattr(node, att_name) def state_dict(self, destination=None, prefix='', keep_vars=False): # add 'module.' here before each keys in self.module.state_dict() if needed return self.module.state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) def _named_members(self, get_members_fn, prefix='', recurse=True, remove_duplicate: bool = True): return self.module._named_members(get_members_fn, prefix, recurse, remove_duplicate) def __getitem__(self, name): return self.module[name] ================================================ FILE: auto_LiRPA/bound_op_map.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### from .bound_ops import * bound_op_map = { 'onnx::Gemm': BoundLinear, 'prim::Constant': BoundPrimConstant, 'grad::Concat': BoundConcatGrad, 'grad::Relu': BoundReluGrad, 'grad::Conv2d': BoundConv2dGrad, 'grad::Slice': BoundSliceGrad, 'grad::Sqr': BoundSqr, 'grad::jacobian': BoundJacobianOP, 'grad::Tanh': BoundTanhGrad, 'grad::Sigmoid': BoundSigmoidGrad, 'custom::Gelu': BoundGelu, 'onnx::Clip': BoundHardTanh } def register_custom_op(op_name: str, bound_obj: Bound) -> None: bound_op_map[op_name] = bound_obj def unregister_custom_op(op_name: str) -> None: bound_op_map.pop(op_name) ================================================ FILE: auto_LiRPA/bound_ops.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### from .operators import * ================================================ FILE: auto_LiRPA/bounded_tensor.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### import copy import torch.nn as nn from torch import Tensor import torch._C as _C class BoundedTensor(Tensor): @staticmethod # We need to override the __new__ method since Tensor is a C class def __new__(cls, x, ptb=None, *args, **kwargs): if isinstance(x, Tensor): tensor = super().__new__(cls, [], *args, **kwargs) tensor.data = x.data tensor.requires_grad = x.requires_grad return tensor else: return super().__new__(cls, x, *args, **kwargs) def __init__(self, x, ptb=None): self.ptb = ptb def __repr__(self): if hasattr(self, 'ptb') and self.ptb is not None: return ''.format(super().__repr__(), self.ptb.__repr__()) else: return ''.format(super().__repr__()) def clone(self, *args, **kwargs): tensor = BoundedTensor(super().clone(*args, **kwargs), copy.deepcopy(self.ptb)) return tensor def _func(self, func, *args, **kwargs): temp = func(*args, **kwargs) new_obj = BoundedTensor([], self.ptb) new_obj.data = temp.data new_obj.requires_grad = temp.requires_grad return new_obj # Copy to other devices with perturbation def to(self, *args, **kwargs): # FIXME add a general "to" function in perturbation class, not here. 
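        # Illustrative usage (a minimal sketch, assuming an Linf perturbation with
        # explicit x_L/x_U tensors; names are placeholders, not defined here):
        #     xb = BoundedTensor(x, PerturbationLpNorm(eps=0.1, x_L=x - 0.1, x_U=x + 0.1))
        #     xb = xb.to('cuda')  # moves the tensor data and the ptb's x_L/x_U/eps together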
if hasattr(self.ptb, 'x_L') and isinstance(self.ptb.x_L, Tensor): self.ptb.x_L = self.ptb.x_L.to(*args, **kwargs) if hasattr(self.ptb, 'x_U') and isinstance(self.ptb.x_U, Tensor): self.ptb.x_U = self.ptb.x_U.to(*args, **kwargs) if hasattr(self.ptb, 'eps') and isinstance(self.ptb.eps, Tensor): self.ptb.eps = self.ptb.eps.to(*args, **kwargs) return self._func(super().to, *args, **kwargs) @classmethod def _convert(cls, ret): if cls is Tensor: return ret if isinstance(ret, Tensor): if True: # The current implementation does not seem to need non-leaf BoundedTensor return ret else: # Enable this branch if non-leaf BoundedTensor should be kept ret = ret.as_subclass(cls) if isinstance(ret, tuple): ret = tuple(cls._convert(r) for r in ret) return ret @classmethod def __torch_function__(cls, func, types, args=(), kwargs=None): if kwargs is None: kwargs = {} if not all(issubclass(cls, t) for t in types): return NotImplemented with _C.DisableTorchFunction(): ret = func(*args, **kwargs) return cls._convert(ret) class BoundedParameter(nn.Parameter): def __new__(cls, data, ptb, requires_grad=True): return BoundedTensor._make_subclass(cls, data, requires_grad) def __init__(self, data, ptb, requires_grad=True): self.ptb = ptb self.requires_grad = requires_grad def __deepcopy__(self, memo): if id(self) in memo: return memo[id(self)] else: result = type(self)(self.data.clone(), self.ptb, self.requires_grad) memo[id(self)] = result return result def __repr__(self): return 'BoundedParameter containing:\n{}\n{}'.format( self.data.__repr__(), self.ptb.__repr__()) def __reduce_ex__(self, proto): raise NotImplementedError ================================================ FILE: auto_LiRPA/concretize_bounds.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### import torch from .utils import eyeC from .bound_ops import * from .patches import Patches from .perturbations import PerturbationLpNorm from typing import TYPE_CHECKING if TYPE_CHECKING: from .bound_general import BoundedModule def concretize_bounds( self: 'BoundedModule', node, lower, upper, concretize_mode='backward', # for `backward_concretize` batch_size=None, output_dim=None, average_A=None, # for `forward_concretize` lw=None, uw=None, # common clip_neuron_selection_value=-1.0, clip_neuron_selection_type="ratio" ): """ If neuron_selection_value >= 0, run an unconstrained/bounds-saving pass then a top-K constrained pass; otherwise just one pass. 
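    Illustrative sketch of the selection size K used in the constrained pass
    (this only restates the logic implemented below, for clarity):
        if clip_neuron_selection_type == "ratio":
            K = max(round(max_unstable_size * clip_neuron_selection_value), 1)
        else (a fixed count):
            K = min(clip_neuron_selection_value, max_unstable_size)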
""" # decide which underlying call to use def _call_concretize(use_constraints, save_bounds=False, heuristic_indices=None): if concretize_mode == 'backward': # backward concretize signature return backward_concretize( self, batch_size, output_dim, lower, upper, average_A=average_A, node_start=node, use_constraints=use_constraints, save_bounds=save_bounds, heuristic_indices=heuristic_indices, ) elif concretize_mode == 'forward': # forward_concretize signature return forward_concretize( self, lower, upper, lw, uw, use_constraints=use_constraints, save_bounds=save_bounds, heuristic_indices=heuristic_indices, ) else: raise ValueError(f"Unknown concretize mode: {concretize_mode}. " "Please use 'backward' or 'forward'.") use_constraints = True save_bounds = False # If clip_neuron_selection_value >= 0, heuristic score-based topk selection is enabled. # And we will only apply constrained concretization on topk neurons based on their heuristics. # In this case, we'll need to 1) concretize all neurons without any constraints to get a looser bound # --> This is for computing the heuristics # 2) concretize topk neurons with constraints. # --> This is for getting tighter bounds for topk neurons. # In conclusion, if neuron_selection_value >= 0, use_consrtaints will be disabled first. # But for the output node in the computational graph we will directly concretize all neurons.. if clip_neuron_selection_value >= 0 and node.name not in self.output_name: use_constraints = False # `output_activations` is the list of output activations from current pre-activation node. # This output_activations is manually assigned outside of auto_lirpa. Please check # complete_verifier/input_split/batch_branch_and_bound.py for more info. # If a node: # a) does not have any output_activation, and # b) heuristic topk selection is enabled, and # c) is not the output node in the computational graph # we will only compute naive bounds on it. # Otherwise, we'll need to do both step 1) and 2). And to accelarate step 2), we will save the bounds in 1). # If 1) this node has at least one output activation node # 2) at least one neuron will be selected # We will need to concretize with constraints, if node.output_activations is not None and clip_neuron_selection_value > 0: save_bounds = True # If heuristic topk selection is enabled, this would be the step 1). new_lower, new_upper, has_constraints = _call_concretize( use_constraints=use_constraints, save_bounds=save_bounds, ) # If heuristic topk selection is enabled, this if-branch would be the step 2). if (has_constraints and node.output_activations is not None and clip_neuron_selection_value > 0 and node.name not in self.output_name): score = 0.0 unstable_masks = False # loop through all the output activations to get a comprehensive unstable mask and heuristic score. # This output_activations is manually assigned outside of auto_lirpa. # Please check complete_verifier/input_split/batch_branch_and_bound.py for o_act_node in node.output_activations: score = score + o_act_node.compute_bound_improvement_heuristics(new_lower, new_upper) unstable_masks = unstable_masks | o_act_node.get_unstable_mask(new_lower, new_upper) score = score.flatten(1) # shape: (Batchsize, Hidden_dim) unstable_masks = unstable_masks.flatten(1) # shape: (Batchsize, Hidden_dim) # Only do second concretize if there exists unstable neurons. if unstable_masks.any(): max_unstable_size = unstable_masks.sum(dim=1).max() heuristic_indices = None # The K value in topk should be at least 1. 
if clip_neuron_selection_type == "ratio": K = max(int(max_unstable_size * clip_neuron_selection_value + 0.5), 1) else: K = min(clip_neuron_selection_value, max_unstable_size) _, heuristic_indices = torch.topk(score, k=K, dim=1, largest=True, sorted=False) new_lower, new_upper, _ = _call_concretize( use_constraints=True, heuristic_indices=heuristic_indices ) else: # Previously we've stored to aux bounds, now it should be cleared to avoid any confusion. for root in self.roots(): if (hasattr(root, 'perturbation') and root.perturbation is not None and isinstance(root.perturbation, PerturbationLpNorm)): root.perturbation.clear_aux_bounds() return new_lower, new_upper def concretize_root(self, root, batch_size, output_dim, average_A=False, node_start=None, input_shape=None, use_constraints=False, heuristic_indices=None, save_bounds=False): # The last three optional argument are designed for heuristic-driven constrained concretization. # use_constraints: A flag controling whether to enable constraints solving or not. # heuristic_indices: A index tensor, it select EUQAL number of hidden neurons from each batch. # Constrained solving will be further applied on these neurons. Shape (batchsize, n_h_neurons) # save_bounds: A flag determining whether to save naive bounds (to avoid redundant computation) if average_A and isinstance(root, BoundParams): lA = root.lA.mean( node_start.batch_dim + 1, keepdim=True ).expand(root.lA.shape) if (root.lA is not None) else None uA = root.uA.mean( node_start.batch_dim + 1, keepdim=True ).expand(root.uA.shape) if (root.uA is not None) else None else: lA, uA = root.lA, root.uA if not isinstance(root.lA, eyeC) and not isinstance(root.lA, Patches): lA = root.lA.reshape(output_dim, batch_size, -1).transpose(0, 1) if (lA is not None) else None if not isinstance(root.uA, eyeC) and not isinstance(root.uA, Patches): uA = root.uA.reshape(output_dim, batch_size, -1).transpose(0, 1) if (uA is not None) else None has_constraints = False if hasattr(root, 'perturbation') and root.perturbation is not None: if isinstance(root.perturbation, PerturbationLpNorm): # Enable / Disable constraints solving according to `use_constraints` root.perturbation.constraints_enable = use_constraints if root.perturbation.constraints is not None: if self.infeasible_bounds_constraints is not None: root.perturbation.add_infeasible_batches(self.infeasible_bounds_constraints) root.perturbation.add_objective_indices(heuristic_indices) has_constraints = True if isinstance(root, BoundParams): # add batch_size dim for weights node lb = root.perturbation.concretize( root.center.unsqueeze(0), lA, sign=-1, aux=root.aux ) if (lA is not None) else None ub = root.perturbation.concretize( root.center.unsqueeze(0), uA, sign=+1, aux=root.aux ) if (uA is not None) else None else: lb = root.perturbation.concretize( root.center, lA, sign=-1, aux=root.aux ) if lA is not None else None ub = root.perturbation.concretize( root.center, uA, sign=+1, aux=root.aux ) if uA is not None else None if (isinstance(root.perturbation, PerturbationLpNorm) and root.perturbation.constraints is not None and root.perturbation.sorted_out_batches["infeasible_batches"] is not None): if self.infeasible_bounds_constraints is not None: self.infeasible_bounds_constraints = self.infeasible_bounds_constraints | root.perturbation.sorted_out_batches["infeasible_batches"] # else: # self.infeasible_bounds_constraints = root.perturbation.sorted_out_batches["infeasible_batches"] # If required, save current (naive) bounds to prevent redundant computation 
next time concretize on the same node if isinstance(root.perturbation, PerturbationLpNorm) and root.perturbation.constraints is not None and save_bounds: root.perturbation.add_aux_bounds(lb, ub) elif isinstance(root.perturbation, PerturbationLpNorm): # Otherwise, always clear_aux_bounds to prevent confusion root.perturbation.clear_aux_bounds() else: fv = root.forward_value if type(root) == BoundInput: # Input node with a batch dimension batch_size_ = batch_size else: # Parameter node without a batch dimension batch_size_ = 1 def concretize_constant(A): if isinstance(A, eyeC): return fv.view(batch_size_, -1) elif isinstance(A, Patches): return A.matmul(fv, input_shape=input_shape) elif type(root) == BoundInput: return A.matmul(fv.view(batch_size_, -1, 1)).squeeze(-1) else: return A.matmul(fv.view(-1, 1)).squeeze(-1) lb = concretize_constant(lA) if (lA is not None) else None ub = concretize_constant(uA) if (uA is not None) else None return lb, ub, has_constraints def backward_concretize(self, batch_size, output_dim, lb=None, ub=None, average_A=False, node_start=None, use_constraints=False, heuristic_indices=None, save_bounds=False): # The last three optional argument are designed for heuristic-driven constrained concretization. # use_constraints: A flag controling whether to enable constraints solving or not. # heuristic_indices: A index tensor, it select EUQAL number of hidden neurons from each batch. # Constrained solving will be further applied on these neurons. Shape (batchsize, n_h_neurons) # save_bounds: A flag determining whether to save naive bounds (to avoid redundant computation) roots = self.roots() if isinstance(lb, torch.Tensor) and lb.ndim > 2: lb = lb.reshape(lb.shape[0], -1) if isinstance(ub, torch.Tensor) and ub.ndim > 2: ub = ub.reshape(ub.shape[0], -1) def add_b(b1, b2): if b2 is None: return b1 elif b1 is None: return b2 # Check if b1 is a tensor and if all its elements are infinity if torch.is_tensor(b1) and torch.isinf(b1).all(): return b1 # Check if b2 is a tensor and if all its elements are infinity if torch.is_tensor(b2) and torch.isinf(b2).all(): return b2 else: return b1 + b2 has_constraints = False for root in roots: root.lb = root.ub = None if root.lA is None and root.uA is None: continue root.lb, root.ub, has_constraints_this_root = self.concretize_root( root, batch_size, output_dim, average_A=average_A, node_start=node_start, input_shape=roots[0].center.shape, use_constraints=use_constraints, heuristic_indices=heuristic_indices, save_bounds=save_bounds) has_constraints = has_constraints | has_constraints_this_root lb = add_b(lb, root.lb) ub = add_b(ub, root.ub) return lb, ub, has_constraints def forward_concretize(self, lower, upper, lw, uw, use_constraints=False, heuristic_indices=None, save_bounds=False): """ Concretize function for forward bound. :param lower: Tensor. Intermediate layer lower bounds. :param upper: Tensor. Intermediate layer upper bounds. :param lw: Tensor. Intermediate layer lower A matrix. :param uw: Tensor. Intermediate layer upper A matrix. :param use_constraints: bool. A flag controling whether to enable constraints solving or not. If heuristic ratio is set, the first concretization run should disbale constraints solving. :param heuristic_indices: Index Tensor. A index tensor, it select **equal** number of hidden neurons from each batch. Constrained solving will be further applied on these neurons. Shape (batchsize, n_h_neurons) :param save_bounds: bool. A flag controling whether to save naive bounds. :return res_lower: Tensor. 
The lower bound tensor. :return res_upper: Tensor. The upper bound tensor. :return has_constraints: bool. Whether constraints has been stored. """ res_lower = 0.0 res_upper = 0.0 prev_dim_in = 0 has_constraints = False roots = self.roots() assert (lw.ndim > 1) lA = lw.reshape(self.batch_size, self.dim_in, -1).transpose(1, 2) uA = uw.reshape(self.batch_size, self.dim_in, -1).transpose(1, 2) for root in roots: if hasattr(root, 'perturbation') and root.perturbation is not None: _lA = lA[:, :, prev_dim_in : (prev_dim_in + root.dim)] _uA = uA[:, :, prev_dim_in : (prev_dim_in + root.dim)] if isinstance(root.perturbation, PerturbationLpNorm): root.perturbation.constraints_enable = use_constraints if root.perturbation.constraints is not None: if self.infeasible_bounds_constraints is not None: root.perturbation.add_infeasible_batches(self.infeasible_bounds_constraints) root.perturbation.add_objective_indices(heuristic_indices) has_constraints = True # Previously added concretized bounds directly to lower/upper. # Now extract them first for reuse (e.g., in aux_bounds). temp_lower = root.perturbation.concretize( root.center, _lA, sign=-1, aux=root.aux ).view(lower.shape) temp_upper = root.perturbation.concretize( root.center, _uA, sign=+1, aux=root.aux ).view(upper.shape) # Update infeasible_batches if (isinstance(root.perturbation, PerturbationLpNorm) and root.perturbation.constraints is not None and root.perturbation.sorted_out_batches["infeasible_batches"] is not None): if self.infeasible_bounds_constraints is not None: self.infeasible_bounds_constraints = self.infeasible_bounds_constraints | root.perturbation.sorted_out_batches["infeasible_batches"] # else: # self.infeasible_bounds_constraints = root.perturbation.sorted_out_batches["infeasible_batches"] # If required, save current (naive) bounds to prevent redundant computation next time concretize on the same node if isinstance(root.perturbation, PerturbationLpNorm) and root.perturbation.constraints is not None and save_bounds: root.perturbation.add_aux_bounds(temp_lower, temp_upper) elif isinstance(root.perturbation, PerturbationLpNorm): # Otherwise, always clear_aux_bounds to prevent confusion root.perturbation.clear_aux_bounds() # Now the concretization result from this root will be accumulated into final bounds. # Here we add temp_lower onto res_lower, instead of lower. # It's because the lower value will be used twice, any modification to it should be avoided. res_lower = res_lower + temp_lower res_upper = res_upper + temp_upper res_lower = res_lower + lower res_upper = res_upper + upper return res_lower, res_upper, has_constraints ================================================ FILE: auto_LiRPA/concretize_func.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### import torch from math import floor, ceil from .utils import eyeC # Declaration of the shape naming: # B / batchsize : The number of batches. In this `concretize_func.py`, if a tensor has batch dimension, we assume # it will only be the first dimention of this tensor . That is: B = tensor.shape[0] # # B_act : The number of active batches. We will only apply constraints to a subset of batches, and these # batches are called active batches. B_act <= B. There are two cases: # -- When `no_return_inf` mode is disabled, we will keep B_act static throughout the entire # BaB iteration. In this case, B_act equals the number of batches not fully covered by # constraints, as determined by `sort_out_constr_batches` function. # -- When `no_return_inf` mode is enabled, then B_act decreases over iterations, since more # batches will be marked as infeasible. See `PerturbationLpNorm.add_infeasible_batches`. # # X / x_dim : The number of input neurons (batch dimension excluded). It stands for the input shape of the # neural network. For tensors such as x0, epsilon, x_U, x_L, X = prod(*tensor.shape[1:]) # # H / hidden_dim : The number of hidden neurons (batch and input dimension excluded). It stands for the output # shape of this hidden layer. For the objective A tensor, there are two cases: # -- The tensor has batch dimention: H = tensor.view(B, -1, X).shape[1] # -- The tensor does not have batch dimention: H = tensor.view(-1, X).shape[0] # # H_act : The number of active batches. We may only apply constraints to a subset of hidden neurons, # and these neurons are called active neurons. H_act <= H. # # N_constr : The number of constraints. For constraints_A matrix: # -- In `sort_out_constr_batches` function, its shape is (B, N_constr, X) # -- In `constraints_solving` function, its shape is (B_act, N_constt, X) def construct_constraints(constr_A: torch.Tensor, constr_b: torch.Tensor, constr_rhs: torch.Tensor, batchsize, x_dim, sign=1): r""" Construct the constraints tuple. This function provides a unified interface to generate this tuple. All the users should carefully read this function to fully understand the standard form of constraints. The first three argument expresses the non-standard form of the constraints: A @ x + b <= rhs We will first convert it into the standard form: A @ x + b' <= 0 The the standard expression of constraints should be (constr_A, constr_b') Args: constr_A: The coefficient A matrix of constraints. It should be able to be reshaped into: (B, N_constr, X) constr_b: The bias term of constraints. It should be able to be reshaped into: (B, N_constr) constr_rhs: The right-hand-side term of constraints. It should be able to be reshaped into: (B, N_constr) batchsize: The batchsize B. 
x_dim: The input dimension X (batchsize dimension excluded) """ constr_A = sign * constr_A.reshape((batchsize, -1, x_dim)) if constr_rhs is not None and not torch.all(constr_rhs == 0): constr_b = sign * (constr_b - constr_rhs).reshape((batchsize, -1)) else: constr_b = sign * constr_b.reshape((batchsize, -1)) return (constr_A, constr_b) def _sort_out_constraints(A, b, x0, epsilon): r""" Filter out some batches with constraints not intersecting with input region Args: A (Tensor): A matrix of constraints with shape of (batchsize, n_constraints, x_dim) b (Tensor): Bias term of constraints with shape of (batchsize, n_constraints) x0 (Tensor): Centroid of the input space with shape of (batchsize, x_dim, 1) epsilon (Tensor): Offset from the centroid to the input space boundary with shape of (batchsize, x_dim, 1) Return: no_intersection (Tensor): A boolean tensor with shape (batchsize, ), indicating if certain batch is infeasible because a constraint does not intersect with input space fully_covered (Tensor): A boolean tensor with shape (batchsize, ), indicating if all the constraints in a certain batch fully covers the corresponding input region. In this case, we can simply treat the batch as if it has no constraints """ # minimal and maximal value of A*x + b x0_term = A.bmm(x0).squeeze(-1) + b # shape: (B, N_constr) eps_term = A.abs().bmm(epsilon).squeeze(-1) # shape: (B, N_constr) minimal_val = x0_term - eps_term # shape: (B, N_constr) maximal_val = x0_term + eps_term # shape: (B, N_constr) # for any constrains: A * x + b <= 0, # if its min(A * x + b) > 0, it has no intersection with x0 +- epsilon # if its max(A * x + b) <= 0, it fully covers x0 +- epsilon no_intersection = (minimal_val > 0).any(1) # shape: (B, ) if not no_intersection.any(): no_intersection = None fully_covered = (maximal_val <= 0).all(1) # shape: (B, ) return no_intersection, fully_covered @torch.jit.script def _dist_rearrange(constraints_A, constraints_b, x0): r""" Reorder the constraints according to their distance to x_prime Args: constraints_A (Tensor): A matrix of constraints with shape of (batchsize, n_constraints, x_dim) constraints_b (Tensor): Bias term of constraints with shape of (batchsize, n_constraints) x0 (Tensor): x0 tensor with shape of (batchsize, x_dim, 1). Based on the heuristic, this can be the input space centroid x0, or the original optimal point x_prime Return: rearranged_A (Tensor): Rearranged matrix of constraints with shape of (batchsize, n_constraints, x_dim) rearranged_b (Tensor): Bias term of constraints with shape of (batchsize, n_constraints) """ # Compute the normalized, directional distance from x_prime to constraints hyper-plane. distance = (constraints_A.bmm(x0).squeeze(-1) + constraints_b) # shape: (B, N_constr) l2_norm = constraints_A.norm(p=2, dim=-1) # shape: (B, N_constr) normed_dist = distance / l2_norm # shape: (B, N_constr) # Sort the constraints according to this distance. order = torch.sort(normed_dist, descending=True, dim=1)[1] order_expand = order.unsqueeze(-1).expand(-1, -1, constraints_A.size(-1)) rearranged_A = constraints_A.gather(index=order_expand, dim=1) rearranged_b = constraints_b.gather(index=order, dim=1) return rearranged_A, rearranged_b @torch.jit.script def _solve_dual_var(constr_a, object_a, constr_d, epsilon, a_mul_e=None): r""" Solve the following optimization problem: Primal: min_x object_a^T x s.t. constr_a^T x + constr_d <= 0, x0-epsilon <= x <= x0+epsilon Dual: min_x max_beta object_a^T x + beta * (constr_a^T x + constr_d) s.t. 
x0 - epsilon <= x <= x0 + epsilon beta >= 0 Strong duality: max_{beta >= 0} min_{x \in X} object_a^T x + beta * (constr_a^T x + constr_d) Dual norm: max_{beta >= 0} - |object_a + beta * constr_a|^T epsilon + beta * (constr_a^T x0 + constr_d) + object_a^T x0 Now the sole optimize problem is piece-wise linear, we just have to check each turning point and the end points of beta (0 and +inf) Args: constr_a (Tensor): Constraint A matrix with shape of (batchsize, x_dim) object_a (Tensor): Objective A matrix with shape of (batchsize, h_dim, x_dim) constr_d (Tensor): Pre-computed bias term of constraint with shape of (batchsize, ) constr_d = constr_a^T x0 + constr_b epsilon (Tensor): Offset from the centroid to the input space boundary with shape of (batchsize, x_dim, 1) Return: optimal_beta (Tensor): The optimal beta value with shape of (batchsize, h_dim) """ B_act = constr_a.size(0) H_act = object_a.size(1) device = constr_a.device dtype = constr_a.dtype # --- prepare fill-in tensors zeros = torch.zeros((1, 1, 1), device=device, dtype=dtype).expand(B_act, H_act, 1) infs = torch.full((1, 1, 1), fill_value=torch.inf, dtype=dtype, device=device).expand(B_act, H_act, 1) a_reshape = constr_a.unsqueeze(1) # shape: (B_act, 1, X) epsilon_reshape = epsilon.view((B_act, 1, -1)) # shape: (B_act, 1, X) b_reshape = constr_d.view((-1, 1, 1)) # shape: (B_act, 1, 1) # q is the turning points of the piece-wise linear function. q = - object_a/a_reshape # shape: (B_act, H_act, X) # idx indicates the ascending order of these turning points. q_sort, idx = q.sort(dim=-1) # shape: (B_act, H_act, X) # --- calculating the gradient w.r.t. beta within each interval --- a_mul_e = (a_reshape * epsilon_reshape).expand(-1, H_act, -1) # (B_act, H_act, X) # a_mul_e = a_mul_e.expand(-1, H_act, -1) # (B_act, H_act, X) (B_act, H_act, X) a_sort = torch.gather(a_mul_e, dim=-1, index=idx) # (B_act, H_act, X) a_neg_cumsum = a_sort.abs().cumsum(dim=-1) # shape: (B_act, H, x_dim) a_neg_cumsum = torch.cat((zeros, a_neg_cumsum), dim=-1) # shape: (B_act, H_act, 1+X) a_pos_cumsum = a_neg_cumsum - a_neg_cumsum[:, :, -1:] # shape: (B_act, H_act, 1+X) grad_beta = a_pos_cumsum + a_neg_cumsum - b_reshape # shape: (B_act, H_act, 1+X) # Due to the non-increasing trait of grad_beta, if there is a turning point # then the gradient must change from positive to negative, and this turning point is the optimal beta. sign_change = torch.searchsorted(grad_beta, zeros, right=False) # It might be the case that grad_beta is always positive when beta > 0. # This means the maximization object is ever-increasing, hence it is unbounded. # For this case, a inf value would be returned. # Following comes a case of sign_change where all the turning points q are positive: # (g stands for grad_beta, q stands for turing points) # g[0] = 2 g[1] = 1 g[2] = -1 g[3] = -3 # 0 --------- q[0] --------- q[1] ----------- q[2] ----------- ... --------> +inf # ^ # sign_change=2 # # q should represent the interval endpoints, hence, need to pad the left and right end with 0 and inf separately. # cat shape: (B_act, H_act, 1+X+1) q_new = torch.cat((zeros, q_sort, infs), dim=-1) # shape: (B_act, H_act, X+2) optimal_beta = torch.gather(q_new, dim=-1, index=sign_change).clamp(min=0).squeeze(-1) # shape: (B_act, H_act) return optimal_beta def sort_out_constr_batches(x_L, x_U, constraints, rearrange_constraints=False, no_return_inf=False): r""" Filter and preprocess input batches based on constraint feasibility. 
This function examines which input regions 1) has no intersection with one of the constraints. 2) is fully covered by the all the constraints. It also optionally rearranges constraint order for better numerical behavior, and converts the constraint form from `(A, b)` to `(A, d)` where `d = A @ x0 + b`. Here x0 means the centroid of the input region, that is x0 = (x_L + x_U) / 2. Args: x_L (Tensor): Lower bound of input box, shape (B, *). x_U (Tensor): Upper bound of input box, shape (B, *). constraints (Tuple[Tensor, Tensor] or None): A tuple `(A, b)` representing per-batch linear constraints. - `A`: shape (B, N_constr, X) - `b`: shape (B, N_constr) If None or empty, the function returns early. rearrange_constraints (bool): Whether to rearrange constraints for better solver performance. Default: False. no_return_inf (bool): If True, infeasible batches will be excluded from `active_indices`. Otherwise, infeasible batches are still marked active. Default: False. Returns: constraints (Optional[Tuple[Tensor, Tensor]]): Filtered and reshaped constraint tuple `(A, d)` for active batches only. - `A`: shape (B_active, N_constr, X) - `d`: shape (B_active, N_constr) If all batches are fully covered, returns None. sorted_out_batches (dict): Diagnostic and filtering info with keys: - 'infeasible_batches' (BoolTensor): Shape (B,), True if batch has no feasible region. If all the elements are False, it would be None. This would save space and time. - 'fully_covered' (BoolTensor): Shape (B,), True if batch is completely covered by constraints. - 'active_indices' (LongTensor): Indices of batches that are neither fully covered nor infeasible. """ sorted_out_batches = None if constraints is None or constraints[0] is None or constraints[0].numel() == 0: return None, sorted_out_batches # Read argument and some necessary reshape assert x_L is not None and x_U is not None, "If constrained concretize is enabled, x_L and x_U cannot be None!" x0 = (x_L + x_U) / 2 epsilon = (x_U - x_L) / 2 constraints_A, constraints_b = constraints batch_size = x0.shape[0] x_dim = x0[0].numel() x0 = x0.view((batch_size, x_dim, 1)) # shape: (B, X, 1) epsilon = epsilon.view((batch_size, x_dim, 1)) # shape: (B, X, 1) no_intersection, fully_covered = _sort_out_constraints(constraints_A, constraints_b, x0, epsilon) if fully_covered.all(): print("All the added constraints fully cover the input space. No need to apply constraints .") return None, sorted_out_batches sorted_out_batches = {} sorted_out_batches["infeasible_batches"] = no_intersection # If there's no infeasible batch, simply set it to be None. # This will provide a shortcut when update the infeasible_batches vector. # When batchsize is large and NN model has a lot of perturbed roots, this can save us some time. sorted_out_batches["fully_covered"] = fully_covered active_mask = ~fully_covered if no_intersection is not None and no_return_inf: active_mask = ~no_intersection & active_mask active_indices = torch.nonzero(active_mask, as_tuple=True)[0] sorted_out_batches["active_indices"] = active_indices # Now constraints tuple only contains active constraints, shape change: (B, N_Constr, X) -> (B_act, N_constr, X) constraints_A = constraints_A[active_indices] # shape: (B_act, N_Constr, X) constraints_b = constraints_b[active_indices] # shape: (B_act, N_Constr) active_x0 = x0[active_indices] if rearrange_constraints: constraints_A, constraints_b = _dist_rearrange(constraints_A, constraints_b, active_x0) # Also, we will replace the constraint_b term with constraints_d term. 
# For the usage of constraints_d, please check _solve function and constraints_solving function. constraints_d = torch.einsum('bkx, bxo->bk', constraints_A, active_x0) + constraints_b # shape: (B_act, N_Constr) # Only store the constraints for active batches. constraints = (constraints_A, constraints_d) return constraints, sorted_out_batches def constraints_solving( x_L, x_U, objective, constraints, sign=-1.0, sorted_out_batches={}, objective_indices=None, constraints_enable=True, no_return_inf=False, max_chunk_size=None, safety_factor=0.8, solver_memory_factor=2.0, timer=None, aux_bounds=None, x0=None, epsilon=None, act_x0=None, act_eps=None, use_grad=True ): r""" Combined constraint solving function with conditional logic based on objective shape. - If objective is eyeC or broadcastable (shape[0]=1), uses a vectorized, auto-chunked approach. - If objective has batch dim matching input (shape[0]=N_batch), uses the original approach (repeating inputs, no chunking). Solves LP: max / min A_t * x, s.t. A_c * x + b_c <= 0, x_L <= x <= x_U Args: x_L, x_U (Tensor) : Input bounds tensors. objective (Tensor) : Target coefficients (Tensor or eyeC). - Tensor shape: (H, X), (1, H, X), or (N_batch, H, X). - eyeC: Represents identity matrix.W constraints (tuple, optional) : Tuple (A_c, d_c) or None. sign (float, optional) : -1.0 for lower bound, +1.0 for upper bound. sorted_out_batches (dict, optional): Dict with pre-filtered batch masks. Please check `sort_out_constr_batches` for more info. constraints_enable (bool, optional): Flag for enabling constraints solving, this is set for heuristic hybrid solving, should be True by default. no_return_inf (bool, optional) : Flag for returning inf value. If true, this function will return inf for all the infeasible subproblems. Otherwise, return naive bounds for infeasible ones. max_chunk_size, safety_factor, solver_memory_factor: Params for chunking memory. max_chunk_size: A hard upper limit on the number of problems to be processed in a single chunk, regardless of available memory. If set to an integer, the auto-calculated chunk size will not exceed this value. Use Case: Prevents the solver from creating a single, massive chunk that could cause system unresponsiveness, even if memory is technically available. Set to None to allow the function to use its own dynamic calculation. safety_factor: A float between 0.0 and 1.0 that specifies what fraction of the free GPU memory should be considered "usable" for the calculation. For example, a value of 0.8 means the function will only use 80% of the available free memory as its budget. Use Case: This buffer helps prevent "Out of Memory" (OOM) errors by accounting for memory fragmentation, memory used by other processes, or overhead from the CUDA driver itself. A lower value is safer but may result in smaller chunks and thus slower overall processing. solver_memory_factor: A heuristic multiplier used to estimate the memory consumed by the iterative solver loop. The theoretical memory usage is multiplied by this factor to create a more realistic estimate. Use Case: The exact memory allocated for intermediate tensors and computations within the solver can be complex to predict perfectly. This factor provides a "fudge factor" to pad the memory estimation, ensuring that the dynamically created tensors inside the solver loop do not cause an OOM error. Adjust this if you consistently face memory issues during the solver phase. 
objective_indices (Tensor, optional): Indices tensor of shape (N_batch, H_active) indicating which objectives to compute. If None, all are computed. timer: Optional Timer object. aux_bounds (Tensor, optional) : When hybrid constraint solving is enabled, the constraints_solving function will be called twice. For its second run, we load the result from the first run to save the time of recomputing naive bounds. x0, epsilon (Tensor, optional) : x0 and epsilon to solve on. If not given, they are computed from x_L and x_U. act_x0, act_eps (Tensor, optional): Active x0 and epsilon to solve on. use_grad (bool, optional): If False, the main computation is wrapped in `torch.no_grad()` for better performance and lower memory usage. Set to True only when gradients are required (e.g., for clip during alpha crown). Defaults to True. Returns: bound (Tensor): Computed bounds (N_batch, H, 1). infeasible_batches (BoolTensor, optional) : If no_return_inf is True, `infeasible_batches` will be returned. It is a boolean tensor with shape (batch_size, ), with True indicating the batch is infeasible. """ if timer: timer.start('init') if timer: timer.start("concretize") device = x_L.device N_batch = x_L.size(0) epsilon = (x_U - x_L) / 2.0 if epsilon is None else epsilon x0 = (x_U + x_L) / 2.0 if x0 is None else x0 epsilon = epsilon.reshape((N_batch, -1, 1)) x0 = x0.reshape((N_batch, -1, 1)) is_eyeC = isinstance(objective, eyeC) # --- Naive Case (No Constraints) --- no_constraints_condition = (constraints is None) or (constraints[0].numel() == 0) if no_constraints_condition or (not constraints_enable): if is_eyeC: solved_obj = x0 + sign * epsilon # Shape: (N_batch, X, 1) else: base_term = torch.einsum('bhx,bxo->bho', objective, x0) # Shape: (N_batch, H, 1) eps_term = torch.einsum('bhx,bxo->bho', objective.abs(), epsilon) # Shape: (N_batch, H, 1) solved_obj = base_term + sign * eps_term # Shape: (N_batch, H, 1) if timer: timer.add("init") if timer: timer.add("concretize") if no_return_inf: return solved_obj, None else: return solved_obj with torch.set_grad_enabled(use_grad): is_broadcastable = False is_batch_specific = False H = -1 # Hidden dimension X = x0.size(1) # Input X dimension if is_eyeC: is_broadcastable = True H = X # Internally represent eyeC as an identity matrix for the broadcastable path. objective_tensor = torch.eye(X, device=device).unsqueeze(0) # Shape (1, X, X) else: if objective.shape[0] != N_batch: # objective comes in shape (H, X) or (1, H, X). # It will be broadcast to (B, H, X) later. # Currently, is_broadcastable is designed for relu-bab, which usually takes a lot of GPU memory, # so is_broadcastable also acts as a control flag for objective chunking. is_broadcastable = True else: # objective comes in shape (B, H, X).
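# In this batch-specific case no objective chunking is applied. # For reference, the three objective layouts handled here are: eyeC (expanded to a (1, X, X) identity, broadcastable path), (H, X) or (1, H, X) (broadcastable path, shared across the batch and eligible for chunking over problems), and (N_batch, H, X) (batch-specific path, solved in a single chunk).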
is_batch_specific = True H = objective.shape[1] objective_tensor = objective if objective.shape[2] != X: raise ValueError("Objective shape mismatch") # --- Constrained Case --- # --- Calculate Naive Bounds (used as default/fallback) --- naive_bounds = torch.zeros(N_batch, H, 1, device=device) if aux_bounds is not None: naive_bounds_all = aux_bounds.flatten(1).unsqueeze(-1) elif is_eyeC: naive_bounds_all = x0 + sign * epsilon # Shape (N_batch, X, 1) -> (N_batch, H, 1) elif is_broadcastable: # obj_tensor is (1, H, X) base_term_naive = torch.einsum('shx,bxo->bho', objective_tensor, x0) eps_term_naive = torch.einsum('shx,bxo->bho', objective_tensor.abs(), epsilon) naive_bounds_all = base_term_naive + sign * eps_term_naive # Shape (N_batch, H, 1) elif is_batch_specific: # obj_tensor is (N, H, X) base_term_naive = torch.einsum('bhx,bxo->bho', objective_tensor, x0) eps_term_naive = torch.einsum('bhx,bxo->bho', objective_tensor.abs(), epsilon) naive_bounds_all = base_term_naive + sign * eps_term_naive # Shape (N_batch, H, 1) else: raise RuntimeError("Internal logic error in naive bound calculation") naive_bounds = naive_bounds_all # Assign calculated bounds # Final bounds tensor initialized as naive bounds final_bounds = naive_bounds fill_value_inf = torch.tensor(torch.inf if sign == -1.0 else -torch.inf, device=device) # --- Initial Batch Filtering (Common Logic) --- active_indices = sorted_out_batches.get("active_indices", None) if active_indices is None: fully_covered = sorted_out_batches.get("fully_covered", torch.zeros(N_batch, dtype=torch.bool, device=device)) active_batches_mask = ~fully_covered # Batches requiring solver if no_return_inf: infeasible_batches = sorted_out_batches.get("infeasible_batches", torch.zeros(N_batch, dtype=torch.bool, device=device)) active_batches_mask = ~infeasible_batches & active_batches_mask active_indices = torch.nonzero(active_batches_mask, as_tuple=True)[0] B_act = active_indices.numel() # Number of batches needing the solver. if timer: timer.add('init') # Combined timing for setup. # --- Early Exit if No Active Batches --- if B_act == 0: print(f"Constrained concretize: No active batches after filtering.") # Ensure non-active parts have naive bounds before returning. # (already done above by initializing with naive/inf) if timer: timer.add("concretize") final_bounds = naive_bounds if no_return_inf: return final_bounds, None else: return final_bounds constraints_A, constraints_d = constraints n_constraints = constraints_A.size(1) # --- Dynamic Chunk Size Calculation --- if is_batch_specific: # If objective is batch-specific, we do not chunk it. num_chunks = 1 final_chunk_size = B_act else: # This block dynamically estimates the optimal chunk size to maximize GPU # utilization while preventing out-of-memory (OOM) errors. 
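# Rough sketch of the estimate below (descriptive note, numbers illustrative only): usable_mem = free_gpu_mem * safety_factor; mem_per_item is approximated from the per-problem constraint rows, x0/epsilon copies, objective copy and solver temporaries (scaled by solver_memory_factor); then chunk_size = min(B_act, max_chunk_size, floor(usable_mem / mem_per_item)), clamped to at least 1. # For example, with about 8 GiB of usable memory and roughly 4 MiB estimated per problem, around 2000 problems would be processed per chunk.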
calculated_chunk_size = B_act free_mem, total_mem = torch.cuda.mem_get_info() usable_mem = free_mem * safety_factor obj_dtype = objective.dtype dtype_size = torch.finfo(obj_dtype).bits // 8 mem_constraints_per_item = (n_constraints * X + n_constraints) * dtype_size mem_x0eps_per_item = 2 * X * dtype_size mem_ori_c_per_item = H * X * dtype_size mem_dual_obj_per_item = H * dtype_size mem_solver_per_item_bh = H * (X + X + 1 + X + 1) * dtype_size * solver_memory_factor mem_masks_temps_per_item = H * 2 # approx mem_per_item_est = (mem_constraints_per_item + mem_x0eps_per_item + mem_ori_c_per_item + mem_dual_obj_per_item + mem_solver_per_item_bh + mem_masks_temps_per_item) * 5 if mem_per_item_est > 0: estimated_max_chunk = max(1, floor(usable_mem / mem_per_item_est)) calculated_chunk_size = min(B_act, estimated_max_chunk) if max_chunk_size is not None and max_chunk_size > 0: final_chunk_size = min(calculated_chunk_size, max_chunk_size) else: final_chunk_size = calculated_chunk_size final_chunk_size = max(1, final_chunk_size) # Ensure chunk size is at least 1. num_chunks = ceil(B_act / final_chunk_size) if no_return_inf: # Initialize infeasible_batches boolean mask to be None at first. # If an infeasible batch does occur later, we will then initialize it to be a actual vector. infeasible_batches = None for i_chunk in range(num_chunks): # --- Handle size and idx for this chunk --- chunk_start_idx_rel = i_chunk * final_chunk_size chunk_end_idx_rel = min(chunk_start_idx_rel + final_chunk_size, B_act) current_chunk_size = chunk_end_idx_rel - chunk_start_idx_rel if current_chunk_size == 0: continue chunk_indices_abs = active_indices[chunk_start_idx_rel:chunk_end_idx_rel] # --- Get matrices for this chunk --- constr_A_mat = constraints_A[chunk_start_idx_rel:chunk_end_idx_rel] # shape (B_act, n_constraints, X) constr_d_mat = constraints_d[chunk_start_idx_rel:chunk_end_idx_rel] # shape (B_act, n_constraints) if act_x0 is not None: x0_mat = act_x0[chunk_start_idx_rel:chunk_end_idx_rel] else: x0_mat = x0[chunk_indices_abs] # shape (B_act, X, 1) if act_eps is not None: eps_mat = act_eps[chunk_start_idx_rel:chunk_end_idx_rel] else: eps_mat = epsilon[chunk_indices_abs] # shape (B_act, X, 1) if is_broadcastable: ori_c_mat = objective_tensor.expand(current_chunk_size, H, X).clone() else: ori_c_mat = objective_tensor[chunk_indices_abs].clone() # shape: (B_act, H, X) if objective_indices is not None: # shape: (B, H_act) # Select the mask rows corresponding to the active batches in this chunk current_objective_indices = objective_indices[chunk_indices_abs] # shape: (B_act, H_act) idx_unsqueeze = current_objective_indices.unsqueeze(-1) # shape: (B_act, H_act, 1) idx_expand = idx_unsqueeze.expand(-1, -1, X) # shape: (B_act, H_act, X) ori_c_mat = ori_c_mat.gather(index=idx_expand, dim=1) # shape: (B_act, H_act, X) obj_mat = ori_c_mat # shape (B_act, H_act, X) # Initialize dual part and base part # Note that the final minimal value is: # objective^T x0 + base_part # constr_d_0 * beta_0 + constr_d_1 * beta_1 + ... 
+ dual_part 1 # - ( objective+ constr_a_0 * beta_0 + constr_a_1 * beta_1)^T epsilon dual_part 2 base_objective_term = torch.einsum('bhx,bxo->bh', obj_mat, x0_mat) # shape: (B_act, H_act) dual_objective_part = torch.zeros_like(base_objective_term) # shape: (B_act, H_act) # --- Initialize State for Vectorized Loop (Chunk) --- if sign == 1.0: # Adjust for minimization problem solved by _solve obj_mat *= -1.0 # shape (B_act, H_act, X) base_objective_term *= -1.0 # --- Vectorized Constraint Loop (Operating on Chunk) --- for k in range(n_constraints): constr_a_solve = constr_A_mat[:, k, :] # constraint A matrix shape (B_act, X) constr_d_solve = constr_d_mat[:, k] # related bias term shape (B_act,) epsilon_solve = eps_mat # epsilon shape (B_act, X) object_a_solve = obj_mat # objective matrix shape (B_act, H_act, X) with torch.no_grad(): # Otherwise, the gradients will mess up the alpha crown optimization. optimal_beta = _solve_dual_var(constr_a_solve, object_a_solve, constr_d_solve, epsilon_solve) # shape (B_act, H_act) # Accumulation for the parentheses term in dual part 2 obj_mat += optimal_beta.unsqueeze(-1) * constr_a_solve.unsqueeze(1) # shape (B_act, H_act, X) # (B_act, H_act, 1) (B_act, 1, X) # Accumulation of dual part 1 dual_objective_part += optimal_beta * constr_d_solve.unsqueeze(1) # shape (B_act, H_act) # (B_act, H_act) (B_act, 1) # --- End of k loop --- # --- Final Objective Calculation for Unfinished Items in Chunk --- final_obj_abs = obj_mat.abs() # shape: (B_act, H_act, X) final_eps_mat = eps_mat # shape: (B_act, X, 1) final_eps_term = torch.einsum('nhx,nxo->nh', final_obj_abs, final_eps_mat) # shape: (B_act, H_act) dual_objective_part -= final_eps_term # --- Combine terms and handle mask --- final_obj_minimized = base_objective_term + dual_objective_part # shape: (B_act, H_act) if sign == 1.0: final_obj_optimal = -final_obj_minimized # Flip sign back if maximizing. else: final_obj_optimal = final_obj_minimized # Previously we will handle infeasible batches after running through all the chunks, during processing final_bounds. # But that would require to create a copy of naive bounds # To save space and time, we will process final_obj_optimal final_obj_optimal = torch.nan_to_num(final_obj_optimal, nan=fill_value_inf.item(), posinf=fill_value_inf.item(), neginf=-fill_value_inf.item()) if no_return_inf: infeasible_batches_chunk = final_obj_optimal.isinf().any(1) if infeasible_batches_chunk.any(): # Note that infeasible_batches was initialized as None infeasible_batches = torch.full((N_batch, ), fill_value=False, device=device, dtype=torch.bool) if infeasible_batches is None else infeasible_batches infeasible_batches[chunk_indices_abs] = infeasible_batches_chunk # Set the bounds of infeasible batches to be naive bounds infeasible_batches_chunk_indices_abs = chunk_indices_abs[infeasible_batches_chunk] if objective_indices is not None: naive_bounds_chunk = naive_bounds[infeasible_batches_chunk_indices_abs].squeeze(-1) # Get the infeasible objective indices for this chunk. 
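# Note: when objective_indices is given, final_obj_optimal only holds the selected objectives (H_act columns per batch), so the fallback naive bounds must be gathered with the same per-batch objective indices before overwriting the infeasible rows.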
current_infeasible_objective_indices = current_objective_indices[infeasible_batches_chunk] final_obj_optimal[infeasible_batches_chunk] = torch.gather(naive_bounds_chunk, dim=1, index=current_infeasible_objective_indices) else: final_obj_optimal[infeasible_batches_chunk] = naive_bounds[infeasible_batches_chunk_indices_abs].squeeze(-1) # Put the result of this chunk back into the overall final bounds if objective_indices is not None: final_bounds_active_chunk = final_bounds[chunk_indices_abs] final_bounds_active_chunk.scatter_(dim=1, index=idx_unsqueeze, src=final_obj_optimal.unsqueeze(-1)) final_bounds[chunk_indices_abs] = final_bounds_active_chunk else: final_bounds[chunk_indices_abs] = final_obj_optimal.unsqueeze(-1) if no_return_inf: if timer: timer.add("concretize") return final_bounds, infeasible_batches else: if timer: timer.add("concretize") return final_bounds ================================================ FILE: auto_LiRPA/cuda/cuda_kernels.cu ================================================ #include <torch/extension.h> #include <cuda.h> #include <cuda_runtime.h> #include <vector> __global__ void cuda_double2float_rd_kernel(const double* __restrict__ inputs, float* __restrict__ outputs, const size_t tensor_size) { const int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < tensor_size) { outputs[idx] = __double2float_rd(inputs[idx]); } } __global__ void cuda_double2float_ru_kernel(const double* __restrict__ inputs, float* __restrict__ outputs, const size_t tensor_size) { const int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < tensor_size) { outputs[idx] = __double2float_ru(inputs[idx]); } } torch::Tensor cuda_double2float_forward(torch::Tensor input, const std::string direction) { auto total_elem = input.numel(); auto output = torch::empty_like(input, torch::ScalarType::Float); const int threads = 1024; const int blocks = (total_elem + threads - 1) / threads; if (direction == "down") { cuda_double2float_rd_kernel<<<blocks, threads>>>(input.data_ptr<double>(), output.data_ptr<float>(), total_elem); } else { cuda_double2float_ru_kernel<<<blocks, threads>>>(input.data_ptr<double>(), output.data_ptr<float>(), total_elem); } return output; } ================================================ FILE: auto_LiRPA/cuda/cuda_utils.cpp ================================================ #include <torch/extension.h> #include <vector> #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") torch::Tensor cuda_double2float_forward( torch::Tensor input, const std::string direction); torch::Tensor double2float_foward( torch::Tensor input, const std::string direction) { TORCH_CHECK((direction == "down") || (direction == "up"), "Unsupported direction, must be down or up."); TORCH_CHECK(input.type().scalarType() == torch::ScalarType::Double, "This function only supports DoubleTensor as inputs."); CHECK_CUDA(input); return cuda_double2float_forward(input, direction); } /* * Usage: double2float(tensor, direction) * "tensor" must be a DoubleTensor on GPU. * "direction" is a string, can be "up" (round up) or "down" (round down).
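* Example (illustrative, via the Python binding exposed below): lo = double2float(x, "down") gives lo <= x elementwise and hi = double2float(x, "up") gives hi >= x elementwise. * Rounding lower bounds down and upper bounds up keeps float32 interval bounds sound when narrowing float64 tensors.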
*/ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("double2float", &double2float_foward, "Convert double to float with rounding direction control (direction = 'up' or 'down')."); } ================================================ FILE: auto_LiRPA/cuda_utils.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### import os import sys import torch from torch.utils.cpp_extension import load, BuildExtension, CUDAExtension from setuptools import setup class DummyCudaClass: """A dummy class with error message when a CUDA function is called.""" def __getattr__(self, attr): if attr == "double2float": # When CUDA module is not built successfully, use a workaround. def _f(x, d): print('WARNING: Missing CUDA kernels. Please enable CUDA build by setting environment variable AUTOLIRPA_ENABLE_CUDA_BUILD=1 for the correct behavior!') return x.float() return _f def _f(*args, **kwargs): raise RuntimeError(f"method {attr} not available because CUDA module was not built.") return _f if __name__ == "__main__" and len(sys.argv) > 1: # Build and install native CUDA modules that can be directly imported later print('Building and installing native CUDA modules...') setup( name='auto_LiRPA_cuda_utils', ext_modules=[CUDAExtension('auto_LiRPA_cuda_utils', [ 'auto_LiRPA/cuda/cuda_utils.cpp', 'auto_LiRPA/cuda/cuda_kernels.cu' ])], cmdclass={'build_ext': BuildExtension.with_options()}, ) exit(0) if torch.cuda.is_available() and os.environ.get('AUTOLIRPA_ENABLE_CUDA_BUILD', False): try: import auto_LiRPA_cuda_utils as _cuda_utils except: print('CUDA modules have not been installed') try: print('Building native CUDA modules...') code_dir = os.path.dirname(os.path.abspath(__file__)) verbose = os.environ.get('AUTOLIRPA_DEBUG_CUDA_BUILD', None) is not None _cuda_utils = load( 'cuda_utils', [os.path.join(code_dir, 'cuda', 'cuda_utils.cpp'), os.path.join(code_dir, 'cuda', 'cuda_kernels.cu')], verbose=verbose) print('CUDA modules have been built.') except: print('CUDA module build failure. Some features will be unavailable.') print('Please make sure the latest CUDA toolkit is installed in your system.') if verbose: print(sys.exc_info()[2]) else: print('Set environment variable AUTOLIRPA_DEBUG_CUDA_BUILD=1 to view build log.') _cuda_utils = DummyCudaClass() else: if os.environ.get('AUTOLIRPA_ENABLE_CUDA_BUILD', False): print('CUDA unavailable. Some features are disabled.') _cuda_utils = DummyCudaClass() double2float = _cuda_utils.double2float def test_double2float(): # Test the double2float function. 
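# The element-wise checks below assert that the round-down result never exceeds the original value and that the round-up result never falls below it (soundness of directed rounding), then benchmark both kernels against PyTorch's default .float() conversion.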
import time shape = (3,4,5) a = torch.randn(size=shape, dtype=torch.float64, device='cuda') a = a.transpose(0,1) au = _cuda_utils.double2float(a, "up") ad = _cuda_utils.double2float(a, "down") print(a.size(), au.size(), ad.size()) a_flatten = a.reshape(-1) au_flatten = au.reshape(-1) ad_flatten = ad.reshape(-1) for i in range(a_flatten.numel()): ai = a_flatten[i].item() aui = au_flatten[i].item() adi = ad_flatten[i].item() print(adi, ai, aui) assert adi <= ai assert aui >= ai del a, au, ad, a_flatten, au_flatten, ad_flatten # Performance benchmark. for j in [1, 4, 16, 64, 256, 1024]: shape = (j, 512, 1024) print(f'shape: {shape}') t = torch.randn(size=shape, dtype=torch.float64, device='cuda') torch.cuda.synchronize() start_time = time.time() for i in range(10): tt = t.float() torch.cuda.synchronize() del tt pytorch_time = time.time() - start_time print(f'pytorch rounding time: {pytorch_time:.4f}') torch.cuda.synchronize() start_time = time.time() for i in range(10): tu = _cuda_utils.double2float(t, "up") torch.cuda.synchronize() del tu roundup_time = time.time() - start_time print(f'cuda round up time: {roundup_time:.4f}') torch.cuda.synchronize() start_time = time.time() for i in range(10): td = _cuda_utils.double2float(t, "down") torch.cuda.synchronize() del td rounddown_time = time.time() - start_time print(f'cuda round down time: {rounddown_time:.4f}') del t if __name__ == "__main__": if len(sys.argv) == 1: # Some tests. It's not possible to test them automatically because travis does not have CUDA. test_double2float() ================================================ FILE: auto_LiRPA/edit_graph.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### """Edit the computational graph in BoundedModule.""" from auto_LiRPA.bound_ops import Bound from typing import TYPE_CHECKING if TYPE_CHECKING: from .bound_general import BoundedModule # Make sure the nodes already have `name` and `input_name` def add_nodes(self: 'BoundedModule', nodes): # TODO check duplicate names nodes = [(node if isinstance(node, Bound) else node.bound_node) for node in nodes] for node in nodes: if node.name in self._modules: raise NameError(f'Node with name {node.name} already exists') self._modules[node.name] = node node.output_name = [] if len(node.inputs) == 0: self.root_names.append(node.name) for node in nodes: for l_pre in node.inputs: l_pre.output_name.append(node.name) if (getattr(node, 'has_constraint', False) and node.name not in self.layers_with_constraint): self.layers_with_constraint.append(node.name) def add_input_node(self: 'BoundedModule', node, index=None): self.add_nodes([node]) self.input_name.append(node.name) # default value for input_index if index == 'auto': index = max([0] + [(i + 1) for i in self.input_index if i is not None]) self.input_index.append(index) def delete_node(self: 'BoundedModule', node): for node_inp in node.inputs: node_inp.output_name.pop(node_inp.output_name.index(node.name)) self._modules.pop(node.name) # TODO Create a list to contain all such lists such as # "relus" and "optimizable_activations" self.relus = [ item for item in self.relus if item != node] self.optimizable_activations = [ item for item in self.optimizable_activations if item != node] def replace_node(self: 'BoundedModule', node_old, node_new): assert node_old != node_new for node in self.nodes(): for i in range(len(node.inputs)): if node.inputs[i] == node_old: node.inputs[i] = node_new node_new.output_name += node_old.output_name if self.final_name == node_old.name: self.final_name = node_new.name for i in range(len(self.output_name)): if self.output_name[i] == node_old.name: self.output_name[i] = node_new.name self.delete_node(node_old) ================================================ FILE: auto_LiRPA/eps_scheduler.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### import random from .utils import logger class BaseScheduler(object): def __init__(self, max_eps, opt_str): self.parse_opts(opt_str) self.prev_loss = self.loss = self.max_eps = self.epoch_length = float("nan") self.eps = 0.0 self.max_eps = max_eps self.is_training = True self.epoch = 0 self.batch = 0 def __repr__(self): return ''.format(self.eps, self.max_eps) def parse_opts(self, s): opts = s.split(',') self.params = {} for o in opts: if o.strip(): key, val = o.split('=') self.params[key] = val def get_max_eps(self): return self.max_eps def get_eps(self): return self.eps def reached_max_eps(self): return abs(self.eps - self.max_eps) < 1e-3 def step_batch(self, verbose=False): if self.is_training: self.batch += 1 return def step_epoch(self, verbose=False): if self.is_training: self.epoch += 1 return def update_loss(self, new_loss): self.prev_loss = self.loss self.loss = new_loss def train(self): self.is_training = True def eval(self): self.is_training = False # Set how many batches in an epoch def set_epoch_length(self, epoch_length): self.epoch_length = epoch_length class FixedScheduler(BaseScheduler): def __init__(self, max_eps, opt_str=""): super(FixedScheduler, self).__init__(max_eps, opt_str) self.eps = self.max_eps class LinearScheduler(BaseScheduler): def __init__(self, max_eps, opt_str): super(LinearScheduler, self).__init__(max_eps, opt_str) self.schedule_start = int(self.params['start']) self.schedule_length = int(self.params['length']) self.epoch_start_eps = self.epoch_end_eps = 0 def __repr__(self): return ''.format( self.epoch_start_eps, self.epoch_end_eps) def step_epoch(self, verbose = True): self.epoch += 1 self.batch = 0 if self.epoch < self.schedule_start: self.epoch_start_eps = 0 self.epoch_end_eps = 0 else: eps_epoch = self.epoch - self.schedule_start if self.schedule_length == 0: self.epoch_start_eps = self.epoch_end_eps = self.max_eps else: eps_epoch_step = self.max_eps / self.schedule_length self.epoch_start_eps = min(eps_epoch * eps_epoch_step, self.max_eps) self.epoch_end_eps = min((eps_epoch + 1) * eps_epoch_step, self.max_eps) self.eps = self.epoch_start_eps if verbose: logger.info("Epoch {:3d} eps start {:7.5f} end {:7.5f}".format(self.epoch, self.epoch_start_eps, self.epoch_end_eps)) def step_batch(self): if self.is_training: self.batch += 1 eps_batch_step = (self.epoch_end_eps - self.epoch_start_eps) / self.epoch_length self.eps = self.epoch_start_eps + eps_batch_step * (self.batch - 1) if self.batch > self.epoch_length: logger.warning('Warning: we expect {} batches in this epoch but this is batch {}'.format(self.epoch_length, self.batch)) self.eps = self.epoch_end_eps class RangeScheduler(BaseScheduler): def __init__(self, max_eps, opt_str): super(RangeScheduler, self).__init__(max_eps, opt_str) self.schedule_start = int(self.params['start']) self.schedule_length = int(self.params['length']) def __repr__(self): return ''.format( self.schedule_start, self.schedule_start + self.schedule_length) def step_epoch(self, verbose = True): self.epoch += 1 if self.epoch >= self.schedule_start and self.epoch < self.schedule_start + self.schedule_length: self.eps = self.max_eps else: self.eps = 0 def step_batch(self): pass class BiLinearScheduler(LinearScheduler): def __init__(self, max_eps, opt_str): super(BiLinearScheduler, self).__init__(max_eps, opt_str) self.schedule_start = int(self.params['start']) self.schedule_length = int(self.params['length']) self.schedule_length_half = 
self.schedule_length / 2 self.epoch_start_eps = self.epoch_end_eps = 0 def __repr__(self): return ''.format( self.epoch_start_eps, self.epoch_end_eps) def step_epoch(self, verbose = True): self.epoch += 1 self.batch = 0 if self.epoch < self.schedule_start: self.epoch_start_eps = 0 self.epoch_end_eps = 0 else: eps_epoch = self.epoch - self.schedule_start eps_epoch_step = self.max_eps / self.schedule_length_half if eps_epoch < self.schedule_length_half: self.epoch_start_eps = min(eps_epoch * eps_epoch_step, self.max_eps) self.epoch_end_eps = min((eps_epoch + 1) * eps_epoch_step, self.max_eps) else: self.epoch_start_eps = max(0, self.max_eps - ((eps_epoch - self.schedule_length_half) * eps_epoch_step)) self.epoch_end_eps = max(0, self.epoch_start_eps - eps_epoch_step) self.eps = self.epoch_start_eps if verbose: logger.info("Epoch {:3d} eps start {:7.5f} end {:7.5f}".format(self.epoch, self.epoch_start_eps, self.epoch_end_eps)) class SmoothedScheduler(BaseScheduler): def __init__(self, max_eps, opt_str): super(SmoothedScheduler, self).__init__(max_eps, opt_str) # Epoch number to start schedule self.schedule_start = int(self.params['start']) # Epoch length for completing the schedule self.schedule_length = int(self.params['length']) # Mid point to change exponential to linear schedule self.mid_point = float(self.params.get('mid', 0.25)) # Exponential self.beta = float(self.params.get('beta', 4.0)) assert self.beta >= 2. assert self.mid_point >= 0. and self.mid_point <= 1. self.batch = 0 # Set how many batches in an epoch def set_epoch_length(self, epoch_length): if self.epoch_length != self.epoch_length: self.epoch_length = epoch_length else: if self.epoch_length != epoch_length: raise ValueError("epoch_length must stay the same for SmoothedScheduler") def step_epoch(self, verbose = True): super(SmoothedScheduler, self).step_epoch() # FIXME if verbose == False: for i in range(self.epoch_length): self.step_batch() # Smooth schedule that slowly morphs into a linear schedule. # Code is based on DeepMind's IBP implementation: # https://github.com/deepmind/interval-bound-propagation/blob/2c1a56cb0497d6f34514044877a8507c22c1bd85/interval_bound_propagation/src/utils.py#L84 def step_batch(self, verbose=False): if self.is_training: self.batch += 1 init_value = 0.0 final_value = self.max_eps beta = self.beta step = self.batch - 1 # Batch number for schedule start init_step = (self.schedule_start - 1) * self.epoch_length # Batch number for schedule end final_step = (self.schedule_start + self.schedule_length - 1) * self.epoch_length # Batch number for switching from exponential to linear schedule mid_step = int((final_step - init_step) * self.mid_point) + init_step t = (mid_step - init_step) ** (beta - 1.) 
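# Worked example (illustrative numbers, assuming 100 batches per epoch with start=2, length=10, mid=0.25, beta=4): init_step = 100, mid_step = 350, final_step = 1100, so eps stays at 0 for roughly the first 100 batches, grows polynomially (degree beta) for the next 250 batches, then ramps linearly and reaches max_eps around batch 1100.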
# find coefficient for exponential growth, such that at mid point the gradient is the same as a linear ramp to final value alpha = (final_value - init_value) / ((final_step - mid_step) * beta * t + (mid_step - init_step) * t) # value at switching point mid_value = init_value + alpha * (mid_step - init_step) ** beta # return init_value when we have not started is_ramp = float(step > init_step) # linear schedule after mid step is_linear = float(step >= mid_step) exp_value = init_value + alpha * float(step - init_step) ** beta linear_value = min(mid_value + (final_value - mid_value) * (step - mid_step) / (final_step - mid_step), final_value) self.eps = is_ramp * ((1.0 - is_linear) * exp_value + is_linear * linear_value) + (1.0 - is_ramp) * init_value class AdaptiveScheduler(BaseScheduler): def __init__(self, max_eps, opt_str): super(AdaptiveScheduler, self).__init__(max_eps, opt_str) self.schedule_start = int(self.params['start']) self.min_eps_step = float(self.params.get('min_step', 1e-9)) self.max_eps_step = float(self.params.get('max_step', 1e-4)) self.eps_increase_thresh = float(self.params.get('increase_thresh', 1.0)) self.eps_increase_factor = float(self.params.get('increase_factor', 1.5)) self.eps_decrease_thresh = float(self.params.get('decrease_thresh', 1.5)) self.eps_decrease_factor = float(self.params.get('decrease_factor', 2.0)) self.small_loss_thresh = float(self.params.get('small_loss_thresh', 0.05)) self.epoch = 0 self.eps_step = self.min_eps_step def step_batch(self): if self.eps < self.max_eps and self.epoch >= self.schedule_start and self.is_training: if self.loss != self.loss or self.prev_loss != self.prev_loss: # First 2 steps. Use min eps step self.eps += self.min_eps_step else: # loss decreasing or loss very small. Increase eps step if self.loss < self.eps_increase_thresh * self.prev_loss or self.loss < self.small_loss_thresh: self.eps_step = min(self.eps_step * self.eps_increase_factor, self.max_eps_step) # loss increasing. 
Decrease eps step elif self.loss > self.eps_decrease_thresh * self.prev_loss: self.eps_step = max(self.eps_step / self.eps_decrease_factor, self.min_eps_step) # print("loss {:7.5f} prev_loss {:7.5f} eps_step {:7.5g}".format(self.loss, self.prev_loss, self.eps_step)) # increase eps according to loss self.eps = min(self.eps + self.eps_step, self.max_eps) # print("eps step size {:7.5f}, eps {:7.5f}".format(self.eps_step, self.eps)) if __name__ == "__main__": s = SmoothedScheduler(0.1, "start=2,length=10,mid=0.3") epochs = 20 batches = 10 loss = 1.0 eps = [] s.set_epoch_length(batches) for epoch in range(1,epochs+1): s.step_epoch() for batch in range(1,batches+1): s.step_batch() loss = loss * (0.975 + random.random() / 20) eps.append(s.get_eps()) print('epoch {:5d} batch {:5d} eps {:7.5f} loss {:7.5f}'.format(epoch, batch, s.get_eps(), loss)) # update_loss is only necessary for adaptive eps scheduler s.update_loss(loss) # plot epsilon values import matplotlib matplotlib.use('Agg') from matplotlib import pyplot as plt plt.figure(figsize=(10,8)) plt.plot(eps) plt.xticks(range(0, epochs*batches+batches, batches)) plt.grid() plt.tight_layout() plt.savefig('epsilon.pdf') ================================================ FILE: auto_LiRPA/forward_bound.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### import torch import warnings from .bound_ops import * from .utils import * from .linear_bound import LinearBound from .perturbations import PerturbationLpNorm from typing import TYPE_CHECKING if TYPE_CHECKING: from .bound_general import BoundedModule import sys sys.setrecursionlimit(1000000) def forward_general(self: 'BoundedModule', C=None, node:'Bound'=None, concretize=False, offset=0, from_node=False): if self.dynamic: return self.forward_general_dynamic(C=C, node=node, concretize=concretize, offset=offset) if C is None: if (hasattr(node, 'linear') and node.linear.lower is not None and node.linear.upper is not None): return node.linear.lower, node.linear.upper if not node.from_input: node.linear = LinearBound(None, node.value, None, node.value, node.value, node.value) return node.value, node.value if not node.perturbed: node.lower = node.upper = self.get_forward_value(node) if node.is_lower_bound_current(): node.linear = LinearBound(None, node.lower, None, node.upper, node.lower, node.upper) return node.lower, node.upper for l_pre in node.inputs: if not hasattr(l_pre, 'linear'): self.forward_general(node=l_pre, offset=offset, from_node=from_node) inp = [l_pre.linear for l_pre in node.inputs] node._start = '_forward' if (C is not None and type(node) is BoundLinear and not node.is_input_perturbed(1) and not node.is_input_perturbed(2)): linear = node.bound_forward(self.dim_in, *inp, C=C) C_merged = True else: linear = node.linear = node.bound_forward(self.dim_in, *inp) C_merged = False lw, uw = linear.lw, linear.uw lower, upper = linear.lb, linear.ub # Combine linear bounds with C matrix if C is not None and not C_merged: # FIXME use bound_forward of BoundLinear C_pos, C_neg = C.clamp(min=0), C.clamp(max=0) # Flatten lw, uw for matrix multiplication lw = lw.reshape(self.batch_size, self.dim_in, -1) uw = uw.reshape(self.batch_size, self.dim_in, -1) _lw = torch.matmul(lw, C_pos.transpose(-1, -2)) + torch.matmul(uw, C_neg.transpose(-1, -2)) _uw = torch.matmul(uw, C_pos.transpose(-1, -2)) + torch.matmul(lw, C_neg.transpose(-1, -2)) lw, uw = _lw, _uw # Flatten lower, upper for matrix multiplication lower = lower.reshape(self.batch_size, -1) upper = upper.reshape(self.batch_size, -1) _lower = ( torch.matmul(lower.unsqueeze(1), C_pos.transpose(-1, -2)) + torch.matmul(upper.unsqueeze(1), C_neg.transpose(-1, -2)) ) _upper = ( torch.matmul(upper.unsqueeze(1), C_pos.transpose(-1, -2)) + torch.matmul(lower.unsqueeze(1), C_neg.transpose(-1, -2)) ) lower, upper = _lower.squeeze(1), _upper.squeeze(1) logger.debug(f'Forward bounds to {node}') if concretize: if lw is not None or uw is not None: lower, upper = self.concretize_bounds( node=node, lower=lower, upper=upper, concretize_mode='forward', lw=lw, uw=uw, clip_neuron_selection_value=self.clip_neuron_selection_value, clip_neuron_selection_type=self.clip_neuron_selection_type ) linear.lower, linear.upper = lower, upper if C is None: node.linear = linear node.lower, node.upper = lower, upper if self.bound_opts['forward_refinement']: need_refinement = False for out in node.output_name: out_node = self[out] for i in getattr(out_node, 'requires_input_bounds', []): if out_node.inputs[i] == node: need_refinement = True break if need_refinement: self.forward_refinement(node) return lower, upper def forward_general_dynamic(self: 'BoundedModule', C=None, node:'Bound'=None, concretize=False, offset=0): max_dim = self.bound_opts['forward_max_dim'] if C is None: if hasattr(node, 
'linear'): assert not concretize linear = node.linear if offset == 0: if linear.lw is None: return linear elif linear.lw.shape[1] <= max_dim: return linear if linear.lw is not None: lw = linear.lw[:, offset:offset+max_dim] x_L = linear.x_L[:, offset:offset+max_dim] x_U = linear.x_U[:, offset:offset+max_dim] tot_dim = linear.tot_dim if offset == 0: lb = linear.lb else: lb = torch.zeros_like(linear.lb) else: lw = x_L = x_U = None tot_dim = 0 lb = linear.lb return LinearBound( lw, lb, lw, lb, x_L=x_L, x_U=x_U, offset=offset, tot_dim=tot_dim, ) # These cases have no coefficient tensor if not node.from_input: if concretize: return node.value, node.value else: node.linear = LinearBound( None, node.value, None, node.value, node.value, node.value) return node.linear if not node.perturbed: if not node.is_lower_bound_current(): node.lower = node.upper = self.get_forward_value(node) if concretize: return node.lower, node.upper else: if offset > 0: lb = torch.zeros_like(node.lower) else: lb = node.lower node.linear = LinearBound(None, lb, None, lb, node.lower, node.upper) return node.linear if offset == 0: logger.debug(f'forward_general_dynamic: node={node}') inp = [] for l_pre in node.inputs: linear_inp = self.forward_general_dynamic(node=l_pre, offset=offset) linear_inp.lower = l_pre.lower linear_inp.upper = l_pre.upper inp.append(linear_inp) node._start = '_forward' if (C is not None and isinstance(node, BoundLinear) and not node.is_input_perturbed(1) and not node.is_input_perturbed(2)): linear = node.bound_dynamic_forward( *inp, C=C, max_dim=max_dim, offset=offset) C_merged = True else: linear = node.bound_dynamic_forward( *inp, max_dim=max_dim, offset=offset) C_merged = False if offset > 0: linear.lb = linear.ub = torch.zeros_like(linear.lb) lw, lb, tot_dim = linear.lw, linear.lb, linear.tot_dim #logger.debug(f'forward_general_dynamic: node={node}, w_size={lw.shape[1]}, tot_dim={tot_dim}') if C is not None and not C_merged: # FIXME use bound_forward of BoundLinear lw = torch.matmul(lw, C.transpose(-1, -2)) lb = torch.matmul(lb.unsqueeze(1), C.transpose(-1, -2)).squeeze(1) if concretize: lower = upper = lb if lw is not None: batch_size = lw.shape[0] assert (lw.ndim > 1) if lw.shape[1] > 0: A = lw.reshape(batch_size, lw.shape[1], -1).transpose(1, 2) ptb = PerturbationLpNorm(x_L=linear.x_L, x_U=linear.x_U) lower = lower + ptb.concretize(x=None, A=A, sign=-1).view(lb.shape) upper = upper + ptb.concretize(x=None, A=A, sign=1).view(lb.shape) offset_next = offset + max_dim more = offset_next < tot_dim else: more = False if C is None and offset == 0 and not more: node.linear = linear if more: if lw is not None and lw.shape[1] > 0: del A del ptb del lw del linear del inp # TODO make it non-recursive lower_next, upper_next = self.forward_general_dynamic( C, node, concretize=True, offset=offset_next) lower = lower + lower_next upper = upper + upper_next if C is None: node.lower, node.upper = lower, upper return lower, upper else: return linear def clean_memory(self: 'BoundedModule', node): """ Remove linear bounds that are no longer needed. 
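A node's cached `linear` bound can be dropped once every node in its `output_name` list has its own `linear` attribute computed, since forward propagation will not read it again.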
""" # TODO add an option to retain these bounds for inp in node.inputs: if hasattr(inp, 'linear') and inp.linear is not None: clean = True for out in inp.output_name: out_node = self[out] if not (hasattr(out_node, 'linear') and out_node.linear is not None): clean = False if clean: if isinstance(inp.linear, tuple): for item in inp.linear: del item delattr(inp, 'linear') def forward_refinement(self: 'BoundedModule', node): """ Refine forward bounds with backward bound propagation (only refine unstable positions). """ unstable_size_before = torch.logical_and(node.lower < 0, node.upper > 0).sum() if unstable_size_before == 0: return unstable_idx, unstable_size = self.get_unstable_locations( node.lower, node.upper, conv=isinstance(node, BoundConv)) logger.debug(f'Forward refinement for {node}') batch_size = node.lower.shape[0] ret = self.batched_backward( node, C=None, unstable_idx=unstable_idx, batch_size=batch_size) self.restore_sparse_bounds( node, unstable_idx, unstable_size, node.lower, node.upper, new_lower=ret[0], new_upper=ret[1]) unstable_size_after = torch.logical_and(node.lower < 0, node.upper > 0).sum() logger.debug(f' Unstable neurons: {unstable_size_before} -> {unstable_size_after}') # TODO also update linear bounds? def init_forward(self: 'BoundedModule', roots, dim_in): if dim_in == 0: raise ValueError("At least one node should have a specified perturbation") prev_dim_in = 0 # Assumption: roots[0] is the input node which implies batch_size batch_size = roots[0].value.shape[0] for i in range(len(roots)): if hasattr(roots[i], 'perturbation') and roots[i].perturbation is not None: shape = roots[i].linear.lw.shape if self.dynamic: if shape[1] != dim_in: raise NotImplementedError('Dynamic forward bound is not supported yet when there are multiple perturbed inputs.') ptb = roots[i].perturbation if (type(ptb) != PerturbationLpNorm or ptb.norm < np.inf or ptb.x_L is None or ptb.x_U is None): raise NotImplementedError( 'For dynamic forward bounds, only Linf (box) perturbations are supported, and x_L and x_U must be explicitly provided.') roots[i].linear.x_L = ( ptb.x_L_sparse.view(batch_size, -1) if ptb.sparse else ptb.x_L.view(batch_size, -1)) roots[i].linear.x_U = ( ptb.x_U_sparse.view(batch_size, -1) if ptb.sparse else ptb.x_U.view(batch_size, -1)) else: lw = torch.zeros(shape[0], dim_in, *shape[2:]).to(roots[i].linear.lw) lw[:, prev_dim_in:(prev_dim_in+shape[1])] = roots[i].linear.lw if roots[i].linear.lw.data_ptr() == roots[i].linear.uw.data_ptr(): uw = lw else: uw = torch.zeros(shape[0], dim_in, *shape[2:]).to(roots[i].linear.uw) uw[:, prev_dim_in:(prev_dim_in+shape[1])] = roots[i].linear.uw roots[i].linear.lw = lw roots[i].linear.uw = uw if i >= self.num_global_inputs: roots[i].forward_value = roots[i].forward_value.unsqueeze(0).repeat( *([batch_size] + [1] * self.forward_value.ndim)) prev_dim_in += shape[1] else: b = fv = roots[i].forward_value shape = fv.shape if roots[i].from_input: w = torch.zeros(shape[0], dim_in, *shape[1:], device=self.device) warnings.warn(f'Creating a LinearBound with zero weights with shape {w.shape}') else: w = None roots[i].linear = LinearBound(w, b, w, b, b, b) roots[i].lower = roots[i].upper = b ================================================ FILE: auto_LiRPA/interval_bound.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the 
α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### import torch from .bound_ops import * from .utils import logger from typing import TYPE_CHECKING if TYPE_CHECKING: from .bound_general import BoundedModule def IBP_general(self: 'BoundedModule', node=None, C=None, delete_bounds_after_use=False): logger.debug('IBP for %s', node) def _delete_unused_bounds(node_list: List[Bound]): """Delete bounds from input layers after use to save memory. Used when sparse_intermediate_bounds_with_ibp is true.""" if delete_bounds_after_use: for n in node_list: del n.interval n.delete_lower_and_upper_bounds() if self.bound_opts.get('loss_fusion', False): res = self._IBP_loss_fusion(node, C) if res is not None: return res if not node.perturbed: fv = self.get_forward_value(node) node.lower, node.upper = node.interval = (fv, fv) to_be_deleted_bounds = [] if not hasattr(node, 'interval'): for n in node.inputs: if not hasattr(n, 'interval'): # Node n does not have interval bounds; we must compute it. self.IBP_general( n, delete_bounds_after_use=delete_bounds_after_use) to_be_deleted_bounds.append(n) inp = [n_pre.interval for n_pre in node.inputs] if (C is not None and isinstance(node, BoundLinear) and not node.is_input_perturbed(1)): # merge the last BoundLinear node with the specification, available # when weights of this layer are not perturbed ret = node.interval_propagate(*inp, C=C) _delete_unused_bounds(to_be_deleted_bounds) return ret else: node.interval = node.interval_propagate(*inp) node.lower, node.upper = node.interval if isinstance(node.lower, torch.Size): node.lower = torch.tensor(node.lower) if isinstance(node.upper, torch.Size): node.upper = torch.tensor(node.upper) # Handle NaNs in lower and upper bounds if torch.isnan(node.lower).any(): print( f'[Interval Warning] NaN detected in lower bounds of node {node}. ' f'Replacing with -inf.' ) node.lower = torch.where( torch.isnan(node.lower), torch.full_like(node.lower, float('-inf')), node.lower ) if torch.isnan(node.upper).any(): print( f'[Interval Warning] NaN detected in upper bounds of node {node}. ' f'Replacing with +inf.' ) node.upper = torch.where( torch.isnan(node.upper), torch.full_like(node.upper, float('inf')), node.upper ) node.interval = Interval.make_interval(node.lower, node.upper, other=node.interval) if C is not None: _delete_unused_bounds(to_be_deleted_bounds) return BoundLinear.interval_propagate(None, node.interval, C=C) else: _delete_unused_bounds(to_be_deleted_bounds) return node.interval def _IBP_loss_fusion(self: 'BoundedModule', node, C): """Merge BoundLinear, BoundGatherElements and BoundSub. Improvement when loss fusion is used in training. """ # not using loss fusion if not self.bound_opts.get('loss_fusion', False): return None # Currently this function has issues in more complicated networks. 
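# Descriptive note: when the pattern BoundSub(BoundLinear, BoundGatherElements) is matched below, the last linear layer and the gather of the ground-truth logit are fused into a single interval propagation of the margin weights W_j - W_y and biases b_j - b_y, so intermediate bounds on the logits themselves never need to be materialized.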
if self.bound_opts.get('no_ibp_loss_fusion', False): return None if (C is None and isinstance(node, BoundSub) and isinstance(node.inputs[1], BoundGatherElements) and isinstance(node.inputs[0], BoundLinear)): node_gather = node.inputs[1] node_linear = node.inputs[0] node_start = node_linear.inputs[0] w = node_linear.inputs[1].param b = node_linear.inputs[2].param labels = node_gather.inputs[1] if not hasattr(node_start, 'interval'): self.IBP_general(node_start) for n in node_gather.inputs: if not hasattr(n, 'interval'): self.IBP_general(n) if torch.isclose(labels.lower, labels.upper, 1e-8).all(): labels = labels.lower batch_size = labels.shape[0] w = w.expand(batch_size, *w.shape) w = w - torch.gather( w, dim=1, index=labels.unsqueeze(-1).repeat(1, w.shape[1], w.shape[2])) b = b.expand(batch_size, *b.shape) b = b - torch.gather(b, dim=1, index=labels.repeat(1, b.shape[1])) lower, upper = node_start.interval lower, upper = lower.unsqueeze(1), upper.unsqueeze(1) node.lower, node.upper = node_linear.interval_propagate( (lower, upper), (w, w), (b.unsqueeze(1), b.unsqueeze(1))) node.interval = node.lower, node.upper = ( node.lower.squeeze(1), node.upper.squeeze(1)) return node.interval return None def check_IBP_intermediate(self: 'BoundedModule', node): """ Check if we use IBP bounds to compute intermediate bounds on this node. Currently, assume all eligible operators have exactly one input. """ tighten_input_bounds = ( self.bound_opts['optimize_bound_args']['tighten_input_bounds'] ) directly_optimize_layer_names = ( self.bound_opts['optimize_bound_args']['directly_optimize'] ) if isinstance(node, BoundInput) and tighten_input_bounds: return False if node.name in directly_optimize_layer_names: return False if self.ibp_nodes is not None and node.name in self.ibp_nodes: self.IBP_general(node) return True if (isinstance(node, BoundReshape) and node.inputs[0].is_lower_bound_current() and hasattr(node.inputs[1], 'value')): # Node for input value. val_input = node.inputs[0] # Node for input parameter (e.g., shape, permute) arg_input = node.inputs[1] node.lower = node.forward(val_input.lower, arg_input.value) node.upper = node.forward(val_input.upper, arg_input.value) node.interval = (node.lower, node.upper) return True # Use IBP if node.ibp_intermediate == True (for nodes such as ReLU) nodes = [] while (not node.is_lower_bound_current() or not node.is_upper_bound_current()): if not node.ibp_intermediate: return False nodes.append(node) node = node.inputs[0] nodes.reverse() for n in nodes: self.IBP_general(n) return True def check_IBP_first_linear(self: 'BoundedModule', node): """Here we avoid creating a big C matrix in the first linear layer. Disable this optimization when we have beta for intermediate layer bounds. Disable this optimization when we need the A matrix of the first nonlinear layer, forcibly use CROWN to record A matrix. """ tighten_input_bounds = ( self.bound_opts['optimize_bound_args']['tighten_input_bounds'] ) directly_optimize_layer_names = ( self.bound_opts['optimize_bound_args']['directly_optimize'] ) if isinstance(node, BoundInput) and tighten_input_bounds: return False if node.name in directly_optimize_layer_names: return False # This is the list of all intermediate layers where we need to refine. 
if self.intermediate_constr is not None: intermediate_beta_enabled_layers = [ k for v in self.intermediate_constr.values() for k in v] else: intermediate_beta_enabled_layers = [] if (node.name not in self.needed_A_dict.keys() and (type(node) == BoundLinear or type(node) == BoundConv and node.name not in intermediate_beta_enabled_layers)): if type(node.inputs[0]) == BoundInput: node.lower, node.upper = self.IBP_general(node) return True return False def compare_with_IBP(self, node, lower, upper, C=None): """Re-compute the bounds by IBP given the existing intermediate bounds. Update the bounds if IBP gives tighter bounds.""" lower_ibp, upper_ibp = self.IBP_general(node, C=C, delete_bounds_after_use=True) if lower is not None: lower = torch.max(lower, lower_ibp) if upper is not None: upper = torch.min(upper, upper_ibp) return lower, upper ================================================ FILE: auto_LiRPA/jacobian.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """Handle Jacobian bounds.""" import torch from auto_LiRPA.bound_ops import JacobianOP, GradNorm # pylint: disable=unused-import from auto_LiRPA.bound_ops import ( BoundInput, BoundAdd, BoundRelu, BoundJacobianInit, BoundJacobianOP) from auto_LiRPA.utils import logger, prod from collections import deque from typing import TYPE_CHECKING if TYPE_CHECKING: from .bound_general import BoundedModule def _expand_jacobian(self): self.jacobian_start_nodes = [] for node in list(self.nodes()): if isinstance(node, BoundJacobianOP): self.jacobian_start_nodes.append(node.inputs[0]) expand_jacobian_node(self, node) if self.jacobian_start_nodes: # Disable unstable options self.bound_opts.update({ 'sparse_intermediate_bounds': False, 'sparse_conv_intermediate_bounds': False, 'sparse_intermediate_bounds_with_ibp': False, 'sparse_features_alpha': False, 'sparse_spec_alpha': False, }) # Optimize new nodes if possible self._optimize_graph() for node in self.nodes(): if isinstance(node, BoundRelu): node.use_sparse_spec_alpha = node.use_sparse_features_alpha = False # If Jacobian nodes are added, we need to redo the forward pass to update the # properties of newly added nodes (e.g., output shape, forward value, etc.) self.forward(*self.global_input) def expand_jacobian_node(self, jacobian_node): logger.info(f'Expanding Jacobian node {jacobian_node}') output_node = jacobian_node.inputs[0] input_node = jacobian_node.inputs[1] batch_size = output_node.output_shape[0] output_dim = prod(output_node.output_shape[1:]) # Gradient values in `grad` may not be accurate. We do not consider gradient # accumulation from multiple succeeding nodes. We only want the shapes but # not the accurate values. 
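# Descriptive note on the construction below: the first BFS pass walks the original graph from output_node toward input_node, counting each node's in-degree and creating (but not yet wiring) gradient operators via build_gradient_node; the second pass re-traverses the graph, renames and inserts the gradient subgraphs into the BoundedModule, and accumulates multiple incoming gradients with BoundAdd nodes.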
grad = {} # Dummy values in grad_start grad_start = torch.ones(batch_size, output_dim, *output_node.output_shape[1:], device=self.device) grad[output_node.name] = grad_start input_node_found = False # First BFS pass: traverse the graph, count degrees, and build gradient # layers. # Degrees of nodes. degree = {} # Original layer for gradient computation. node_grad_ori = {} degree[output_node.name] = 0 queue = deque([output_node]) while len(queue) > 0: node = queue.popleft() if node == input_node: input_node_found = True continue elif node.no_jacobian or not node.from_input: continue else: node_grad_ori[node.name] = node.build_gradient_node(grad[node.name]) node_grad_ori[node.name] += [None] * ( len(node.inputs) - len(node_grad_ori[node.name])) logger.debug(f'Building gradient node for {node}') if not isinstance(node, BoundInput): for i in range(len(node.inputs)): if node_grad_ori[node.name][i] is None: continue grad[node.inputs[i].name] = node_grad_ori[ node.name][i][0](*node_grad_ori[node.name][i][1]) if not node.inputs[i].name in degree: degree[node.inputs[i].name] = 0 queue.append(node.inputs[i]) degree[node.inputs[i].name] += 1 if not input_node_found: raise RuntimeError('Input node not found') # Second BFS pass: build the backward computational graph grad_node = {} initial_name = f'/jacobian{output_node.name}{output_node.name}' grad_node[output_node.name] = BoundJacobianInit(inputs=[output_node]) grad_node[output_node.name].name = initial_name self.add_nodes([grad_node[output_node.name]]) queue = deque([output_node]) while len(queue) > 0: node = queue.popleft() if node == input_node: self.replace_node(jacobian_node, grad_node[node.name]) continue if node.no_jacobian or not node.from_input: continue logger.debug(f'Converting gradient node for {node}') for k in range(len(node.inputs)): if node_grad_ori[node.name][k] is None: continue nodes_op, nodes_in, nodes_out, _ = self._convert_nodes( node_grad_ori[node.name][k][0], tuple(item.detach() for item in node_grad_ori[node.name][k][1])) rename_dict = {} assert isinstance(nodes_in[0], BoundInput) rename_dict[nodes_in[0].name] = grad_node[node.name].name for i in range(1, len(nodes_in)): # Assume it's a parameter here new_name = f'/jacobian{output_node.name}{node.name}/{k}/params{nodes_in[i].name}' rename_dict[nodes_in[i].name] = new_name for i in range(len(nodes_op)): # intermediate nodes if not nodes_op[i].name in rename_dict: new_name = f'/jacobian{output_node.name}{node.name}/{k}/tmp{nodes_op[i].name}' rename_dict[nodes_op[i].name] = new_name assert len(nodes_out) == 1 nodes_out = nodes_out[0] rename_dict[nodes_out.name] = f'/jacobian{output_node.name}{node.name}/{k}/output' self.rename_nodes(nodes_op, nodes_in, rename_dict) input_nodes_replace = ( [self._modules[nodes_in[0].name]] + node_grad_ori[node.name][k][2]) for i in range(len(input_nodes_replace)): for n in nodes_op: for j in range(len(n.inputs)): if n.inputs[j].name == nodes_in[i].name: n.inputs[j] = input_nodes_replace[i] self.add_nodes(nodes_op + nodes_in[len(input_nodes_replace):]) if node.inputs[k].name in grad_node: node_cur = grad_node[node.inputs[k].name] node_add = BoundAdd( attr=None, inputs=[node_cur, nodes_out], output_index=0, options={}) node_add.name = f'{nodes_out.name}/add' grad_node[node.inputs[k].name] = node_add self.add_nodes([node_add]) else: grad_node[node.inputs[k].name] = nodes_out degree[node.inputs[k].name] -= 1 if degree[node.inputs[k].name] == 0: queue.append(node.inputs[k]) def compute_jacobian_bounds(self: 'BoundedModule', x, optimize=True, 
optimize_output_node=None, bound_lower=True, bound_upper=True): """Compute jacobian bounds on the pre-augmented graph (new API).""" if isinstance(x, torch.Tensor): x = (x,) if optimize: if optimize_output_node is None: if len(self.jacobian_start_nodes) == 1: optimize_output_node = self.jacobian_start_nodes[0] else: raise NotImplementedError( 'Multiple Jacobian nodes found.' 'An output node for optimizable bounds (optimize_output_node) ' 'must be specified explicitly') self.compute_bounds( method='CROWN-Optimized', C=None, x=x, bound_upper=False, final_node_name=optimize_output_node.name) intermediate_bounds = {} for node in self._modules.values(): if node.is_lower_bound_current(): intermediate_bounds[node.name] = (node.lower, node.upper) else: intermediate_bounds = None lb, ub = self.compute_bounds( method='CROWN', x=x, bound_lower=bound_lower, bound_upper=bound_upper, interm_bounds=intermediate_bounds) return lb, ub ================================================ FILE: auto_LiRPA/linear_bound.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### class LinearBound: def __init__( self, lw=None, lb=None, uw=None, ub=None, lower=None, upper=None, from_input=None, x_L=None, x_U=None, offset=0, tot_dim=None): self.lw = lw self.lb = lb self.uw = uw self.ub = ub self.lower = lower self.upper = upper self.from_input = from_input self.x_L = x_L self.x_U = x_U # Offset for input variables. Used for batched forward bound # propagation. self.offset = offset if tot_dim is not None: self.tot_dim = tot_dim elif lw is not None: self.tot_dim = lw.shape[1] else: self.tot_dim = 0 def is_single_bound(self): """Check whether the linear lower bound and the linear upper bound are the same.""" if (self.lw is not None and self.uw is not None and self.lb is not None and self.ub is not None): return (self.lw.data_ptr() == self.uw.data_ptr() and self.lb.data_ptr() == self.ub.data_ptr() and self.x_L is not None and self.x_U is not None) else: return True ================================================ FILE: auto_LiRPA/operators/__init__.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### from .base import * from .linear import * from .convolution import * from .pooling import * from .activation_base import * from .activations import * from .s_shaped import * from .relu import * from .bivariate import * from .add_sub import * from .normalization import * from .shape import * from .reduce import * from .rnn import * from .softmax import * from .constant import * from .leaf import * from .logical import * from .dropout import * from .dtype import * from .trigonometric import * from .cut_ops import * from .solver_utils import grb from .resize import * from .jacobian import * from .indexing import * from .slice_concat import * from .reshape import * from .minmax import * from .convex_concave import * from .gelu import * from .tile import * ================================================ FILE: auto_LiRPA/operators/activation_base.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """ Activation operators or other unary nonlinear operators""" import torch from torch import Tensor from collections import OrderedDict from .base import * from .clampmult import multiply_by_A_signs torch._C._jit_set_profiling_executor(False) torch._C._jit_set_profiling_mode(False) class BoundActivation(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.requires_input_bounds = [0] self.use_default_ibp = True self.splittable = False # "core" region of input where precomputation can be done self.range_l = -10 self.range_u = 10 def _init_masks(self, x): self.mask_pos = x.lower >= 0 self.mask_neg = x.upper <= 0 self.mask_both = torch.logical_not(torch.logical_or(self.mask_pos, self.mask_neg)) def init_linear_relaxation(self, x): self._init_masks(x) self.lw = torch.zeros_like(x.lower) self.lb = self.lw.clone() self.uw = self.lw.clone() self.ub = self.lw.clone() def add_linear_relaxation(self, mask, type, k, x0, y0=None): if y0 is None: y0 = self.forward(x0) if type == 'lower': w_out, b_out = self.lw, self.lb else: w_out, b_out = self.uw, self.ub if mask is None: if isinstance(k, Tensor) and k.ndim > 0: w_out[:] = k else: w_out.fill_(k) else: w_out[..., mask] = (k[..., mask].to(w_out) if isinstance(k, Tensor) else k) if (not isinstance(x0, Tensor) and x0 == 0 and not isinstance(y0, Tensor) and y0 == 0): pass else: b = -x0 * k + y0 if mask is None: if b.ndim > 0: b_out[:] = b else: b_out.fill_(b) else: b_out[..., mask] = b[..., mask] def bound_relax(self, x, init=False): return not_implemented_op(self, 'bound_relax') def bound_backward(self, last_lA, last_uA, x, reduce_bias=True, **kwargs): self.bound_relax(x, init=True) def _bound_oneside(last_A, sign=-1): if last_A is None: return None, 0 if sign == -1: w_pos, b_pos, w_neg, b_neg = ( self.lw.unsqueeze(0), self.lb.unsqueeze(0), 
self.uw.unsqueeze(0), self.ub.unsqueeze(0)) else: w_pos, b_pos, w_neg, b_neg = ( self.uw.unsqueeze(0), self.ub.unsqueeze(0), self.lw.unsqueeze(0), self.lb.unsqueeze(0)) w_pos = maybe_unfold_patches(w_pos, last_A) w_neg = maybe_unfold_patches(w_neg, last_A) b_pos = maybe_unfold_patches(b_pos, last_A) b_neg = maybe_unfold_patches(b_neg, last_A) if self.batch_dim == 0: _A, _bias = multiply_by_A_signs( last_A, w_pos, w_neg, b_pos, b_neg, reduce_bias=reduce_bias) elif self.batch_dim == -1: # FIXME: why this is different from above? assert reduce_bias mask = torch.gt(last_A, 0.).to(torch.float) _A = last_A * (mask * w_pos.unsqueeze(1) + (1 - mask) * w_neg.unsqueeze(1)) _bias = last_A * (mask * b_pos.unsqueeze(1) + (1 - mask) * b_neg.unsqueeze(1)) if _bias.ndim > 2: _bias = torch.sum(_bias, dim=list(range(2, _bias.ndim))) else: raise NotImplementedError return _A, _bias lA, lbias = _bound_oneside(last_lA, sign=-1) uA, ubias = _bound_oneside(last_uA, sign=+1) return [(lA, uA)], lbias, ubias @staticmethod @torch.jit.script def bound_forward_w( relax_lw: Tensor, relax_uw: Tensor, x_lw: Tensor, x_uw: Tensor, dim: int): lw = (relax_lw.unsqueeze(dim).clamp(min=0) * x_lw + relax_lw.unsqueeze(dim).clamp(max=0) * x_uw) uw = (relax_uw.unsqueeze(dim).clamp(max=0) * x_lw + relax_uw.unsqueeze(dim).clamp(min=0) * x_uw) return lw, uw @staticmethod @torch.jit.script def bound_forward_b( relax_lw: Tensor, relax_uw: Tensor, relax_lb: Tensor, relax_ub: Tensor, x_lb: Tensor, x_ub: Tensor): lb = relax_lw.clamp(min=0) * x_lb + relax_lw.clamp(max=0) * x_ub + relax_lb ub = relax_uw.clamp(max=0) * x_lb + relax_uw.clamp(min=0) * x_ub + relax_ub return lb, ub def bound_forward(self, dim_in, x): self.bound_relax(x, init=True) assert (x.lw is None) == (x.uw is None) dim = 1 if self.lw.ndim > 0 else 0 if x.lw is not None: lw, uw = BoundActivation.bound_forward_w( self.lw, self.uw, x.lw, x.uw, dim) else: lw = uw = None lb, ub = BoundActivation.bound_forward_b( self.lw, self.uw, self.lb, self.ub, x.lb, x.ub) return LinearBound(lw, lb, uw, ub) def interval_propagate(self, *v): h_L, h_U = v[0][0], v[0][1] return self.forward(h_L), self.forward(h_U) def get_split_mask(self, lower, upper, input_index): """Return a mask to indicate if each neuron potentially needs a split. 0: Stable (linear) neuron; 1: unstable (nonlinear) neuron. """ return torch.ones_like(lower, dtype=torch.bool) # Return heuristic to select which neuron should use constraints_solving concretization def compute_bound_improvement_heuristics(self, lower, upper): """Return a heuristic score for each lower-upper bound pair. It indicates the possible bound improvement for each neuron. We will then choose if a neuron's bound needs further tightened based on the heuristic. """ return (-lower * upper).clamp(min=0) def get_unstable_mask(self, lower, upper): """Return a mask to indicate if each neuron is unstable. Here we mark all the neurons as stable by default. 0: Stable (linear) neuron; 1: unstable (nonlinear) neuron. """ return torch.ones_like(lower, dtype=torch.bool) class BoundOptimizableActivation(BoundActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) if 'optimize_bound_args' not in self.options: self.options['optimize_bound_args'] = {} self.optimizable = True # Stages: # * `init`: initializing parameters # * `opt`: optimizing parameters # * `reuse`: not optimizing parameters but reuse saved values # If `None`, it means activation optimization is currently not used. 
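        # A rough sketch of the intended call order (the actual driver code lives
        # outside this class; see the opt_* methods defined below):
        #   node.opt_init()                         # enter the 'init' stage
        #   node.init_opt_parameters(start_nodes)   # create self.alpha
        #   node.opt_start()                        # 'opt' stage: bound_backward() uses self.alpha
        #   ...gradient steps on alpha, calling node.clip_alpha() after each step...
        #   node.opt_end()                          # back to None once optimization finishes
        #   node.opt_reuse() / node.opt_no_reuse()  # optionally reuse the optimized alpha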
self.opt_stage = None self.alpha = OrderedDict() # Save patch sizes during bound_backward() for each output_node. self.patch_size = {} # A torch.bool mask of shape Tensor([batch_size]) that conditions the # sample of alpha and beta to update # If set to None, update all samples # If not None, select those corresponding to 1 to update def opt_init(self): """Enter the stage for initializing bound optimization. Optimized bounds are not used in this stage.""" self.opt_stage = 'init' def opt_start(self): """Start optimizing bounds.""" self.opt_stage = 'opt' def opt_reuse(self): """ Reuse optimizing bounds """ self.opt_stage = 'reuse' def opt_no_reuse(self): """ Finish reusing optimized bounds """ if self.opt_stage == 'reuse': self.opt_stage = None def opt_end(self): """ End optimizing bounds """ self.opt_stage = None def clip_alpha(self): pass def init_opt_parameters(self, start_nodes): """ start_nodes: a list of starting nodes [(node, size)] during CROWN backward bound propagation""" self.alpha = OrderedDict() for start_node in start_nodes: ns, size_s = start_node[:2] # TODO do not give torch.Size if isinstance(size_s, (torch.Size, list, tuple)): size_s = prod(size_s) self.alpha[ns] = self._init_opt_parameters_impl(size_s, name_start=ns) def _init_opt_parameters_impl(self, size_spec, name_start=None): """Implementation of init_opt_parameters for each start_node.""" raise NotImplementedError def init_linear_relaxation(self, x, dim_opt=None): self._init_masks(x) # The first dimension of size 2 is used for lA and uA respectively, # when computing intermediate bounds. if self.opt_stage in ['opt', 'reuse'] and dim_opt is not None: # For optimized bounds, we have independent lw for each output # dimension for bound optimization. # If the output layer is a fully connected layer, len(dim_opt) = 1. # If the output layer is a conv layer, len(dim_opt) = 3 but we only # use the out_c dimension to create slopes/bias. # Variables are shared among out_h, out_w dimensions so far. if isinstance(dim_opt, int): dim = dim_opt elif isinstance(dim_opt, torch.Size): dim = prod(dim_opt) else: dim = dim_opt[0] self.lw = torch.zeros(2, dim, *x.lower.shape).to(x.lower) else: # Without optimized bounds, the lw, lb (slope, biase) etc only # depend on intermediate layer bounds, # and are shared among different output dimensions. self.lw = torch.zeros_like(x.lower) self.lb = self.lw.clone() self.uw = self.lw.clone() self.ub = self.lw.clone() def bound_relax(self, x, init=False, dim_opt=None): return not_implemented_op(self, 'bound_relax') def bound_backward(self, last_lA, last_uA, x, start_node=None, start_shape=None, reduce_bias=True, **kwargs): self._start = start_node.name if self.opt_stage not in ['opt', 'reuse']: last_A = last_lA if last_lA is not None else last_uA # Returned [(lA, uA)], lbias, ubias As, lbias, ubias = super().bound_backward( last_lA, last_uA, x, reduce_bias=reduce_bias) if isinstance(last_A, Patches): A_prod = As[0][1].patches if As[0][0] is None else As[0][1].patches # FIXME: Unify this function with BoundReLU # Save the patch size, which will be used in init_slope() to # determine the number of optimizable parameters. if start_node is not None: if last_A.unstable_idx is not None: # Sparse patches, we need to construct the full patch size: # (out_c, batch, out_h, out_w, c, h, w). self.patch_size[start_node.name] = [ last_A.output_shape[1], A_prod.size(1), last_A.output_shape[2], last_A.output_shape[3], A_prod.size(-3), A_prod.size(-2), A_prod.size(-1)] else: # Regular patches. 
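                        # i.e. the full patch shape, matching the
                        # (out_c, batch, out_h, out_w, c, h, w) layout assembled
                        # explicitly for the sparse case above.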
self.patch_size[start_node.name] = A_prod.size() return As, lbias, ubias assert self.batch_dim == 0 self.bound_relax(x, init=True, dim_opt=start_shape) def _bound_oneside(last_A, sign=-1): if last_A is None: return None, 0 if sign == -1: w_pos, b_pos, w_neg, b_neg = self.lw[0], self.lb[0], self.uw[0], self.ub[0] else: w_pos, b_pos, w_neg, b_neg = self.uw[1], self.ub[1], self.lw[1], self.lb[1] w_pos = maybe_unfold_patches(w_pos, last_A) w_neg = maybe_unfold_patches(w_neg, last_A) b_pos = maybe_unfold_patches(b_pos, last_A) b_neg = maybe_unfold_patches(b_neg, last_A) unstable_idx = kwargs.get('unstable_idx', None) if unstable_idx is not None: assert isinstance(unstable_idx, Tensor) and unstable_idx.ndim == 1 # Shape is (spec, batch, neurons). # FIXME: Sigmoid and other activation functions should also support # sparse-spec alpha, so alpha will be created with a smaller shape. w_pos = self.non_deter_index_select(w_pos, index=unstable_idx, dim=0) w_neg = self.non_deter_index_select(w_neg, index=unstable_idx, dim=0) b_pos = self.non_deter_index_select(b_pos, index=unstable_idx, dim=0) b_neg = self.non_deter_index_select(b_neg, index=unstable_idx, dim=0) A_prod, _bias = multiply_by_A_signs( last_A, w_pos, w_neg, b_pos, b_neg, reduce_bias) return A_prod, _bias lA, lbias = _bound_oneside(last_lA, sign=-1) uA, ubias = _bound_oneside(last_uA, sign=+1) return [(lA, uA)], lbias, ubias def _no_bound_parameters(self): raise AttributeError('Bound parameters have not been initialized.' 'Please call `compute_bounds` with `method=CROWN-optimized`' ' at least once.') def _transfer_alpha(self, alpha, device=None, dtype=None, non_blocking=False, require_grad=False): alpha = {spec_name: transfer(alpha_value, device=device, dtype=dtype, non_blocking=non_blocking).detach().requires_grad_(require_grad) for spec_name, alpha_value in alpha.items()} return alpha def dump_alpha(self, device=None, dtype=None, non_blocking=False): """ Dump alpha parameters to a dictionary. """ return {'alpha': self._transfer_alpha(self.alpha, device=device, dtype=dtype, non_blocking=non_blocking, require_grad=False)} def restore_alpha(self, alpha, device=None, dtype=None, non_blocking=False): """ Restore alpha parameters from a dictionary. """ self.alpha = self._transfer_alpha(alpha['alpha'], device=device, dtype=dtype, non_blocking=non_blocking, require_grad=True) def drop_unused_alpha(self, keep_nodes): """ Drop unused alpha parameters based on the keep_nodes. This function is not used in auto_LiRPA for now, but is used in alpha-beta-CROWN. """ for spec_name in list(self.alpha.keys()): if spec_name not in keep_nodes: del self.alpha[spec_name] ================================================ FILE: auto_LiRPA/operators/activations.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### """ Activation operators or other unary nonlinear operators, not including those placed in separate files.""" import torch from torch.nn import Module from .base import * from .activation_base import BoundActivation, BoundOptimizableActivation from .clampmult import multiply_by_A_signs torch._C._jit_set_profiling_executor(False) torch._C._jit_set_profiling_mode(False) class BoundSoftplus(BoundActivation): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.softplus = nn.Softplus() def forward(self, x): return self.softplus(x) class BoundAbs(BoundActivation): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.ibp_intermediate = True def forward(self, x): return x.abs() def bound_relax(self, x, init=False): if init: self.init_linear_relaxation(x) x_L = x.lower.clamp(max=0) x_U = torch.max(x.upper.clamp(min=0), x_L + 1e-8) # upper_k: connect (x_L, |x_L|) and (x_U, |x_U|) upper_k = (x_U.abs() - x_L.abs()) / (x_U - x_L) # lower_k: choose between -1 and 1 depending on which is closer to zero lower_k = (x_U > -x_L).to(x_L) * 2 - 1 self.add_linear_relaxation(mask=None, type='upper', k=upper_k, x0=x_L) self.add_linear_relaxation(mask=None, type='lower', k=lower_k, x0=0, y0=0) def bound_backward(self, last_lA, last_uA, x, **kwargs): x_L = x.lower.clamp(max=0) x_U = torch.max(x.upper.clamp(min=0), x_L + 1e-8) mask_neg = x_U <= 0 mask_pos = x_L >= 0 y_L = x_L.abs() y_U = x_U.abs() upper_k = (y_U - y_L) / (x_U - x_L) upper_b = y_L - upper_k * x_L # TODO: Here for the "mask_both" case lower_k = 0, but not sure if it's optimal. # lower_b should just be 0? lower_k = (mask_neg * (-1.0) + mask_pos * 1.0) lower_b = (mask_neg + mask_pos) * (y_L - lower_k * x_L) if last_uA is not None: # Special case if we only want the upper bound with non-negative coefficients if last_uA.min() >= 0: uA = last_uA * upper_k ubias = self.get_bias(last_uA, upper_b) else: last_uA_pos = last_uA.clamp(min=0) last_uA_neg = last_uA.clamp(max=0) uA = last_uA_pos * upper_k + last_uA_neg * lower_k ubias = (self.get_bias(last_uA_pos, upper_b) + self.get_bias(last_uA_neg, lower_b)) else: uA, ubias = None, 0 if last_lA is not None: if last_lA.max() <= 0: lA = last_lA * upper_k lbias = self.get_bias(last_lA, upper_b) else: last_lA_pos = last_lA.clamp(min=0) last_lA_neg = last_lA.clamp(max=0) lA = last_lA_pos * lower_k + last_lA_neg * upper_k lbias = (self.get_bias(last_lA_pos, lower_b) + self.get_bias(last_lA_neg, upper_b)) else: lA, lbias = None, 0 return [(lA, uA)], lbias, ubias def interval_propagate(self, *v): h_L, h_U = v[0][0], v[0][1] lower = ((h_U < 0) * h_U.abs() + (h_L > 0) * h_L.abs()) upper = torch.max(h_L.abs(), h_U.abs()) return lower, upper class BoundATenHeaviside(BoundOptimizableActivation): def forward(self, *x): self.input_shape = x[0].shape # x[0]: input; x[1]: value when x == 0 return torch.heaviside(x[0], x[1]) def interval_propagate(self, *v): assert not self.is_input_perturbed(1) return self.forward(v[0][0], v[1][0]), self.forward(v[0][1], v[1][0]) def _init_opt_parameters_impl(self, size_spec, name_start): """Implementation of init_opt_parameters for each start_node.""" l = self.inputs[0].lower return torch.zeros_like(l).unsqueeze(0).repeat(2, *[1] * l.ndim) def clip_alpha(self): for v in self.alpha.values(): v.data = torch.clamp(v.data, 0., 1.) 
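    # Sketch of the linear relaxation used in bound_backward() below, for an
    # unstable neuron with pre-activation bounds lb < 0 < ub (alpha in [0, 1];
    # the code additionally clamps lb/ub away from zero for numerical stability):
    #   upper line:  y = alpha_u / (-lb) * x + 1   (passes through (0, 1))
    #   lower line:  y = alpha_l / ub * x          (passes through (0, 0))
    # alpha = 0 recovers the trivial constant bounds 0 <= H(x) <= 1, while
    # alpha = 1 makes the lines meet the step function at x = lb and x = ub.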
def bound_backward(self, last_lA, last_uA, *x, start_node=None, start_shape=None, **kwargs): x = x[0] if x is not None: lb_r = x.lower ub_r = x.upper else: lb_r = self.lower ub_r = self.upper if self.opt_stage not in ['opt', 'reuse']: # zero slope: upper_d = torch.zeros_like(lb_r, device=lb_r.device, dtype=lb_r.dtype) lower_d = torch.zeros_like(ub_r, device=ub_r.device, dtype=ub_r.dtype) else: upper_d = self.alpha[start_node.name][0].clamp(0, 1) * (1. / (-lb_r).clamp(min=1e-3)) lower_d = self.alpha[start_node.name][1].clamp(0, 1) * (1. / (ub_r.clamp(min=1e-3))) upper_b = torch.ones_like(lb_r, device=lb_r.device, dtype=lb_r.dtype) lower_b = torch.zeros_like(lb_r, device=lb_r.device, dtype=lb_r.dtype) # For stable neurons, set fixed slope and bias. ub_mask = (ub_r <= 0).to(dtype=ub_r.dtype) lb_mask = (lb_r >= 0).to(dtype=lb_r.dtype) upper_b = upper_b - upper_b * ub_mask lower_b = lower_b * (1. - lb_mask) + lb_mask upper_d = upper_d - upper_d * ub_mask - upper_d * lb_mask lower_d = lower_d - lower_d * lb_mask - lower_d * ub_mask upper_d = upper_d.unsqueeze(0) lower_d = lower_d.unsqueeze(0) # Choose upper or lower bounds based on the sign of last_A uA = lA = None ubias = lbias = 0 if last_uA is not None: neg_uA = last_uA.clamp(max=0) pos_uA = last_uA.clamp(min=0) uA = upper_d * pos_uA + lower_d * neg_uA ubias = (pos_uA * upper_b + neg_uA * lower_b).flatten(2).sum(-1) if last_lA is not None: neg_lA = last_lA.clamp(max=0) pos_lA = last_lA.clamp(min=0) lA = upper_d * neg_lA + lower_d * pos_lA lbias = (pos_lA * lower_b + neg_lA * upper_b).flatten(2).sum(-1) return [(lA, uA), (None, None)], lbias, ubias class BoundSqr(BoundOptimizableActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.splittable = True def forward(self, x): return x ** 2 def bound_relax(self, x, init=False, dim_opt=None): if init: self.init_linear_relaxation(x, dim_opt) upper_k = x.lower + x.upper # Upper bound: connect the two points (x_l, x_l^2) and (x_u, x_u^2). # The upper bound should always be better than IBP. self.add_linear_relaxation( mask=None, type='upper', k=upper_k, x0=x.lower) if self.opt_stage in ['opt', 'reuse']: mid = self.alpha[self._start] else: # Lower bound is a z=0 line if x_l and x_u have different signs. # Otherwise, the lower bound is a tangent line at x_l. # The lower bound should always be better than IBP. # If both x_l and x_u < 0, select x_u. If both > 0, select x_l. # If x_l < 0 and x_u > 0, we use the z=0 line as the lower bound. 
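            # For example (illustrative values):
            #   x_l = -1, x_u = 2   ->  mid = 0    (the z = 0 line)
            #   x_l = 1,  x_u = 3   ->  mid = 1    (tangent at x_l)
            #   x_l = -3, x_u = -1  ->  mid = -1   (tangent at x_u)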
mid = F.relu(x.lower) - F.relu(-x.upper) self.add_linear_relaxation(mask=None, type='lower', k=2 * mid, x0=mid) def _init_opt_parameters_impl(self, size_spec, **kwargs): """Implementation of init_opt_parameters for each start_node.""" l, u = self.inputs[0].lower, self.inputs[0].upper alpha = torch.empty(2, size_spec, *l.shape, device=l.device) alpha.data[:2] = F.relu(l) - F.relu(-u) return alpha def interval_propagate(self, *v): h_L, h_U = v[0][0], v[0][1] lower = ((h_U < 0) * (h_U ** 2) + (h_L > 0) * (h_L ** 2)) upper = torch.max(h_L ** 2, h_U ** 2) return lower, upper def build_gradient_node(self, grad_upstream): return [(SqrGrad(), (grad_upstream, self.inputs[0].forward_value), [self.inputs[0]])] class SqrGrad(Module): def forward(self, grad_last, preact): # (x^2)' = 2*x return grad_last * 2 * preact.unsqueeze(1) class BoundHardTanh(BoundActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.splittable = True self.activation_name = "HardTanh" self.patch_size = {} self.hardtanh_options = options.get('hardtanh', 'same-slope') def forward(self, x, min_val, max_val): return F.hardtanh(x, min_val, max_val) def bound_backward(self, last_lA, last_uA, x, min_val, max_val, start_node=None, unstable_idx=None, reduce_bias=True, **kwargs): if self.is_input_perturbed(1) or self.is_input_perturbed( 2): # Checking if min_value and max_value are not perturbed raise NotImplementedError( f'{self.activation_name} is unsupported with perturbed min_val and max_val') self.bound_relax(x, min_val, max_val, init=True) def _bound_oneside(last_A, sign=-1): if last_A is None: return None, 0 if sign == -1: w_pos, b_pos, w_neg, b_neg = ( self.lw.unsqueeze(0), self.lb.unsqueeze(0), self.uw.unsqueeze(0), self.ub.unsqueeze(0)) else: w_pos, b_pos, w_neg, b_neg = ( self.uw.unsqueeze(0), self.ub.unsqueeze(0), self.lw.unsqueeze(0), self.lb.unsqueeze(0)) w_pos = maybe_unfold_patches(w_pos, last_A) w_neg = maybe_unfold_patches(w_neg, last_A) b_pos = maybe_unfold_patches(b_pos, last_A) b_neg = maybe_unfold_patches(b_neg, last_A) # Shapes of w_pos, w_neg, b_pos, b_neg # For toy.py - Final Shape - torch.Size([1, 1, 2]) torch.Size([1, 1, 2]) torch.Size([1, 1, 2]) torch.Size([1, 1, 2]) # For simple_verification.py - Final Shape - torch.Size([1, 2, 16, 14, 14]) torch.Size([1, 2, 16, 14, 14]) torch.Size([1, 2, 16, 14, 14]) torch.Size([1, 2, 16, 14, 14]) # For all tensors having batch as the first dimension (batch,.....) _A, _bias = multiply_by_A_signs( last_A, w_pos, w_neg, b_pos, b_neg) return _A, _bias lA, lbias = _bound_oneside(last_lA, sign=-1) uA, ubias = _bound_oneside(last_uA, sign=+1) return [(lA, uA), (None, None), (None, None)], lbias, ubias def bound_relax(self, x, min_val, max_val, init=False, dim_opt=None): epsilon = 1e-8 preact_lb = x.lower.clamp(max=max_val.value) preact_ub = torch.max(x.upper.clamp(min=min_val.value), preact_lb + epsilon) min_val = min_val.value max_val = max_val.value uw = torch.zeros_like(preact_ub) ub = torch.zeros_like(preact_ub) lw = torch.zeros_like(preact_lb) lb = torch.zeros_like(preact_lb) # Case 1: # When upper bound is smaller than min value, # the activated value will always be min value, # so the upper bound and lower bound are both # min value. 
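        # (For instance, with the common HardTanh defaults min_val = -1 and
        # max_val = 1, Case 1 covers neurons whose pre-activation upper bound is
        # at most -1, so the output is constantly -1.)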
case1 = (preact_ub <= min_val).to(preact_ub.dtype) # Computing intermediate values only once for Case 1 value = case1 * min_val ub += value lb += value # Case 2: # When lower bound is larger than max value, # the activated value will always be max value, # so the upper bound and lower bound are both # max value. case2 = (preact_lb >= max_val).to(preact_ub.dtype) # Computing intermediate values only once for Case 2 value = case2 * max_val ub += value lb += value # Case 3: # In this case, the activated output for x is always x # so the bias is always zero and slope will also always # be one. case3 = ((preact_lb >= min_val) & (preact_ub <= max_val)).to(preact_ub.dtype) uw += case3 lw += case3 # Case 4: # Upper bound is larger than max val and lower bound is # smaller than min val, in this case, we will use two # line to bound, the upper bound will pass through points # (max_val, max_val) and (lb_r, min_val) and the lower # bound will pass through (min_val, min_val) and (ub_r, max_val). # So, the slope d of the upper line is (max_val - min_val)/(max_val - lb_r) # and the intercept of the upper line is max_val - d * max_val # Similarly, the slope d of the lower line is (max_val - min_val)/(ub_r - min_val) # and the intercept of the lower line is min_val - d * min_val. # Computing intermediate values only once for Case 4 diff = max_val - min_val val1 = max_val - preact_lb + epsilon case4 = ((preact_lb < min_val) & (preact_ub > max_val)).to(preact_ub.dtype) uw += case4 * diff / val1 lw += case4 * diff / (preact_ub - min_val + epsilon) ub = case4 * (max_val - diff / val1 * max_val) lb = case4 * (min_val - diff / (preact_ub - min_val + epsilon) * min_val) # Computing intermediate values only once ( Case 5 & 6 ) denom = preact_ub - preact_lb + epsilon # Case 5: # Lower bound is smaller than the min val and the upper bound # is larger than or equal to the min val and smaller or # equal to max val. In this case, we use a single line that # pass through (lb_r, min_val) and (ub_r, ub_r) as the upper # bound. And for lower bound, we use a line with the same slope # as the upper bound and passes through (min_val, min_val) as # lower bound. # So, the slope d of the upper bound is (ub_r - min_val)/(ub_r - lb_r) # and the intercept of the upper bound is ub_r - d * ub_r. # The slope d of the lower bound is same as upper bound and the # intercept of the lower bound is min_val - d * min_val # Computing intermediate values only once for Case 5 val1 = preact_ub - min_val case5 = ((preact_lb < min_val) & (min_val <= preact_ub) & (preact_ub <= max_val)).to(preact_ub.dtype) uw += case5 * val1 / denom ub += case5 * (preact_ub - val1 / denom * preact_ub) if self.hardtanh_options == "same-slope": lw += case5 * val1 / denom lb += case5 * (min_val - val1 / denom * min_val) elif self.hardtanh_options == "adaptive": cond = (uw > 0.5).to(uw) lw += case5 * cond lb += case5 * min_val * (1 - cond) # Case 6: # Upper bound is larger than the max val and the lower bound # is larger than or equal to the min val and smaller or # equal to max val. In this case, we use a single line that # pass through (ub_r, max_val) and (lb_r, lb_r) as the lower # bound. And for upper bound, we use a line with the same slope # as lower bound which passes through (max_val, max_val) as the # upper bound. # So, the slope d of the lower bound is (max_val - lb_r)/(ub_r - lb_r). # And the intercept of the lower bound is lb_r - d * lb_r. 
# The slope d of the upper bound is (max_val - lb_r)/(ub_r - lb_r), # and the intercept of the upper bound is max_val - d * max_val. # Computing intermediate values only once for Case 6 val1 = max_val - preact_lb case6 = ((min_val <= preact_lb) & (preact_lb <= max_val) & (preact_ub > max_val)).to(preact_ub.dtype) lw += case6 * val1 / denom lb += case6 * (preact_lb - val1 / denom * preact_lb) if self.hardtanh_options == "same-slope": uw += case6 * val1 / denom ub += case6 * (max_val - val1 / denom * max_val) elif self.hardtanh_options == "adaptive": cond = (lw > 0.5).to(lw) uw += case6 * cond ub += (case6 * max_val) * (1 - cond) self.uw = uw self.lw = lw self.ub = ub self.lb = lb def interval_propagate(self, *v): h_L, h_U = v[0][0], v[0][1] min_val = v[1][0] max_val = v[2][0] assert v[1][0] == v[1][1] and v[2][0] == v[2][1] return self.forward(h_L, min_val, max_val), self.forward(h_U, min_val, max_val) class BoundFloor(BoundActivation): def forward(self, x): return torch.floor(x) def bound_relax(self, x, init=False): if init: self.init_linear_relaxation(x) self.lb += torch.floor(x.lower) self.ub += torch.floor(x.upper) class BoundMultiPiecewiseNonlinear(BoundOptimizableActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.splittable = True def forward(self, x, weight, offset): return (F.relu(x.unsqueeze(-1) - offset) * weight).sum(dim=-1) def clip_alpha(self): for v in self.alpha.values(): v.data = torch.clamp(v.data, 0., 1.) def bound_backward(self, last_lA, last_uA, x, weight, offset, reduce_bias=True, start_node=None, **kwargs): assert not self.is_input_perturbed(1) assert not self.is_input_perturbed(2) weight = ( self.inputs[1].forward_value if hasattr(self.inputs[1], 'forward_value') else self.inputs[1].forward() ) offset = ( self.inputs[2].forward_value if hasattr(self.inputs[2], 'forward_value') else self.inputs[2].forward() ) relu_x_lower = (x.lower.unsqueeze(-1) - offset).clamp(max=0) relu_x_upper = (x.upper.unsqueeze(-1) - offset).clamp(min=0) relu_x_upper = torch.max(relu_x_upper, relu_x_lower + 1e-8) relu_upper_k = relu_x_upper / (relu_x_upper - relu_x_lower) relu_upper_b = -relu_x_lower * relu_upper_k if self.opt_stage not in ['opt', 'reuse']: self.init_lower_k = relu_lower_k = (relu_upper_k > 0.5).to(relu_upper_k) relu_lower_k_for_lA = relu_lower_k_for_uA = relu_lower_k.unsqueeze(0) else: relu_lower_k = self.alpha[start_node.name] relu_lower_k_for_lA = relu_lower_k[0] relu_lower_k_for_uA = relu_lower_k[1] relu_lower_b = torch.zeros_like(relu_upper_b) relu_lower_b = relu_lower_b.unsqueeze(0) relu_upper_k = relu_upper_k.unsqueeze(0) relu_upper_b = relu_upper_b.unsqueeze(0) def _bound_oneside(last_A, pos_k, pos_b, neg_k, neg_b, weight, offset, reduce_bias): if last_A is None: return None, 0 last_A = last_A.unsqueeze(-1) * weight A_pos = last_A.clamp(min=0) A_neg = last_A.clamp(max=0) A = A_pos * pos_k + A_neg * neg_k b = -A * offset + A_pos * pos_b + A_neg * neg_b A = A.sum(dim=-1) if reduce_bias: b = b.sum(dim=[-1, -2]) else: b = b.sum(dim=-1) return A, b lA, lb = _bound_oneside(last_lA, relu_lower_k_for_lA, relu_lower_b, relu_upper_k, relu_upper_b, weight, offset, reduce_bias) uA, ub = _bound_oneside(last_uA, relu_upper_k, relu_upper_b, relu_lower_k_for_uA, relu_lower_b, weight, offset, reduce_bias) return [(lA, uA), (None, None), (None, None)], lb, ub def _init_opt_parameters_impl(self, size_spec, **kwargs): alpha = torch.empty(2, size_spec, *self.init_lower_k.shape, 
device=self.init_lower_k.device) alpha.data[:2] = self.init_lower_k return alpha def get_split_mask(self, lower, upper, input_index): offset = ( self.inputs[2].forward_value if hasattr(self.inputs[2], 'forward_value') else self.inputs[2].forward() ) return ((lower.unsqueeze(-1) < offset) & (upper.unsqueeze(-1) > offset)).any(dim=-1) ================================================ FILE: auto_LiRPA/operators/add_sub.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### from torch.nn import Module from .base import * from .constant import BoundConstant from .solver_utils import grb class BoundAdd(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) options = options or {} # FIXME: This is not the right way to enable patches mode. # Instead we must traverse the graph and determine when patches mode needs to be used. self.mode = options.get("conv_mode", "matrix") def forward(self, x, y): self.x_shape = x.shape self.y_shape = y.shape return x + y def bound_backward(self, last_lA, last_uA, x, y, **kwargs): def _bound_oneside(last_A, w): if last_A is None: return None return self.broadcast_backward(last_A, w) uA_x = _bound_oneside(last_uA, x) uA_y = _bound_oneside(last_uA, y) lA_x = _bound_oneside(last_lA, x) lA_y = _bound_oneside(last_lA, y) return [(lA_x, uA_x), (lA_y, uA_y)], 0, 0 def bound_forward(self, dim_in, x, y): lb, ub = x.lb + y.lb, x.ub + y.ub def add_w(x_w, y_w, x_b, y_b): if x_w is None and y_w is None: return None elif x_w is not None and y_w is not None: return x_w + y_w elif y_w is None: return x_w + torch.zeros_like(y_b) else: return y_w + torch.zeros_like(x_b) lw = add_w(x.lw, y.lw, x.lb, y.lb) uw = add_w(x.uw, y.uw, x.ub, y.ub) return LinearBound(lw, lb, uw, ub) def interval_propagate(self, x, y): assert (not isinstance(y, Tensor)) return x[0] + y[0], x[1] + y[1] def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): if isinstance(v[0], Tensor) and isinstance(v[1], Tensor): # constants if both inputs are tensors self.solver_vars = self.forward(v[0], v[1]) return # we have both gurobi vars as inputs this_layer_shape = self.output_shape gvar_array1 = np.array(v[0]) if isinstance(v[1], Tensor): var2 = v[1].cpu().numpy() # flatten to create vars and constrs first gvar_array1 = gvar_array1.reshape(-1) new_layer_gurobi_vars = [] for neuron_idx, var1 in enumerate(gvar_array1): var = model.addVar(lb=-float('inf'), ub=float('inf'), obj=0, vtype=grb.GRB.CONTINUOUS, name=f'lay{self.name}_{neuron_idx}') model.addConstr(var == (var1 + var2), name=f'lay{self.name}_{neuron_idx}_eq') new_layer_gurobi_vars.append(var) else: gvar_array2 = np.array(v[1]) assert gvar_array1.shape == gvar_array2.shape and gvar_array1.shape == this_layer_shape[1:] # flatten to create vars and constrs first gvar_array1 = gvar_array1.reshape(-1) gvar_array2 
= gvar_array2.reshape(-1) new_layer_gurobi_vars = [] for neuron_idx, (var1, var2) in enumerate(zip(gvar_array1, gvar_array2)): var = model.addVar(lb=-float('inf'), ub=float('inf'), obj=0, vtype=grb.GRB.CONTINUOUS, name=f'lay{self.name}_{neuron_idx}') model.addConstr(var == (var1 + var2), name=f'lay{self.name}_{neuron_idx}_eq') new_layer_gurobi_vars.append(var) # reshape to the correct list shape of solver vars self.solver_vars = np.array(new_layer_gurobi_vars).reshape(this_layer_shape[1:]).tolist() model.update() def build_gradient_node(self, grad_upstream): if not self.inputs[0].no_jacobian: grad0_node = AddGrad(self.inputs[0].output_shape if self.inputs[0].batch_dim != -1 else torch.Size((1,) + self.inputs[0].output_shape)) grad0 = (grad0_node, (grad_upstream,), []) else: grad0 = None if not self.inputs[1].no_jacobian: grad1_node = AddGrad(self.inputs[1].output_shape if self.inputs[1].batch_dim != -1 else torch.Size((1,) + self.inputs[1].output_shape)) grad1 = (grad1_node, (grad_upstream,), []) else: grad1 = None return [grad0, grad1] class BoundSub(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) # FIXME: This is not the right way to enable patches mode. Instead we must traverse the graph and determine when patches mode needs to be used. self.mode = options.get("conv_mode", "matrix") def forward(self, x, y): self.x_shape = x.shape self.y_shape = y.shape return x - y def bound_backward(self, last_lA, last_uA, x, y, **kwargs): def _bound_oneside(last_A, w, sign=-1): if last_A is None: return None if isinstance(last_A, torch.Tensor): return self.broadcast_backward(sign * last_A, w) elif isinstance(last_A, Patches): if sign == 1: # Patches shape requires no broadcast. return last_A else: # Multiply by the sign. return last_A.create_similar(sign * last_A.patches) else: raise ValueError(f'Unknown last_A type {type(last_A)}') uA_x = _bound_oneside(last_uA, x, sign=1) uA_y = _bound_oneside(last_uA, y, sign=-1) lA_x = _bound_oneside(last_lA, x, sign=1) lA_y = _bound_oneside(last_lA, y, sign=-1) return [(lA_x, uA_x), (lA_y, uA_y)], 0, 0 def bound_forward(self, dim_in, x, y): lb, ub = x.lb - y.ub, x.ub - y.lb def add_w(x_w, y_w, x_b, y_b): if x_w is None and y_w is None: return None elif x_w is not None and y_w is not None: return x_w + y_w elif y_w is None: return x_w + torch.zeros_like(y_b) else: return y_w + torch.zeros_like(x_b) # Some nodes such as BoundConstant does not have uw and lw. 
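        # For z = x - y, a linear lower bound on z pairs the lower bound of x with
        # the negated upper bound of y, and the upper bound of z pairs x's upper
        # bound with -y's lower bound; hence -y.uw is used for lw and -y.lw for uw
        # below.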
lw = add_w(x.lw, -y.uw if y.uw is not None else None, x.lb, y.lb) uw = add_w(x.uw, -y.lw if y.lw is not None else None, x.ub, y.ub) return LinearBound(lw, lb, uw, ub) def interval_propagate(self, x, y): return x[0] - y[1], x[1] - y[0] def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): if isinstance(v[0], Tensor) and isinstance(v[1], Tensor): # constants if both inputs are tensors self.solver_vars = self.forward(v[0], v[1]) return # we have both gurobi vars as inputs this_layer_shape = self.output_shape gvar_array1 = np.array(v[0]) gvar_array2 = np.array(v[1]) assert gvar_array1.shape == gvar_array2.shape and gvar_array1.shape == this_layer_shape[1:] # flatten to create vars and constrs first gvar_array1 = gvar_array1.reshape(-1) gvar_array2 = gvar_array2.reshape(-1) new_layer_gurobi_vars = [] for neuron_idx, (var1, var2) in enumerate(zip(gvar_array1, gvar_array2)): var = model.addVar(lb=-float('inf'), ub=float('inf'), obj=0, vtype=grb.GRB.CONTINUOUS, name=f'lay{self.name}_{neuron_idx}') model.addConstr(var == (var1 - var2), name=f'lay{self.name}_{neuron_idx}_eq') new_layer_gurobi_vars.append(var) # reshape to the correct list shape of solver vars self.solver_vars = np.array(new_layer_gurobi_vars).reshape(this_layer_shape[1:]).tolist() model.update() def build_gradient_node(self, grad_upstream): if not self.inputs[0].no_jacobian: grad_node_0 = AddGrad(self.inputs[0].output_shape if self.inputs[0].batch_dim != -1 else torch.Size((1,) + self.inputs[0].output_shape), w=1.0) grad0 = (grad_node_0, (grad_upstream,), []) else: grad0 = None if not self.inputs[1].no_jacobian: grad_node_1 = AddGrad(self.inputs[1].output_shape if self.inputs[1].batch_dim != -1 else torch.Size((1,) + self.inputs[1].output_shape), w=-1.0) grad1 = (grad_node_1, (grad_upstream,), []) else: grad1 = None return [grad0, grad1] class AddGrad(Module): def __init__(self, input_shape, w=1.0): super().__init__() # We need the input shape to handle broadcasting. self.input_shape = input_shape self.w = w def forward(self, grad_last): return reduce_broadcast_dims(grad_last * self.w, self.input_shape) ================================================ FILE: auto_LiRPA/operators/base.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """ Base class and functions for implementing bound operators""" from typing import Optional, List import warnings import torch import torch.nn as nn from torch import Tensor import numpy as np from ..perturbations import * from ..utils import * from ..patches import * torch._C._jit_set_profiling_executor(False) torch._C._jit_set_profiling_mode(False) epsilon = 1e-12 def not_implemented_op(node, func): message = ( f'Function `{func}` of `{node}` is not supported yet.' 
' Please help to open an issue at https://github.com/Verified-Intelligence/auto_LiRPA' ' or implement this function in auto_LiRPA/bound_ops.py' ' or auto_LiRPA/operators by yourself.') raise NotImplementedError(message) class Interval(tuple): """Interval object for interval bound propagation.""" # Subclassing tuple object so that all previous code can be reused. def __new__(self, lb=None, ub=None, ptb=None): return tuple.__new__(Interval, (lb, ub)) def __init__(self, lb, ub, ptb=None): if ptb is None: self.ptb = None # `self.ptb == None` means that this interval # is not perturbed and it shall be treated as a constant and lb = ub. # To avoid mistakes, in this case the caller must make sure lb and ub are the same object. assert lb is ub else: if not isinstance(ptb, Perturbation): raise ValueError("ptb must be a Perturbation object or None. Got type {}".format(type(ptb))) else: self.ptb = ptb def __str__(self): return "({}, {}) with ptb={}".format(self[0], self[1], self.ptb) def __repr__(self): return "Interval(lb={}, ub={}, ptb={})".format(self[0], self[1], self.ptb) @staticmethod def make_interval(lb, ub, other=None): """Checking if the other interval is tuple, keep the perturbation.""" if isinstance(other, Interval): return Interval(lb, ub, ptb=other.ptb) else: return (lb, ub) @staticmethod def get_perturbation(interval): """Given a tuple or Interval object, returns the norm and eps.""" if isinstance(interval, Interval) and interval.ptb is not None: if isinstance(interval.ptb, PerturbationLpNorm): return interval.ptb.norm, interval.ptb.eps elif isinstance(interval.ptb, PerturbationSynonym): return torch.inf, 1.0 elif isinstance(interval.ptb, PerturbationL0Norm): return 0, interval.ptb.eps, interval.ptb.ratio elif isinstance(interval.ptb, PerturbationLinear): return torch.inf, 0.0 else: raise RuntimeError("get_perturbation() does not know how to handle {}".format(type(interval.ptb))) else: # Tuple object. Assuming L infinity norm lower and upper bounds. return torch.inf, np.nan @staticmethod def is_perturbed(interval): """Checking if a Interval or tuple object has perturbation enabled.""" if isinstance(interval, Interval) and interval.ptb is None: return False else: return True class Bound(nn.Module): r""" Base class for supporting the bound computation of an operator. Please see examples at `auto_LiRPA/operators`. Args: attr (dict): Attributes of the operator. inputs (list): A list of input nodes. output_index (int): The index in the output if the operator has multiple outputs. Usually output_index=0. options (dict): Bound options. Be sure to run `super().__init__(attr, inputs, output_index, options, device)` first in the `__init__` function. """ def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__() attr = {} if attr is None else attr inputs = [] if inputs is None else inputs options = {} if options is None else options self.name: Optional[str] = None self.output_name = [] self.device = attr.get('device') self.attr = attr self.inputs: List['Bound'] = inputs self.output_index = output_index self.options = options # Mark if this node is used in the bound computation (from the output node). self.used = False self.forward_value = None self.output_shape = None self.from_input = False self.bounded = False self.IBP_rets = None self.requires_input_bounds = [] self.from_complex_node = None # If True, when building the Jacobian graph, this node should be treated # as a constant and there is no need to further propagate Jacobian. 
self.no_jacobian = False # If True, when we are computing intermediate bounds for these ops, # we simply use IBP to propagate bounds from its input nodes # instead of CROWN. Currently only operators with a single input can be # supported. self.ibp_intermediate = False self.splittable = self.force_not_splittable = False # Determine if this node has a perturbed output or not. The function BoundedModule._mark_perturbed_nodes() will set this property. self.perturbed = False self.never_perturbed = False if options is not None and 'loss_fusion' in options: self.loss_fusion = options['loss_fusion'] else: self.loss_fusion = False self.options = options # Use `default_interval_propagate` self.use_default_ibp = False # If set to true, the backward bound output of this node is 0. self.zero_backward_coeffs_l = False self.zero_backward_coeffs_u = False # If set to true, the A matrix accumulated on this node is 0. self.zero_lA_mtx = False self.zero_uA_mtx = False self.patches_start = False self.alpha_beta_update_mask = None self.is_final_node = False # By default, we assue this node has no batch dimension. # It will be updated in BoundedModule.get_forward_value(). self.batch_dim = -1 # The .lower and .upper properties are written to as part of the bound propagation. # Usually, in iterative refinement, each bound only depends on bounds previously # computed in the same iteration. However, this changes if INVPROP is used to incorporate # output constraints. Then, we also need bounds of layers *after* the currently bounded # layer. Therefore, we have to cache the older bounds. self._is_lower_bound_current = False self._lower = None self._is_upper_bound_current = False self._upper = None # A list containing the output ACTIVATIONS node from this node. # Please check backward_bound.py, forward_bound.py, batch_branch_and_bound.py for more info. 
self.output_activations = None def __repr__(self, attrs=None): inputs = ', '.join([node.name for node in self.inputs]) ret = (f'{self.__class__.__name__}(name={self.name}, ' f'inputs=[{inputs}], perturbed={self.perturbed}') if attrs is not None: for k, v in attrs.items(): ret += f', {k}={v}' ret += ')' return ret @property def lower(self): return self._lower @lower.setter def lower(self, value): if not (value is None or isinstance(value, torch.Tensor)): raise TypeError(f'lower must be a tensor or None, got {type(value)}') if value is None: self._is_lower_bound_current = False else: self._is_lower_bound_current = True self._lower = value @property def upper(self): return self._upper @upper.setter def upper(self, value): if not (value is None or isinstance(value, torch.Tensor)): raise TypeError(f'upper must be a tensor or None, got {type(value)}') if value is None: self._is_upper_bound_current = False else: self._is_upper_bound_current = True self._upper = value def move_lower_and_upper_bounds_to_cache(self): if self._lower is not None: self._lower = self._lower.detach().requires_grad_(False) self._is_lower_bound_current = False if self._upper is not None: self._upper = self._upper.detach().requires_grad_(False) self._is_upper_bound_current = False def delete_lower_and_upper_bounds(self): self._lower = None self._upper = None self._is_lower_bound_current = False self._is_upper_bound_current = False def is_lower_bound_current(self): return self._is_lower_bound_current def is_upper_bound_current(self): return self._is_upper_bound_current def are_output_constraints_activated_for_layer( self: 'Bound', apply_output_constraints_to: Optional[List[str]], ): if self.is_final_node: return False if apply_output_constraints_to is None: return False for layer_type_or_name in apply_output_constraints_to: if layer_type_or_name.startswith('/'): if self.name == layer_type_or_name: return True else: assert layer_type_or_name.startswith('Bound'), ( 'To apply output constraints to tighten layer bounds, pass either the layer name ' '(starting with "/", e.g. "/input.7") or the layer type (starting with "Bound", ' 'e.g. "BoundLinear")' ) if type(self).__name__ == layer_type_or_name: return True return False def init_gammas(self, num_constraints): if not self.are_output_constraints_activated_for_layer( self.options.get('optimize_bound_args', {}).get('apply_output_constraints_to', []) ): return assert len(self.output_shape) > 0, self neurons_in_this_layer = 1 for d in self.output_shape[1:]: neurons_in_this_layer *= d init_gamma_value = 0.0 # We need a different number of gammas depending on whether or not they are shared # However, to the code outside of this class, this should be transparent. # We create the correct number of gammas in gammas_underlying_tensor and if necessary # expand it to simulate a larger tensor. This is just a view, no additional memory is created. # By the outside, only .gammas should be used. However, we must take care to update this view # whenever gammas_underlying_tensor was changed (see clip_gammas) # Note that _set_gammas in optimized_bounds.py needs to refer to the gammas_underlying_tensor, # because that's the leaf tensor for which we need to compute gradients. 
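        # Shape example (illustrative numbers): with num_constraints = 3 output
        # constraints and 10 neurons in this layer,
        #   share_gammas=True : gammas_underlying_tensor has shape (2, 3, 1) and
        #                       self.gammas is its (2, 3, 10) expanded view;
        #   share_gammas=False: both tensors have shape (2, 3, 10).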
if self.options.get('optimize_bound_args', {}).get('share_gammas', False): self.gammas_underlying_tensor = torch.full((2, num_constraints, 1), init_gamma_value, requires_grad=True, device=self.device) self.gammas = self.gammas_underlying_tensor.expand(-1, -1, neurons_in_this_layer) else: self.gammas_underlying_tensor = torch.full((2, num_constraints, neurons_in_this_layer), init_gamma_value, requires_grad=True, device=self.device) self.gammas = self.gammas_underlying_tensor def clip_gammas(self): if not hasattr(self, "gammas"): return self.gammas_underlying_tensor.data = torch.clamp(self.gammas_underlying_tensor.data, min=0.0) # If gammas are shared, self.gammas != self.gammas_underlying_tensor # We've changed self.gammas_underlying_tensor, those changes must be propagated to self.gammas neurons_in_this_layer = 1 for d in self.output_shape[1:]: neurons_in_this_layer *= d if self.options.get('optimize_bound_args', {}).get('share_gammas', False): self.gammas = self.gammas_underlying_tensor.expand(-1, -1, neurons_in_this_layer) def is_input_perturbed(self, i=0): r"""Check if the i-th input is with perturbation or not.""" return i < len(self.inputs) and self.inputs[i].perturbed def clear(self): """ Clear attributes when there is a new input to the network""" pass @property def input_name(self): return [node.name for node in self.inputs] def forward(self, *x): r""" Function for standard/clean forward. Args: x: A list of input values. The length of the list is equal to the number of input nodes. Returns: output (Tensor): The standard/clean output of this node. """ return not_implemented_op(self, 'forward') def interval_propagate(self, *v): r""" Function for interval bound propagation (IBP) computation. There is a default function `self.default_interval_propagate(*v)` in the base class, which can be used if the operator is *monotonic*. To use it, set `self.use_default_ibp = True` in the `__init__` function, and the implementation of this function can be skipped. Args: v: A list of the interval bound of input nodes. Generally, for each element `v[i]`, `v[i][0]` is the lower interval bound, and `v[i][1]` is the upper interval bound. Returns: bound: The interval bound of this node, in a same format as v[i]. """ if self.use_default_ibp or self.never_perturbed: return self.default_interval_propagate(*v) else: return not_implemented_op(self, 'interval_propagate') def default_interval_propagate(self, *v): """Default IBP using the forward function. For unary monotonous functions or functions for altering shapes only but not values. """ if len(v) == 0: return Interval.make_interval(self.forward(), self.forward()) else: if len(v) > 1: for i in range(1, len(v)): assert not self.is_input_perturbed(i) return Interval.make_interval( self.forward(v[0][0], *[vv[0] for vv in v[1:]]), self.forward(v[0][1], *[vv[0] for vv in v[1:]]), v[0]) def bound_forward(self, dim_in, *x): r""" Function for forward mode bound propagation. Forward mode LiRPA computs a `LinearBound` instance representing the linear bound for each involved node. Major attributes of `LinearBound` include `lw`, `uw`, `lb`, `ub`, `lower`, and `upper`. `lw` and `uw` are coefficients of linear bounds w.r.t. model input. Their shape is `(batch_size, dim_in, *standard_shape)`, where `dim_in` is the total dimension of perturbed input nodes of the model, and `standard_shape` is the shape of the standard/clean output. `lb` and `ub` are bias terms of linear bounds, and their shape is equal to the shape of standard/clean output. 
`lower` and `upper` are concretized lower and upper bounds that will be computed later in BoundedModule. Args: dim_in (int): Total dimension of perturbed input nodes of the model. x: A list of the linear bound of input nodes. Each element in x is a `LinearBound` instance. Returns: bound (LinearBound): The linear bound of this node. """ return not_implemented_op(self, 'bound_forward') def bound_dynamic_forward(self, *x, max_dim=None, offset=0): raise NotImplementedError(f'bound_dynamic_forward is not implemented for {self}.') def bound_backward(self, last_lA, last_uA, *x, **kwargs): r""" Function for backward mode bound propagation. Args: last_lA (Tensor): `A` matrix for lower bound computation propagated to this node. It can be `None` if lower bound is not needed. last_uA (Tensor): `A` matrix for upper bound computation propagated to this node. It can be `None` if upper bound is not needed. x: A list of input nodes, with x[i].lower and x[i].upper that can be used as pre-activation bounds. Returns: A: A list of A matrices for the input nodes. Each element is a tuple (lA, uA). lbias (Tensor): The bias term for lower bound computation, introduced by the linear relaxation of this node. . ubias (Tensor): The bias term for upper bound computation, introduced by the linear relaxation of this node. """ return not_implemented_op(self, 'bound_backward') def broadcast_backward(self, A, x): """ Adjust shape of A, adding or removing broadcast dimensions, based on the other operand x. Typically, A has [spec, batch, ...]. The other operand x may have shape [batch, ...], or no batch dimension. Here the "..." dimensions may be different. We need to make sure the two match, by adding or removing dimensions in A. """ if isinstance(A, Tensor): shape = x.output_shape if x.batch_dim == -1: # The other operand has no batch dimension. (e.g., constants). # Add batch dimension to it. if len(shape) < len(A.shape) - 1: shape = torch.Size([1] + list(shape)) else: # The not-from-input operand has batch dimension. # This can happen when the user explicitly unsqueezes the batch dimension on # a constant tensor when building the computation graph. warnings.warn(f"Constant operand of node \033[96m{self}\033[0m has batch dimension. " "Please check your model implementation. " "Constant operands \033[93mSHOULD NOT\033[0m have batch dimension.") A = reduce_broadcast_dims(A, shape) else: pass return A def build_gradient_node(self, grad_upstream): r""" Function for building the gradient node to bound the Jacobian. Args: grad_upstream: Upstream gradient in the gradient back-propagation. Returns: A list. Each item contains the following for computing the gradient of each input: module_grad (torch.nn.Module): Gradient node. grad_input (list): Inputs to the gradient node. Values do not matter. We only want the shapes. grad_extra_nodes (list): Extra nodes needed for the gradient. """ return not_implemented_op(self, 'build_gradient_node') def get_bias(self, A, bias): if A is None: return 0 if not Benchmarking: assert not isnan(A) assert not isnan(bias) if torch.isinf(bias).any(): warnings.warn('There is an inf value in the bias of LiRPA bounds.') if isinstance(A, Tensor): if self.batch_dim != -1: bias_new = torch.einsum('sb...,b...->sb', A, bias) else: bias_new = torch.einsum('sb...,...->sb', A, bias) if isnan(bias_new): # NaN can be caused by 0 * inf, if 0 appears in `A` and inf appears in `bias`. # Force the whole bias to be 0, to avoid gradient issues. # FIXME maybe find a more robust solution. 
return 0 else: # FIXME (09/17): handle the case for pieces.unstable_idx. return bias_new elif isinstance(A, eyeC): batch_size = A.shape[1] if self.batch_dim != -1: return bias.reshape(batch_size, -1).t() else: return bias.reshape(-1).unsqueeze(-1).repeat(1, batch_size) elif type(A) == Patches: # the shape of A.patches is [batch, L, out_c, in_c, K, K] if self.batch_dim != -1: # Input A patches has shape (spec, batch, out_h, out_w, in_c, H, W) or (unstable_size, batch, in_c, H, W). patches = A.patches # Here the size of bias is [batch_size, out_h, out_w, in_c, H, W] bias = inplace_unfold(bias, kernel_size=A.patches.shape[-2:], stride=A.stride, padding=A.padding, inserted_zeros=A.inserted_zeros, output_padding=A.output_padding) if A.unstable_idx is not None: # Sparse bias has shape [unstable_size, batch_size, in_c, H, W]. No need to select over the out_c dimension. bias = bias[:, A.unstable_idx[1], A.unstable_idx[2]] # bias_new has shape (unstable_size, batch). bias_new = torch.einsum('bschw,sbchw->sb', bias, patches) else: # Sum over the in_c, H, W dimension. Use torch.einsum() to save memory, equal to: # bias_new = (bias * patches).sum(-1,-2,-3).transpose(-2, -1) # bias_new has shape (spec, batch, out_h, out_w). bias_new = torch.einsum('bijchw,sbijchw->sbij', bias, patches) else: # Similar to BoundConstant. (BoundConstant does not have batch_dim). # FIXME (09/16): bias size is different for BoundConstant. We should use the same size! patches = A.patches bias_new = torch.sum(patches, dim=(-1, -2, -3)) * bias.to(self.device) # Return shape is (spec, batch, out_h, out_w) or (unstable_size, batch). return bias_new return bias_new else: return NotImplementedError() def make_axis_non_negative(self, axis, shape='input'): """Convert negative axis to non-negative axis. Args: axis (int or tuple or list): The axis to be converted. shape (str or torch.Size): The shape of the tensor. If 'input', use self.input_shape. If 'output', use self.output_shape. Otherwise, it should be a torch.Size object. For example, if the tensor shape is (2, 3, 4), then axis -1 will be converted to 2. For the "squeeze" operation, the shape should be the 'input' shape. While for the "unsqueeze" operation, the shape should be the 'output' shape. Returns: axis (int or tuple): The non-negative axis. """ if isinstance(axis, (tuple, list)): return tuple(sorted([self.make_axis_non_negative(item, shape) for item in axis])) if shape == 'input': shape = self.input_shape elif shape == 'output': shape = self.output_shape else: assert isinstance(shape, torch.Size) if axis < 0: return axis + len(shape) else: return axis def update_requires_input_bounds(self): """Update requires_input_bounds. This function is called once we know if the input nodesare perturbed. """ pass def clamp_interim_bounds(self): """Clamp intermediate bounds.""" pass def check_constraint_available(self, node, flag=False): if hasattr(node, 'cstr_interval'): flag = True for n in node.inputs: if not n.from_input: flag = flag or self.check_constraint_available(n, flag) return flag def _ibp_constraint(self, node: 'Bound', delete_bounds_after_use=False): def _delete_unused_bounds(node_list): """Delete bounds from input layers after use to save memory. 
Used when sparse_intermediate_bounds_with_ibp is true.""" if delete_bounds_after_use: for n in node_list: del n.cstr_interval del n.cstr_lower del n.cstr_upper if not node.perturbed and hasattr(node, 'forward_value'): node.cstr_lower, node.cstr_upper = node.cstr_interval = ( node.forward_value, node.forward_value) to_be_deleted_bounds = [] if not hasattr(node, 'cstr_interval'): for n in node.inputs: if not hasattr(n, 'cstr_interval'): # Node n does not have interval bounds; we must compute it. self._ibp_constraint( n, delete_bounds_after_use=delete_bounds_after_use) to_be_deleted_bounds.append(n) inp = [n_pre.cstr_interval for n_pre in node.inputs] node.cstr_interval = node.interval_propagate(*inp) node.cstr_lower, node.cstr_upper = node.cstr_interval if isinstance(node.cstr_lower, torch.Size): node.cstr_lower = torch.tensor(node.cstr_lower) node.cstr_interval = (node.cstr_lower, node.cstr_upper) if isinstance(node.cstr_upper, torch.Size): node.cstr_upper = torch.tensor(node.cstr_upper) node.cstr_interval = (node.cstr_lower, node.cstr_upper) if node.is_lower_bound_current(): node.lower = torch.where(node.lower >= node.cstr_lower, node.lower, node.cstr_lower) node.upper = torch.where(node.upper <= node.cstr_upper, node.upper, node.cstr_upper) node.interval = (node.lower, node.upper) _delete_unused_bounds(to_be_deleted_bounds) return node.cstr_interval def _check_weight_perturbation(self): weight_perturbation = False for n in self.inputs[1:]: if hasattr(n, 'perturbation'): if n.perturbation is not None: weight_perturbation = True if weight_perturbation: self.requires_input_bounds = list(range(len(self.inputs))) else: self.requires_input_bounds = [] return weight_perturbation def non_deter_wrapper(self, op, *args, **kwargs): """Some operations are non-deterministic and deterministic mode will fail. So we temporary disable it.""" if self.options.get('deterministic', False): torch.use_deterministic_algorithms(False) ret = op(*args, **kwargs) if self.options.get('deterministic', False): torch.use_deterministic_algorithms(True) return ret def non_deter_scatter_add(self, *args, **kwargs): return self.non_deter_wrapper(torch.scatter_add, *args, **kwargs) def non_deter_index_select(self, *args, **kwargs): return self.non_deter_wrapper(torch.index_select, *args, **kwargs) ================================================ FILE: auto_LiRPA/operators/bivariate.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """ Bivariate operators""" import torch from torch import Tensor from torch.nn import Module from typing import Dict, Optional from .base import * from .activation_base import BoundOptimizableActivation from .convex_concave import BoundSqrt from .clampmult import multiply_by_A_signs from ..utils import * class MulHelper: """Handle linear relaxation for multiplication. 
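Roughly speaking, the planes produced here are the McCormick relaxation of z = x * y over the box [x_l, x_u] x [y_l, y_u]; for example, the default lower plane is z >= y_l * x + x_l * y - y_l * x_l, and `interpolated_relaxation` blends the two corner planes with a ratio in [0, 1] (a fixed ratio of 0.5 reproduces the `middle` option).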
This helper can be used by BoundMul, BoundMatMul, BoundLinear (with weight perturbation). """ def __init__(self): pass @staticmethod def interpolated_relaxation(x_l: Tensor, x_u: Tensor, y_l: Tensor, y_u: Tensor, r_l: Optional[Tensor] = None, r_u: Optional[Tensor] = None, middle: bool = False, ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: """Interpolate two optimal linear relaxations for optimizable bounds.""" if r_l is None and r_u is None: if middle: # This option is equivalent to optimized linear relaxation # with 0.5 as the fixed parameter. # It interpolates two valid linear relaxations. # See Appendix C in https://openreview.net/pdf?id=BJxwPJHFwS alpha_l = (y_l - y_u) * 0.5 + y_u beta_l = (x_l - x_u) * 0.5 + x_u gamma_l = (y_u * x_u - y_l * x_l) * 0.5 - y_u * x_u alpha_u = (y_u - y_l) * 0.5 + y_l beta_u = (x_l - x_u) * 0.5 + x_u gamma_u = (y_l * x_u - y_u * x_l) * 0.5 - y_l * x_u else: alpha_l, beta_l, gamma_l = y_l, x_l, -y_l * x_l alpha_u, beta_u, gamma_u = y_u, x_l, -y_u * x_l return alpha_l, beta_l, gamma_l, alpha_u, beta_u, gamma_u else: assert isinstance(r_l, Tensor) and isinstance(r_u, Tensor) # TODO (for zhouxing/qirui): this function may benefit from JIT, # because it has many element-wise operation which can be fused. # Need to benchmark and see performance. alpha_l = (y_l - y_u) * r_l + y_u beta_l = (x_l - x_u) * r_l + x_u gamma_l = (y_u * x_u - y_l * x_l) * r_l - y_u * x_u alpha_u = (y_u - y_l) * r_u + y_l beta_u = (x_l - x_u) * r_u + x_u gamma_u = (y_l * x_u - y_u * x_l) * r_u - y_l * x_u return alpha_l, beta_l, gamma_l, alpha_u, beta_u, gamma_u @staticmethod def get_relaxation(x_l: Tensor, x_u: Tensor, y_l: Tensor, y_u: Tensor, opt_stage: Optional[str], alphas: Optional[Dict[str, Tensor]], start_name: Optional[str], middle: bool = False, ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: if opt_stage in ['opt', 'reuse']: assert x_l.ndim == y_l.ndim ns = start_name alphas[ns].data[:] = alphas[ns].data[:].clamp(min=0, max=1) return MulHelper.interpolated_relaxation( x_l, x_u, y_l, y_u, alphas[ns][:2], alphas[ns][2:4]) else: return MulHelper.interpolated_relaxation( x_l, x_u, y_l, y_u, middle=middle) @staticmethod def get_forward_relaxation(x_l, x_u, y_l, y_u, opt_stage, alpha, start_name): # Broadcast # FIXME perhaps use a more efficient way x_l = x_l + torch.zeros_like(y_l) x_u = x_u + torch.zeros_like(y_u) y_l = y_l + torch.zeros_like(x_l) y_u = y_u + torch.zeros_like(x_u) return MulHelper.get_relaxation(x_l, x_u, y_l, y_u, opt_stage, alpha, start_name) @staticmethod def _get_gap(x, y, alpha, beta): return x * y - alpha * x - beta * y class BoundMul(BoundOptimizableActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.splittable = True self.mul_helper = MulHelper() if options is None: options = {} self.middle = options.get('mul', {}).get('middle', False) def forward(self, x, y): self.x_shape = x.shape self.y_shape = y.shape return x * y def get_relaxation_opt(self, x_l, x_u, y_l, y_u): return self.mul_helper.get_relaxation( x_l, x_u, y_l, y_u, self.opt_stage, getattr(self, 'alpha', None), getattr(self, '_start', None), middle=self.middle) def _init_opt_parameters_impl(self, size_spec, **kwargs): """Implementation of init_opt_parameters for each start_node.""" x_l = self.inputs[0].lower y_l = self.inputs[1].lower assert x_l.ndim == y_l.ndim shape = [max(x_l.shape[i], y_l.shape[i]) for i in range(x_l.ndim)] alpha = torch.ones(4, size_spec, *shape, 
device=x_l.device) return alpha def _is_softmax(self): """This multiplication comes from softmax. It is the division converted to BoundMul + BoundReciprocal. """ return ( self.from_complex_node == 'BoundSoftmax' and type(self.inputs[0]).__name__ == 'BoundExp' and type(self.inputs[1]).__name__ == 'BoundReciprocal' and type(self.inputs[1].inputs[0]).__name__ == 'BoundReduceSum' and type(self.inputs[1].inputs[0].inputs[0]).__name__ == 'BoundExp') def bound_relax(self, x, y, init=False, dim_opt=None): if init: pass (alpha_l, beta_l, gamma_l, alpha_u, beta_u, gamma_u) = self.get_relaxation_opt( x.lower, x.upper, y.lower, y.upper) # Check NaN which can happen in softmax if Exp's bounds are too loose if self._is_softmax(): assert alpha_l.shape[:-1] == beta_l.shape[:-1] assert alpha_l.shape[-1] == 1 or alpha_l.shape[-1] == beta_l.shape[-1] assert beta_l.shape == gamma_l.shape mask = (alpha_l.isnan().expand(beta_l.shape) | alpha_l.isinf().expand(beta_l.shape) | beta_l.isnan() | beta_l.isinf() | gamma_l.isnan() | gamma_l.isinf()) if mask.any(): alpha_l = alpha_l.clone() alpha_l[mask.any(dim=-1)] = 0 beta_l = beta_l.clone() beta_l[mask] = 0 gamma_l = gamma_l.clone() gamma_l[mask] = 0 assert alpha_u.shape[:-1] == beta_u.shape[:-1] assert alpha_u.shape[-1] == 1 or alpha_u.shape[-1] == beta_u.shape[-1] assert beta_u.shape == gamma_u.shape mask = (alpha_u.isnan().expand(beta_u.shape) | alpha_u.isinf().expand(beta_u.shape) | beta_u.isnan() | beta_u.isinf() | gamma_u.isnan() | gamma_u.isinf()) if mask.any(): alpha_u = alpha_u.clone() alpha_u[mask.any(dim=-1)] = 0 beta_u = beta_u.clone() beta_u[mask] = 0 gamma_u = gamma_u.clone() gamma_u[mask] = 1. self.lw = [alpha_l, beta_l] self.lb = gamma_l self.uw = [alpha_u, beta_u] self.ub = gamma_u @staticmethod def _multiply_by_const(x, const): if isinstance(x, torch.Tensor): return x * const elif isinstance(x, Patches): # Multiplies patches by a const. Assuming const is a tensor, and it must be in nchw format. assert isinstance(const, torch.Tensor) and const.ndim == 4 if (const.size(0) == 1 or const.size(0) == x.patches.size(1)) and const.size(1) == x.patches.size(-3) and const.size(2) == const.size(3) == 1: # The case that we can do channel-wise broadcasting multiplication # Shape of const: (batch, in_c, 1, 1) # Shape of patches when unstable_idx is None: (out_c, batch, out_h, out_w, in_c, patch_h, patch_w) # Shape of patches when unstable_idx is not None: (unstable_size, batch, in_c, patch_h, patch_w) const_reshaped = const else: assert x.unstable_idx is None and (x.padding == 0 or x.padding == [0,0,0,0]) and x.stride == 1 and x.patches.size(-1) == x.patches.size(-2) == 1 # The assumed dimension is (out_c, N, out_h, out_w, in_c, 1, 1) with padding = 0 and stride = 1. # In this special case we can directly multiply. # After reshape it is (1, N, H, W, C, 1, 1) const_reshaped = const.permute(0, 2, 3, 1).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) return x.create_similar(x.patches * const_reshaped) else: raise ValueError(f'Unsupported x type {type(x)}') def bound_backward_constant(self, last_lA, last_uA, x, y, op=None, reduce_bias=True, **kwargs): assert reduce_bias op = BoundMul._multiply_by_const if op is None else op # Handle the case of multiplication by a constant. factor = None if x.perturbed: factor = y.forward_value if y.perturbed: factor = x.forward_value # No need to compute A matrix if it is Constant.
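# Sketch of the constant case handled below (informal, dense-tensor view): for z = x * c with
# c constant, any linear relation A_z @ z equals (A_z * c) @ x after elementwise scaling, so
# the A matrix is simply multiplied by the constant (via `op`) and reshaped back to the input's
# shape with broadcast_backward(); no relaxation is needed and no bias term is introduced,
# hence the returned 0., 0.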
lAx = (None if not x.perturbed or last_lA is None else self.broadcast_backward(op(last_lA, factor), x)) uAx = (None if not x.perturbed or last_uA is None else self.broadcast_backward(op(last_uA, factor), x)) lAy = (None if not y.perturbed or last_lA is None else self.broadcast_backward(op(last_lA, factor), y)) uAy = (None if not y.perturbed or last_uA is None else self.broadcast_backward(op(last_uA, factor), y)) return [(lAx, uAx), (lAy, uAy)], 0., 0. def bound_backward(self, last_lA, last_uA, x, y, start_node=None, **kwargs): if start_node is not None: self._start = start_node.name if self.is_linear_op: ret = self.bound_backward_constant(last_lA, last_uA, x, y, **kwargs) else: ret = self.bound_backward_both_perturbed( last_lA, last_uA, x, y, **kwargs) return ret def bound_backward_both_perturbed(self, last_lA, last_uA, x, y, reduce_bias=True, **kwargs): self.bound_relax(x, y) def _bound_oneside(last_A, alpha_pos, beta_pos, gamma_pos, alpha_neg, beta_neg, gamma_neg, opt=False): if last_A is None: return None, None, 0 if type(last_A) == Patches: assert reduce_bias assert last_A.identity == 0 # last_A shape: [out_c, batch_size, out_h, out_w, in_c, H, W]. # Here out_c is the spec dimension. # for patches mode, we need to unfold the alpha_pos/neg and beta_pos/neg alpha_pos = maybe_unfold_patches(alpha_pos, last_A) alpha_neg = maybe_unfold_patches(alpha_neg, last_A) beta_pos = maybe_unfold_patches(beta_pos, last_A) beta_neg = maybe_unfold_patches(beta_neg, last_A) gamma_pos = maybe_unfold_patches(gamma_pos, last_A) gamma_neg = maybe_unfold_patches(gamma_neg, last_A) A_x, bias = multiply_by_A_signs( last_A, alpha_pos, alpha_neg, gamma_pos, gamma_neg) A_y, _ = multiply_by_A_signs( last_A, beta_pos, beta_neg, None, None) elif type(last_A) == Tensor: last_A_pos, last_A_neg = last_A.clamp(min=0), last_A.clamp(max=0) A_x, _ = multiply_by_A_signs(last_A, alpha_pos, alpha_neg, None, None) A_y, _ = multiply_by_A_signs(last_A, beta_pos, beta_neg, None, None) A_x = self.broadcast_backward(A_x, x) A_y = self.broadcast_backward(A_y, y) if reduce_bias: if opt: bias = (torch.einsum('sb...,sb...->sb', last_A_pos, gamma_pos) + torch.einsum('sb...,sb...->sb', last_A_neg, gamma_neg)) else: bias = (self.get_bias(last_A_pos, gamma_pos.squeeze(0)) + self.get_bias(last_A_neg, gamma_neg.squeeze(0))) else: assert not opt bias = last_A_pos * gamma_pos + last_A_neg * gamma_neg assert len(x.output_shape) == bias.ndim - 1 assert len(y.output_shape) == bias.ndim - 1 bias_x = bias_y = bias for i in range(2, bias.ndim): if bias_x.shape[i] != x.output_shape[i - 1]: assert x.output_shape[i - 1] == 1 bias_x = bias_x.sum(i, keepdim=True) for i in range(2, bias.ndim): if bias_y.shape[i] != y.output_shape[i - 1]: assert y.output_shape[i - 1] == 1 bias_y = bias_y.sum(i, keepdim=True) bias = (bias_x, bias_y) else: raise NotImplementedError(last_A) return A_x, A_y, bias alpha_l, beta_l, gamma_l = self.lw[0], self.lw[1], self.lb alpha_u, beta_u, gamma_u = self.uw[0], self.uw[1], self.ub if self.opt_stage in ['opt', 'reuse']: lA_x, lA_y, lbias = _bound_oneside( last_lA, alpha_l[0], beta_l[0], gamma_l[0], alpha_u[0], beta_u[0], gamma_u[0], opt=True) uA_x, uA_y, ubias = _bound_oneside( last_uA, alpha_u[1], beta_u[1], gamma_u[1], alpha_l[1], beta_l[1], gamma_l[1], opt=True) else: alpha_l, alpha_u = alpha_l.unsqueeze(0), alpha_u.unsqueeze(0) beta_l, beta_u = beta_l.unsqueeze(0), beta_u.unsqueeze(0) gamma_l, gamma_u = gamma_l.unsqueeze(0), gamma_u.unsqueeze(0) lA_x, lA_y, lbias = _bound_oneside( last_lA, alpha_l, beta_l, gamma_l, alpha_u, 
beta_u, gamma_u) uA_x, uA_y, ubias = _bound_oneside( last_uA, alpha_u, beta_u, gamma_u, alpha_l, beta_l, gamma_l) return [(lA_x, uA_x), (lA_y, uA_y)], lbias, ubias def bound_forward(self, dim_in, x, y): if self.is_linear_op: if not self.inputs[0].perturbed: return self.bound_forward_constant(x, y, self.inputs[0].batch_dim != -1) elif not self.inputs[1].perturbed: return self.bound_forward_constant(y, x, self.inputs[1].batch_dim != -1) else: assert False, "When is_linear_op is True, at least one input should be constant." return self.bound_forward_both_perturbed(dim_in, x, y) def bound_forward_constant(self, x, y, batched_constant): # x is constant const = x.lb const_pos, const_neg = const.clamp(min=0), const.clamp(max=0) lb = const_pos * y.lb + const_neg * y.ub ub = const_pos * y.ub + const_neg * y.lb if batched_constant: # If x is batched, its first dimension will be the batch dimension # We need to unsqueeze an extra dimension to align the batch dimension # x and y both have shape (B, a_1, a_2, ..., a_n) # lw/uw has shape (B, dim_in, a_1, a_2, ..., a_n) const_pos = const_pos.unsqueeze(1) const_neg = const_neg.unsqueeze(1) lw = const_pos * y.lw + const_neg * y.uw uw = const_pos * y.uw + const_neg * y.lw return LinearBound(lw, lb, uw, ub) def bound_forward_both_perturbed(self, dim_in, x, y): x_lw, x_lb, x_uw, x_ub = x.lw, x.lb, x.uw, x.ub y_lw, y_lb, y_uw, y_ub = y.lw, y.lb, y.uw, y.ub (alpha_l, beta_l, gamma_l, alpha_u, beta_u, gamma_u) = MulHelper.get_forward_relaxation( x.lower, x.upper, y.lower, y.upper, self.opt_stage, getattr(self, 'alpha', None), self._start) if x_lw is None: x_lw = 0 if y_lw is None: y_lw = 0 if x_uw is None: x_uw = 0 if y_uw is None: y_uw = 0 lw = alpha_l.unsqueeze(1).clamp(min=0) * x_lw + alpha_l.unsqueeze(1).clamp(max=0) * x_uw lw = lw + beta_l.unsqueeze(1).clamp(min=0) * y_lw + beta_l.unsqueeze(1).clamp(max=0) * y_uw lb = (alpha_l.clamp(min=0) * x_lb + alpha_l.clamp(max=0) * x_ub + beta_l.clamp(min=0) * y_lb + beta_l.clamp(max=0) * y_ub + gamma_l) uw = alpha_u.unsqueeze(1).clamp(max=0) * x_lw + alpha_u.unsqueeze(1).clamp(min=0) * x_uw uw = uw + beta_u.unsqueeze(1).clamp(max=0) * y_lw + beta_u.unsqueeze(1).clamp(min=0) * y_uw ub = (alpha_u.clamp(max=0) * x_lb + alpha_u.clamp(min=0) * x_ub + beta_u.clamp(max=0) * y_lb + beta_u.clamp(min=0) * y_ub + gamma_u) return LinearBound(lw, lb, uw, ub) @staticmethod def interval_propagate_constant(x, y, op=lambda x, const: x * const): # x is constant const = x[0] inp_lb = y[0] inp_ub = y[1] pos_mask = (const > 0).to(dtype=inp_lb.dtype) neg_mask = 1. - pos_mask lb = op(inp_lb, const * pos_mask) + op(inp_ub, const * neg_mask) ub = op(inp_ub, const * pos_mask) + op(inp_lb, const * neg_mask) return lb, ub def interval_propagate(self, x, y): if self.is_linear_op: if not self.inputs[0].perturbed: return self.interval_propagate_constant(x, y) elif not self.inputs[1].perturbed: return self.interval_propagate_constant(y, x) else: assert False, "When is_linear_op is True, at least one input should be constant." else: lower, upper = self.interval_propagate_both_perturbed(x, y) if self._is_softmax(): lower = lower.clamp(min=0) upper = upper.clamp(max=1) return lower, upper @staticmethod def interval_propagate_both_perturbed(*v): x, y = v[0], v[1] if x is y: # A shortcut for x * x. h_L, h_U = v[0] r0 = h_L * h_L r1 = h_U * h_U # When h_L < 0, h_U > 0, lower bound is 0. # When h_L < 0, h_U < 0, lower bound is h_U * h_U. # When h_L > 0, h_U > 0, lower bound is h_L * h_L. 
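# Quick numeric check of the one-line trick below (illustrative values only):
#   h_L = -2, h_U = 3:  relu(-2) - relu(-3) = 0,  0 * 0 = 0        (0 lies inside the interval)
#   h_L = -3, h_U = -1: relu(-3) - relu(1) = -1,  (-1) * (-1) = 1  (= h_U * h_U)
#   h_L = 1,  h_U = 3:  relu(1) - relu(-3) = 1,   1 * 1 = 1        (= h_L * h_L)
# so l * l reproduces the three cases enumerated above in a single expression.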
l = F.relu(h_L) - F.relu(-h_U) return l * l, torch.max(r0, r1) r0, r1, r2, r3 = x[0] * y[0], x[0] * y[1], x[1] * y[0], x[1] * y[1] lower = torch.min(torch.min(r0, r1), torch.min(r2, r3)) upper = torch.max(torch.max(r0, r1), torch.max(r2, r3)) return lower, upper def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): if isinstance(v[0], Tensor): self.solver_vars = self.forward(*v) return gvar_array = np.array(v[0]) gvar_array = gvar_array * v[1].cpu().numpy() self.solver_vars = gvar_array.tolist() def update_requires_input_bounds(self): self.is_linear_op = False for inp in self.inputs: if not inp.perturbed: # If any of the two inputs are constant, we do not need input bounds. self.is_linear_op = True if self.is_linear_op: # One input is constant; no bounds required. self.requires_input_bounds = [] self.splittable = False else: # Both inputs are perturbed. Need relaxation. self.requires_input_bounds = [0, 1] if not self.force_not_splittable: self.splittable = True def build_gradient_node(self, grad_upstream): grad_node_0 = MulGrad(self.inputs[0].output_shape if self.inputs[0].batch_dim != -1 else torch.Size((1,) + self.inputs[0].output_shape)) grad_node_1 = MulGrad(self.inputs[1].output_shape if self.inputs[1].batch_dim != -1 else torch.Size((1,) + self.inputs[1].output_shape)) return [(grad_node_0, (grad_upstream, self.inputs[1].forward_value), [self.inputs[1]]), (grad_node_1, (grad_upstream, self.inputs[0].forward_value), [self.inputs[0]])] class MulGrad(Module): def __init__(self, input_shape): super().__init__() # We need the input shape to handle broadcasting self.input_shape = input_shape def forward(self, grad_last, y): # z = x * y # ∂z/∂x = y if y.ndim > 0: # If y is not a constant scalar, its second dimension is for spec y = y.unsqueeze(1) return reduce_broadcast_dims(grad_last * y, self.input_shape) class BoundDiv(Bound): def forward(self, x, y): # FIXME (05/11/2022): ad-hoc implementation for layer normalization if isinstance(self.inputs[1], BoundSqrt): input = self.inputs[0].inputs[0] x = input.forward_value n = input.forward_value.shape[-1] dev = x * (1. - 1. / n) - (x.sum(dim=-1, keepdim=True) - x) / n dev_sqr = dev ** 2 s = (dev_sqr.sum(dim=-1, keepdim=True) - dev_sqr) / dev_sqr.clamp(min=epsilon) sqrt = torch.sqrt(1. / n * (s + 1)) return torch.sign(dev) * (1. / sqrt) return x / y ================================================ FILE: auto_LiRPA/operators/clampmult.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### """Element multiplication with the A matrix based on its sign.""" import torch from typing import Optional, Tuple from torch import Tensor from ..patches import Patches torch._C._jit_set_profiling_executor(False) torch._C._jit_set_profiling_mode(False) class ClampedMultiplication(torch.autograd.Function): @staticmethod @torch.no_grad() @torch.jit.script def clamp_mutiply_forward(A: Tensor, d_pos: Tensor, d_neg: Tensor, b_pos: Optional[Tensor], b_neg: Optional[Tensor], patches_mode: bool, reduce_bias: bool = False, same_slope: bool = False ) -> Tuple[Tensor, Tensor]: """Forward operations; actually the same as the reference implementation.""" A_pos = A.clamp(min=0) A_neg = A.clamp(max=0) if same_slope: # "same-slope" option is enabled; lower and upper bounds use the same A. A_new = d_pos * A else: A_new = d_pos * A_pos + d_neg * A_neg bias_pos = bias_neg = torch.zeros( (), dtype=A_new.dtype, device=A_new.device) if b_pos is not None: if not reduce_bias: bias_pos = A_pos * b_pos else: if patches_mode: bias_pos = torch.einsum('sb...chw,sb...chw->sb...', A_pos, b_pos) else: bias_pos = torch.einsum('sb...,sb...->sb', A_pos, b_pos) if b_neg is not None: if not reduce_bias: bias_neg = A_neg * b_neg else: if patches_mode: bias_neg = torch.einsum('sb...chw,sb...chw->sb...', A_neg, b_neg) else: bias_neg = torch.einsum('sb...,sb...->sb', A_neg, b_neg) return A_new, bias_pos + bias_neg @staticmethod @torch.no_grad() @torch.jit.script def clamp_mutiply_backward(A: Tensor, d_pos: Tensor, d_neg: Tensor, b_pos: Optional[Tensor], b_neg: Optional[Tensor], grad_output_A: Tensor, grad_output_bias: Optional[Tensor], same_slope: bool = False ) -> Tuple[Tensor, Tensor, Tensor, Optional[Tensor], Optional[Tensor], None, None, None]: """Improved backward operation. This should be better than the backward function generated by Pytorch.""" if grad_output_bias is not None: extension_dim = len(A.shape) - len(grad_output_bias.shape) grad_output_bias = grad_output_bias.view( grad_output_bias.shape + (1, ) * extension_dim) A_pos_mask = (A >= 0).to(dtype=grad_output_A.dtype) A_neg_mask = 1. - A_pos_mask A_pos_grad_output_A = A_pos_mask * grad_output_A A_neg_grad_output_A = A_neg_mask * grad_output_A # Although d_pos is d_neg, we still need to get gd_pos and gd_neg separately. 
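# Derivation sketch for the default (not same_slope) branch, where the forward pass computed
# A_new = d_pos * A.clamp(min=0) + d_neg * A.clamp(max=0):
#   dA_new/dd_pos equals A on entries with A >= 0 and 0 elsewhere, so by the chain rule
#   gd_pos = (A * A_pos_mask) * grad_output_A, written below as A * A_pos_grad_output_A to
#   reuse the masked gradient that is also needed for gA; gd_neg is the symmetric A < 0 case.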
gd_pos = A * A_pos_grad_output_A gd_neg = A * A_neg_grad_output_A if b_pos is not None and b_neg is not None and grad_output_bias is not None: A_pos_grad_output_bias = A_pos_mask * grad_output_bias A_neg_grad_output_bias = A_neg_mask * grad_output_bias gb_neg = A * A_neg_grad_output_bias gb_pos = A * A_pos_grad_output_bias if same_slope: gA = (d_pos * grad_output_A + b_pos * A_pos_grad_output_bias + b_neg * A_neg_grad_output_bias) else: gA = (d_pos * A_pos_grad_output_A + d_neg * A_neg_grad_output_A + b_pos * A_pos_grad_output_bias + b_neg * A_neg_grad_output_bias) elif b_neg is not None and grad_output_bias is not None: A_neg_grad_output_bias = A_neg_mask * grad_output_bias gb_neg = A * A_neg_grad_output_bias gb_pos = None if same_slope: gA = (d_pos * grad_output_A + b_neg * A_neg_grad_output_bias) else: gA = (d_pos * A_pos_grad_output_A + d_neg * A_neg_grad_output_A + b_neg * A_neg_grad_output_bias) elif b_pos is not None and grad_output_bias is not None: A_pos_grad_output_bias = A_pos_mask * grad_output_bias gb_pos = A * A_pos_grad_output_bias gb_neg = None if same_slope: gA = (d_pos * grad_output_A + b_pos * A_pos_grad_output_bias) else: gA = (d_pos * A_pos_grad_output_A + d_neg * A_neg_grad_output_A + b_pos * A_pos_grad_output_bias) else: if same_slope: gA = d_pos * grad_output_A else: gA = d_pos * A_pos_grad_output_A + d_neg * A_neg_grad_output_A gb_pos = gb_neg = None return gA, gd_pos, gd_neg, gb_pos, gb_neg, None, None, None @staticmethod def forward(ctx, A, d_pos, d_neg, b_pos, b_neg, patches_mode, reduce_bias=True, same_slope=False): # No need to save the intermediate A_pos, A_neg as they have been fused into the computation. ctx.save_for_backward(A, d_pos, d_neg, b_pos, b_neg) ctx.patches_mode = patches_mode ctx.reduce_bias = reduce_bias ctx.same_slope = same_slope return ClampedMultiplication.clamp_mutiply_forward( A, d_pos, d_neg, b_pos, b_neg, patches_mode, reduce_bias, same_slope) @staticmethod def backward(ctx, grad_output_A, grad_output_bias): A, d_pos, d_neg, b_pos, b_neg = ctx.saved_tensors assert ctx.reduce_bias return ClampedMultiplication.clamp_mutiply_backward( A, d_pos, d_neg, b_pos, b_neg, grad_output_A, grad_output_bias, ctx.same_slope) def multiply_by_A_signs(A, d_pos, d_neg, b_pos, b_neg, contiguous='auto', reduce_bias=True, same_slope=False): if isinstance(A, Tensor): if contiguous is True or contiguous == 'auto': # For dense mode, convert d_pos and d_neg to contiguous tensor by default. d_pos = d_pos.contiguous() d_neg = d_neg.contiguous() if d_pos.ndim == 1: # Special case for LSTM, the bias term is 1-dimension. (FIXME) assert d_neg.ndim == 1 and b_pos.ndim == 1 and b_neg.ndim == 1 new_A = A.clamp(min=0) * d_pos + A.clamp(max=0) * d_neg new_bias = A.clamp(min=0) * b_pos + A.clamp(max=0) * b_neg return new_A, new_bias return ClampedMultiplication.apply( A, d_pos, d_neg, b_pos, b_neg, False, reduce_bias, same_slope) elif isinstance(A, Patches): if contiguous: # For patches mode, do not convert d_pos and d_neg to contiguous tensor by default. d_pos = d_pos.contiguous() d_neg = d_neg.contiguous() assert A.identity == 0 # TODO: handle the A.identity = 1 case. Currently not used. patches = A.patches patches_shape = patches.shape # patches shape: [out_c, batch_size, out_h, out_w, in_c, H, W]. Here out_c is the spec dimension. # or (unstable_size, batch_size, in_c, H, W) when it is sparse. 
if len(patches_shape) == 6: patches = patches.view(*patches_shape[:2], -1, *patches_shape[-2:]) d_pos = d_pos.view(*patches_shape[:2], -1, *patches_shape[-2:]) if d_pos is not None else None d_neg = d_neg.view(*patches_shape[:2], -1, *patches_shape[-2:]) if d_neg is not None else None b_pos = b_pos.view(*patches_shape[:2], -1, *patches_shape[-2:]) if b_pos is not None else None b_neg = b_neg.view(*patches_shape[:2], -1, *patches_shape[-2:]) if b_neg is not None else None # Apply the multiplication based on signs. A_prod, bias = ClampedMultiplication.apply( patches, d_pos, d_neg, b_pos, b_neg, True, reduce_bias, same_slope) # prod has shape [out_c, batch_size, out_h, out_w, in_c, H, W] or (unstable_size, batch_size, in_c, H, W) when it is sparse. # For sparse patches the return bias size is (unstable_size, batch). # For regular patches the return bias size is (spec, batch, out_h, out_w). if len(patches_shape) == 6: A_prod = A_prod.view(*patches_shape) return A.create_similar(A_prod), bias ================================================ FILE: auto_LiRPA/operators/constant.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """ Constant operators, including operators that are usually fixed nodes and not perturbed """ from .base import * class BoundConstant(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.value = attr['value'].to(self.device) self.use_default_ibp = True self.no_jacobian = True def __repr__(self): if self.value.numel() == 1: return f'BoundConstant(name={self.name}, value={self.value})' else: return super().__repr__() def forward(self): return self.value.to(self.device) def bound_backward(self, last_lA, last_uA, **kwargs): def _bound_oneside(A): if A is None: return 0.0 if type(A) == Tensor: if A.ndim > 2: A = torch.sum(A, dim=list(range(2, A.ndim))) elif type(A) == Patches: assert A.padding == 0 or A.padding == (0, 0, 0, 0) or self.value == 0 # FIXME (09/19): adding padding here. 
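# Informal note (assuming a scalar constant): a constant node has no input to propagate A
# into, so A contributes only the bias A @ const = const * sum(A). The dense branch above
# sums A over every dimension beyond (spec, batch); the patches branch below does the same
# by summing each patch over its (in_c, H, W) window before scaling by the constant.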
patches_reshape = torch.sum(A.patches, dim=(-1, -2, -3)) * self.value.to(self.device) # Expected shape for bias is (spec, batch, out_h, out_w) or (unstable_size, batch) return patches_reshape return A * self.value.to(self.device) lbias = _bound_oneside(last_lA) ubias = _bound_oneside(last_uA) return [], lbias, ubias def bound_forward(self, dim_in): lw = uw = torch.zeros(dim_in, device=self.device) lb = ub = self.value return LinearBound(lw, lb, uw, ub) def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): self.solver_vars = self.value class BoundPrimConstant(Bound): def forward(self): return torch.tensor([], device=self.device) class BoundConstantOfShape(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.value = attr['value'].to(self.device) self.no_jacobian = True def forward(self, x): self.x = x self.from_input = True return self.value.expand(*list(x)) def bound_backward(self, last_lA, last_uA, x, **kwargs): if last_lA is not None: lower_sum_b = last_lA * self.value while lower_sum_b.ndim > 2: lower_sum_b = torch.sum(lower_sum_b, dim=-1) else: lower_sum_b = 0 if last_uA is not None: upper_sum_b = last_uA * self.value while upper_sum_b.ndim > 2: upper_sum_b = torch.sum(upper_sum_b, dim=-1) else: upper_sum_b = 0 return [(None, None)], lower_sum_b, upper_sum_b def bound_forward(self, dim_in, x): assert (len(self.x) >= 1) lb = ub = torch.ones(self.output_shape, device=self.device) * self.value lw = uw = torch.zeros(self.x[0], dim_in, *self.x[1:], device=self.device) return LinearBound(lw, lb, uw, ub) def interval_propagate(self, *v): self.x = v[0][0] value = torch.ones(tuple(v[0][0]), device=self.device) * self.value return value, value def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): self.solver_vars = self.forward(v) class BoundRange(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.device = attr['device'] def forward(self, start, end, step): if start.dtype == end.dtype == step.dtype == torch.int64: return torch.arange(start, end, step, dtype=torch.int64, device=self.device) else: return torch.arange(start, end, step, device=self.device) class BoundATenDiag(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.device = attr['device'] def forward(self, x, diagonal=0): return torch.diag(x, diagonal=diagonal) def interval_propagate(self, *v): return Interval.make_interval(torch.diag(v[0][0], v[1][0]), torch.diag(v[0][1], v[1][0]), v[0]) class BoundATenDiagonal(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.device = attr['device'] def forward(self, x, offset=0, dim1=0, dim2=1): return torch.diagonal(x, offset=offset, dim1=dim1, dim2=dim2) def interval_propagate(self, *v): params = (v[1][0], v[2][0], v[3][0]) return Interval.make_interval(torch.diagonal(v[0][0], *params), torch.diagonal(v[0][1], *params), v[0]) def bound_backward(self, last_lA, last_uA, *args, **kwargs): for i in range(1, 4): assert isinstance(self.inputs[i], BoundConstant) def _bound_oneside(last_A): if last_A is None: return None A = torch.zeros(*last_A.shape[:2], *self.inputs[0].output_shape[1:]).to(last_A) dim1, dim2 = self.inputs[2].value, self.inputs[3].value assert dim1 != 0 and dim2 != 0 if dim1 > 0: dim1 
+= 1 if dim2 > 0: dim2 += 1 A = torch.diagonal_scatter( A, last_A, offset=self.inputs[1].value, dim1=dim1, dim2=dim2) return A return ([(_bound_oneside(last_lA), _bound_oneside(last_uA))] + [(None, None)] * 3), 0, 0 ================================================ FILE: auto_LiRPA/operators/convex_concave.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """Nonlinear functions that are either convex or convave within the entire domain.""" import torch from torch.nn import Module from .base import * from .activation_base import BoundActivation, BoundOptimizableActivation class BoundLog(BoundActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.range_l = 1e-6 def forward(self, x): # NOTE adhoc implementation for loss fusion if self.loss_fusion: return torch.logsumexp(self.inputs[0].inputs[0].inputs[0].forward_value, dim=-1) return torch.log(x.clamp(min=epsilon)) def bound_relax(self, x, init=False): if init: self.init_linear_relaxation(x) rl, ru = self.forward(x.lower), self.forward(x.upper) ku = (ru - rl) / (x.upper - x.lower + epsilon) self.add_linear_relaxation(mask=None, type='lower', k=ku, x0=x.lower, y0=rl) m = (x.lower + x.upper) / 2 k = torch.reciprocal(m) rm = self.forward(m) self.add_linear_relaxation(mask=None, type='upper', k=k, x0=m, y0=rm) def interval_propagate(self, *v): # NOTE adhoc implementation for loss fusion if self.loss_fusion: par = self.inputs[0].inputs[0].inputs[0] lower = torch.logsumexp(par.lower, dim=-1) upper = torch.logsumexp(par.upper, dim=-1) return lower, upper return super().interval_propagate(*v) def bound_backward(self, last_lA, last_uA, x, **kwargs): A, lbias, ubias = super().bound_backward(last_lA, last_uA, x) # NOTE adhoc implementation for loss fusion if self.loss_fusion: assert A[0][0] is None exp_module = self.inputs[0].inputs[0] ubias = ubias + self.get_bias(A[0][1], exp_module.max_input.squeeze(-1)) return A, lbias, ubias class BoundSqrt(BoundOptimizableActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.use_prior_constraint = True self.has_constraint = True self.range_l = 1e-6 def forward(self, x): return torch.sqrt(x) def bound_relax(self, x, init=False, dim_opt=None): if init: self.init_linear_relaxation(x, dim_opt) if self.opt_stage in ['opt', 'reuse']: self.alpha[self._start].data[:2] = torch.min(torch.max( self.alpha[self._start].data[:2], x.lower), x.upper) mid = self.alpha[self._start] else: mid = (x.lower + x.upper) / 2 k = 0.5 / self.forward(mid) self.add_linear_relaxation(mask=None, type='upper', k=k, x0=mid) sqrt_l = self.forward(x.lower) sqrt_u = self.forward(x.upper) k = (sqrt_u - sqrt_l) / (x.upper - x.lower).clamp(min=1e-8) self.add_linear_relaxation(mask=None, type='lower', k=k, x0=x.lower) 
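# Relaxation recap (illustrative, not part of the original code): sqrt is concave on [l, u]
# with l > 0, so any tangent line is a valid upper bound and the chord is a valid lower bound.
# For example, with l = 1 and u = 4:
#   chord (lower):   k = (2 - 1) / (4 - 1) = 1/3,  y = 1 + (x - 1) / 3        -> y(2.25) = 1.4167 <= 1.5
#   tangent (upper): at mid = 2.5, k = 0.5 / sqrt(2.5) ~= 0.3162,
#                    y = sqrt(2.5) + k * (x - 2.5)                            -> y(2.25) ~= 1.5021 >= 1.5
# bound_relax() above places the tangent point either at the interval midpoint or at the
# optimizable alpha parameter when opt_stage is 'opt'/'reuse'.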
def bound_backward(self, last_lA, last_uA, x, **kwargs): if self.use_prior_constraint and self.check_constraint_available(x): if hasattr(x, 'cstr_interval'): del x.cstr_interval del x.cstr_lower del x.cstr_upper x_l, x_u = self._ibp_constraint(x, delete_bounds_after_use=True) x_u = torch.max(x_u, x_l + 1e-8) return super().bound_backward(last_lA, last_uA, x, **kwargs) def clamp_interim_bounds(self): self.cstr_lower = self.lower.clamp(min=0) self.cstr_upper = self.upper.clamp(min=0) self.cstr_interval = (self.cstr_lower, self.cstr_upper) def _init_opt_parameters_impl(self, size_spec, **kwargs): """Implementation of init_opt_parameters for each start_node.""" l, u = self.inputs[0].lower, self.inputs[0].upper alpha = torch.empty(2, size_spec, *l.shape, device=l.device) alpha.data[:2] = (l + u) / 2 return alpha class BoundReciprocal(BoundOptimizableActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.splittable = True self.range_l = 1e-6 def forward(self, x): return torch.reciprocal(x) def interval_propagate(self, *v): h_L = v[0][0].to(dtype=torch.get_default_dtype()) h_U = v[0][1].to(dtype=torch.get_default_dtype()) assert h_L.min() > 0, 'Only positive values are supported in BoundReciprocal' return torch.reciprocal(h_U), torch.reciprocal(h_L) def bound_relax(self, x, init=False, dim_opt=None): if init: self.init_linear_relaxation(x, dim_opt) assert x.lower.min() >= 0 ku = -1. / (x.lower * x.upper) self.add_linear_relaxation(mask=None, type='upper', k=ku, x0=x.lower) if self.opt_stage in ['opt', 'reuse']: self.alpha[self._start].data[:2] = torch.min(torch.max( self.alpha[self._start].data[:2], x.lower), x.upper) mid = self.alpha[self._start].clamp(min=0.01) else: mid = (x.lower + x.upper) / 2 self.add_linear_relaxation( mask=None, type='lower', k=-1./(mid**2), x0=mid) if x.lower.min() <= 0: mask = x.lower == 0 self.uw[..., mask] = 0 self.ub[..., mask] = torch.inf if x.upper.isinf().any(): mask = x.upper.isinf() self.lw[..., mask] = 0 self.lb[..., mask] = 0 def bound_backward(self, last_lA, last_uA, x, **kwargs): As, lbias, ubias = super().bound_backward(last_lA, last_uA, x, **kwargs) if isinstance(ubias, torch.Tensor) and ubias.isnan().any(): ubias[ubias.isnan()] = torch.inf if (last_uA != 0).any() else 0. if isinstance(lbias, torch.Tensor) and lbias.isnan().any(): lbias[lbias.isnan()] = 0. 
return As, lbias, ubias def _init_opt_parameters_impl(self, size_spec, **kwargs): """Implementation of init_opt_parameters for each start_node.""" l, u = self.inputs[0].lower, self.inputs[0].upper alpha = torch.empty(2, size_spec, *l.shape, device=l.device) alpha.data[:2] = (l + u) / 2 return alpha def build_gradient_node(self, grad_upstream): return [(ReciprocalGrad(), (grad_upstream, self.inputs[0].forward_value), [self.inputs[0]])] class ReciprocalGrad(Module): def __init__(self): super().__init__() def forward(self, g, x): # partial derivative of 1/x is -1/x^2 return -g / torch.square(x).unsqueeze(1) class BoundExp(BoundOptimizableActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) if options is None: options = {} self.options = options.get('exp', {}) self.max_input = 0 def forward(self, x): if self.loss_fusion and self.options != 'no-max-input': self.max_input = torch.max(x, dim=-1, keepdim=True)[0].detach() return torch.exp(x - self.max_input) return torch.exp(x) def interval_propagate(self, *v): assert len(v) == 1 # unary monotonous functions only h_L, h_U = v[0] if self.loss_fusion and self.options != 'no-max-input': self.max_input = torch.max(h_U, dim=-1, keepdim=True)[0] h_L, h_U = h_L - self.max_input, h_U - self.max_input else: self.max_input = 0 return torch.exp(h_L), torch.exp(h_U) def bound_forward(self, dim_in, x): m = torch.min((x.lower + x.upper) / 2, x.lower + 0.99) exp_l, exp_m, exp_u = torch.exp(x.lower), torch.exp(m), torch.exp(x.upper) kl = exp_m lw = x.lw * kl.unsqueeze(1) lb = kl * (x.lb - m + 1) ku = (exp_u - exp_l) / (x.upper - x.lower + epsilon) uw = x.uw * ku.unsqueeze(1) ub = x.ub * ku - ku * x.lower + exp_l return LinearBound(lw, lb, uw, ub) def bound_backward(self, last_lA, last_uA, x, **kwargs): # Special case when computing log_softmax (FIXME: find a better solution, this trigger condition is not reliable). if self.loss_fusion and last_lA is None and last_uA is not None and torch.min( last_uA) >= 0 and x.from_input: # Adding an extra bias term to the input. This is equivalent to adding a constant and subtract layer before exp. # Note that we also need to adjust the bias term at the end. if self.options == 'no-detach': self.max_input = torch.max(x.upper, dim=-1, keepdim=True)[0] elif self.options != 'no-max-input': self.max_input = torch.max(x.upper, dim=-1, keepdim=True)[0].detach() else: self.max_input = 0 adjusted_lower = x.lower - self.max_input adjusted_upper = x.upper - self.max_input # relaxation for upper bound only (used in loss fusion) exp_l, exp_u = torch.exp(adjusted_lower), torch.exp(adjusted_upper) k = (exp_u - exp_l) / (adjusted_upper - adjusted_lower).clamp(min=1e-8) if k.requires_grad: k = k.clamp(min=1e-8) uA = last_uA * k.unsqueeze(0) ubias = last_uA * (-adjusted_lower * k + exp_l).unsqueeze(0) if ubias.ndim > 2: ubias = torch.sum(ubias, dim=tuple(range(2, ubias.ndim))) # Also adjust the missing ubias term. 
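# Informal sketch of the adjustment below: the chord above upper-bounds exp(x - c) with
# c = max_input, i.e. exp(x - c) <= k * (x - c) + (exp_l - k * adjusted_lower). After
# multiplying by last_uA >= 0, the -k * c part becomes -(uA * c); because c is constant
# over the dimension being summed (it has size 1 there), uA can first be reduced to `A`
# and then A * c subtracted from ubias.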
if uA.ndim > self.max_input.ndim: A = torch.sum(uA, dim=tuple(range(self.max_input.ndim, uA.ndim))) else: A = uA # These should hold true in loss fusion assert self.batch_dim == 0 assert A.shape[0] == 1 batch_size = A.shape[1] ubias -= (A.reshape(batch_size, -1) * self.max_input.reshape(batch_size, -1)).sum(dim=-1).unsqueeze(0) return [(None, uA)], 0, ubias else: As, lbias, ubias = super().bound_backward(last_lA, last_uA, x, **kwargs) lA, uA = As[0] lA, lbias = self._check_nan(lA, lbias, last_lA, 0) uA, ubias = self._check_nan(uA, ubias, last_uA, torch.inf) return [(lA, uA)], lbias, ubias def _check_nan(self, A, bias, last_A, const_bound): """Check for NaN caused by 0 in A and inf in lw/lb/uw/ub. It can happen if the pre-activation bounds are very loose for exp. """ if A is None: return A, bias if bias.isnan().any(): # These assertions ensure that 0 is in A and inf is in lw/lb/uw/ub assert not last_A.isnan().any() assert not last_A.isinf().any() assert not self.lw.isnan().any() assert not self.uw.isnan().any() assert not self.lb.isnan().any() assert not self.ub.isnan().any() A_ = A.view(-1, *A.shape[2:]).clone() bias_ = bias.view(-1).clone() mask = bias_.isnan() A_[mask] = 0 assert (last_A >= 0).all() bias_[mask] = const_bound if (last_A != 0).any() else 0. A = A_.view(A.shape) bias = bias_.view(bias.shape) return A, bias def bound_relax(self, x, init=False, dim_opt=None): if init: self.init_linear_relaxation(x, dim_opt) min_val = -1e9 l, u = x.lower.clamp(min=min_val), x.upper.clamp(min=min_val) if self.opt_stage in ['opt', 'reuse']: self.alpha[self._start].data[:2] = torch.min(torch.max( self.alpha[self._start].data[:2], x.lower), x.upper) m = torch.min(self.alpha[self._start], x.lower + 0.99) else: m = torch.min((x.lower + x.upper) / 2, x.lower + 0.99) exp_l, exp_m, exp_u = torch.exp(x.lower), torch.exp(m), torch.exp(x.upper) k = exp_m self.add_linear_relaxation(mask=None, type='lower', k=k, x0=m, y0=exp_m) k = (exp_u - exp_l) / (u - l).clamp(min=1e-8) self.add_linear_relaxation(mask=None, type='upper', k=k, x0=l, y0=exp_l) def _init_opt_parameters_impl(self, size_spec, **kwargs): """Implementation of init_opt_parameters for each start_node.""" l, u = self.inputs[0].lower, self.inputs[0].upper alpha = torch.empty(2, size_spec, *l.shape, device=l.device) alpha.data[:2] = (l + u) / 2 return alpha def build_gradient_node(self, grad_upstream): if self.loss_fusion: raise NotImplementedError('Gradient computation for exp with loss fusion is not supported.') return [(ExpGrad(), (grad_upstream, self.inputs[0].forward_value), [self.inputs[0]])] class ExpGrad(Module): def __init__(self): super().__init__() def forward(self, g, preact): # exp'(x) = exp(x) return g * torch.exp(preact).unsqueeze(1) ================================================ FILE: auto_LiRPA/operators/convolution.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### """ Convolution and padding operators""" from torch.autograd import Function from torch.nn import Module from .base import * import numpy as np from .solver_utils import grb from ..patches import unify_shape, compute_patches_stride_padding, is_shape_used, create_valid_mask EPS = 1e-2 class BoundConv(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) if len(attr['kernel_shape']) == 1: # for 1d conv assert (attr['pads'][0] == attr['pads'][1]) self.padding = [attr['pads'][0]] self.F_conv = F.conv1d self.conv_dim = 1 else: # for 2d conv assert (attr['pads'][0] == attr['pads'][2]) assert (attr['pads'][1] == attr['pads'][3]) self.padding = [attr['pads'][0], attr['pads'][1]] self.F_conv = F.conv2d self.conv_dim = 2 self.stride = attr['strides'] self.dilation = attr['dilations'] self.groups = attr['group'] if len(inputs) == 3: self.has_bias = True else: self.has_bias = False self.patches_start = True if options is None: options = {} self.mode = options.get("conv_mode", "matrix") # denote whether this Conv is followed by a ReLU # if self.relu_followed is False, we need to manually pad the conv patches. # If self.relu_followed is True, the patches are padded in the ReLU layer # and the manual padding is not needed. self.relu_followed = False def forward(self, *x): # x[0]: input, x[1]: weight, x[2]: bias if self.has_bias bias = x[2] if self.has_bias else None output = self.F_conv(x[0], x[1], bias, self.stride, self.padding, self.dilation, self.groups) return output def bound_backward(self, last_lA, last_uA, *x, **kwargs): if self.is_input_perturbed(1): raise NotImplementedError( 'Weight perturbation for convolution layers has not been implemented.') lA_y = uA_y = lA_bias = uA_bias = None weight = x[1].lower def _bound_oneside(last_A): if last_A is None: return None, 0 if type(last_A) is OneHotC: # Conv layer does not support the OneHotC fast path. We have to create a dense matrix instead.
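# Background for the dense branch below (informal): propagating a linear bound through
# y = conv(x, W) needs last_A multiplied by the Jacobian dy/dx, which for a convolution is
# exactly a transposed convolution of last_A with the same weight, stride, padding and
# dilation; output_padding is supplied so that the reconstructed input size matches
# self.input_shape when (W - F + 2P) % S != 0.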
last_A = onehotc_to_dense(last_A, dtype=weight.dtype) if type(last_A) == Tensor: shape = last_A.size() # when (W−F+2P)%S != 0, construct the output_padding if self.conv_dim == 2: output_padding0 = ( int(self.input_shape[2]) - (int(self.output_shape[2]) - 1) * self.stride[0] + 2 * self.padding[0] - 1 - (int(weight.size()[2] - 1) * self.dilation[0])) output_padding1 = ( int(self.input_shape[3]) - (int(self.output_shape[3]) - 1) * self.stride[1] + 2 * self.padding[1] - 1 - (int(weight.size()[3] - 1) * self.dilation[1])) next_A = F.conv_transpose2d( last_A.reshape(shape[0] * shape[1], *shape[2:]), weight, None, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups, output_padding=(output_padding0, output_padding1)) else: # for 1d conv, we use conv_transpose1d() output_padding = ( int(self.input_shape[2]) - (int(self.output_shape[2]) - 1) * self.stride[0] + 2 * self.padding[0] - 1 - (int(weight.size()[2] - 1) * self.dilation[0])) next_A = F.conv_transpose1d( last_A.reshape(shape[0] * shape[1], *shape[2:]), weight, None, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups, output_padding=output_padding) next_A = next_A.view(shape[0], shape[1], *next_A.shape[1:]) if self.has_bias: # sum_bias = (last_A.sum((3, 4)) * x[2].lower).sum(2) sum_bias = torch.einsum('sbc...,c->sb', last_A, x[2].lower) else: sum_bias = 0 return next_A, sum_bias elif type(last_A) == Patches: # Here we build and propagate a Patch object with (patches, stride, padding) assert self.conv_dim == 2, 'Patches mode not supports conv1d so far.' assert type(last_A) == Patches if last_A.identity == 0: # FIXME (09/20): Don't call it relu_followed. Instead, make this a property of A, called "padded" and propagate this property. if not self.relu_followed: patches = last_A.create_padding(self.output_shape) else: patches = last_A.patches if self.has_bias: # bias is x[2] (lower and upper are the same), and has shape (c,). # Patches either has [out_c, batch, out_h, out_w, c, h, w] or [unstable_size, batch, c, h, w]. sum_bias = torch.einsum('sb...chw,c->sb...', patches, x[2].lower) # sum_bias has shape (out_c, batch, out_h, out_w) or (unstable_size, batch). else: sum_bias = 0 flattened_patches = patches.reshape( -1, patches.size(-3), patches.size(-2), patches.size(-1)) pieces = F.conv_transpose2d( flattened_patches, insert_zeros(weight, last_A.inserted_zeros) , stride=self.stride) # New patch size: (out_c, batch, out_h, out_w, c, h, w) or (unstable_size, batch, c, h, w). pieces = pieces.view( *patches.shape[:-3], pieces.size(-3), pieces.size(-2), pieces.size(-1)) elif last_A.identity == 1: # New patches have size [out_c, batch, out_h, out_w, c, h, w] if it is not sparse. # New patches have size [unstable_size, batch, c, h, w] if it is sparse. if last_A.unstable_idx is not None: pieces = weight.view( weight.size(0), 1, weight.size(1), weight.size(2), weight.size(3)) # Select based on the output channel (out_h and out_w are irrelevant here). pieces = pieces[last_A.unstable_idx[0]] # Expand the batch dimnension. pieces = pieces.expand(-1, last_A.shape[1], -1, -1, -1) # Do the same for the bias. if self.has_bias: sum_bias = x[2].lower[last_A.unstable_idx[0]].unsqueeze(-1) # bias has shape (unstable_size, batch). 
sum_bias = sum_bias.expand(-1, last_A.shape[1]) else: sum_bias = 0 else: assert weight.size(0) == last_A.shape[0] pieces = weight.view( weight.size(0), 1, 1, 1, weight.size(1), weight.size(2), weight.size(3)).expand(-1, *last_A.shape[1:4], -1, -1, -1) # The bias (x[2].lower) has shape (out_c,) need to make it (out_c, batch, out_h, out_w). # Here we should transpose sum_bias to set the batch dim to 1, aiming to keep it consistent with the matrix version if self.has_bias: sum_bias = x[2].lower.view(-1, 1, 1, 1).expand(-1, *last_A.shape[1:4]) else: sum_bias = 0 else: raise NotImplementedError() padding = last_A.padding if last_A is not None else (0, 0, 0, 0) # (left, right, top, bottom) stride = last_A.stride if last_A is not None else (1, 1) inserted_zeros = last_A.inserted_zeros if last_A is not None else 0 output_padding = last_A.output_padding if last_A is not None else (0, 0, 0, 0) padding, stride, output_padding = compute_patches_stride_padding( self.input_shape, padding, stride, self.padding, self.stride, inserted_zeros, output_padding) if (inserted_zeros == 0 and not is_shape_used(output_padding) and pieces.shape[-1] > self.input_shape[-1]): # the patches is too large and from now on, we will use matrix mode instead of patches mode. # This is our desired matrix: the input will be flattend to (batch_size, input_channel*input_x * input_y) and multiplies on this matrix. # After multiplication, the desired output is (batch_size, out_channel*output_x*output_y). # A_matrix has size (batch, out_c*out_h*out_w, in_c*in_h*in_w) A_matrix = patches_to_matrix( pieces, self.input_shape[1:], stride, padding, last_A.output_shape, last_A.unstable_idx) # print(f'Converting patches to matrix: old shape {pieces.shape}, size {pieces.numel()}; new shape {A_matrix.shape}, size {A_matrix.numel()}') if isinstance(sum_bias, Tensor) and last_A.unstable_idx is None: sum_bias = sum_bias.transpose(0, 1) sum_bias = sum_bias.reshape(sum_bias.size(0), -1).transpose(0,1) A_matrix = A_matrix.transpose(0,1) # Spec dimension at the front. 
return A_matrix, sum_bias new_patches = last_A.create_similar( pieces, stride=stride, padding=padding, output_padding=output_padding, identity=0, input_shape=self.input_shape) # if last_A is last_lA: # print(f'Conv : start_node {kwargs["start_node"].name} layer {self.name} {new_patches}') return new_patches, sum_bias else: raise NotImplementedError() lA_x, lbias = _bound_oneside(last_lA) uA_x, ubias = _bound_oneside(last_uA) return [(lA_x, uA_x), (lA_y, uA_y), (lA_bias, uA_bias)], lbias, ubias def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): if self.is_input_perturbed(1): raise NotImplementedError("Weight perturbation for convolution layers has not been implmented.") assert self.dilation == (1, 1) or self.dilation == [1, 1] # e.g., last layer input gurobi vars (3,32,32) gvars_array = np.array(v[0]) # pre_layer_shape (1,3,32,32) pre_layer_shape = np.expand_dims(gvars_array, axis=0).shape # this layer shape (1,8,16,16) this_layer_shape = self.output_shape out_lbs, out_ubs = None, None if self.is_lower_bound_current(): # self.lower shape (1,8,16,16) out_lbs = self.lower.detach().cpu().numpy() out_ubs = self.upper.detach().cpu().numpy() # current layer weight (8,3,4,4) this_layer_weight = v[1].detach().cpu().numpy() # current layer bias (8,) this_layer_bias = None if self.has_bias: this_layer_bias = v[2].detach().cpu().numpy() weight_shape2, weight_shape3 = this_layer_weight.shape[2], this_layer_weight.shape[3] padding0, padding1 = self.padding[0], self.padding[1] stride0, stride1 = self.stride[0], self.stride[1] new_layer_gurobi_vars = [] new_layer_gurobi_constrs = [] # precompute row and column index mappings # compute row mapping: from current row to input rows # vectorization of following code: # for out_row_idx in range(this_layer_shape[2]): # ker_row_min, ker_row_max = 0, weight_shape2 # in_row_idx_min = -padding0 + stride0 * out_row_idx # in_row_idx_max = in_row_idx_min + weight_shape2 - 1 # if in_row_idx_min < 0: # ker_row_min = -in_row_idx_min # if in_row_idx_max >= pre_layer_shape[2]: # ker_row_max = ker_row_max - in_row_idx_max + pre_layer_shape[2] - 1 # in_row_idx_min, in_row_idx_max = max(in_row_idx_min, 0), min(in_row_idx_max, # pre_layer_shape[2] - 1) in_row_idx_mins = np.arange(this_layer_shape[2]) * stride0 - padding0 in_row_idx_maxs = in_row_idx_mins + weight_shape2 - 1 ker_row_mins = np.zeros(this_layer_shape[2], dtype=int) ker_row_maxs = np.ones(this_layer_shape[2], dtype=int) * weight_shape2 ker_row_mins[in_row_idx_mins < 0] = -in_row_idx_mins[in_row_idx_mins < 0] ker_row_maxs[in_row_idx_maxs >= pre_layer_shape[2]] = \ ker_row_maxs[in_row_idx_maxs >= pre_layer_shape[2]] - in_row_idx_maxs[in_row_idx_maxs >= pre_layer_shape[2]]\ + pre_layer_shape[2] - 1 in_row_idx_mins = np.maximum(in_row_idx_mins, 0) in_row_idx_maxs = np.minimum(in_row_idx_maxs, pre_layer_shape[2] - 1) # compute column mapping: from current column to input columns # vectorization of following code: # for out_col_idx in range(this_layer_shape[3]): # ker_col_min, ker_col_max = 0, weight_shape3 # in_col_idx_min = -padding1 + stride1 * out_col_idx # in_col_idx_max = in_col_idx_min + weight_shape3 - 1 # if in_col_idx_min < 0: # ker_col_min = -in_col_idx_min # if in_col_idx_max >= pre_layer_shape[3]: # ker_col_max = ker_col_max - in_col_idx_max + pre_layer_shape[3] - 1 # in_col_idx_min, in_col_idx_max = max(in_col_idx_min, 0), min(in_col_idx_max, # pre_layer_shape[3] - 1) in_col_idx_mins = np.arange(this_layer_shape[3]) * stride1 - padding1 in_col_idx_maxs = in_col_idx_mins + 
weight_shape3 - 1 ker_col_mins = np.zeros(this_layer_shape[3], dtype=int) ker_col_maxs = np.ones(this_layer_shape[3], dtype=int) * weight_shape3 ker_col_mins[in_col_idx_mins < 0] = -in_col_idx_mins[in_col_idx_mins < 0] ker_col_maxs[in_col_idx_maxs >= pre_layer_shape[3]] = \ ker_col_maxs[in_col_idx_maxs >= pre_layer_shape[3]] - in_col_idx_maxs[in_col_idx_maxs >= pre_layer_shape[3]]\ + pre_layer_shape[3] - 1 in_col_idx_mins = np.maximum(in_col_idx_mins, 0) in_col_idx_maxs = np.minimum(in_col_idx_maxs, pre_layer_shape[3] - 1) neuron_idx = 0 for out_chan_idx in range(this_layer_shape[1]): out_chan_vars = [] for out_row_idx in range(this_layer_shape[2]): out_row_vars = [] # get row index range from precomputed arrays ker_row_min, ker_row_max = ker_row_mins[out_row_idx], ker_row_maxs[out_row_idx] in_row_idx_min, in_row_idx_max = in_row_idx_mins[out_row_idx], in_row_idx_maxs[out_row_idx] for out_col_idx in range(this_layer_shape[3]): # get col index range from precomputed arrays ker_col_min, ker_col_max = ker_col_mins[out_col_idx], ker_col_maxs[out_col_idx] in_col_idx_min, in_col_idx_max = in_col_idx_mins[out_col_idx], in_col_idx_maxs[out_col_idx] # init linear expression lin_expr = this_layer_bias[out_chan_idx] if self.has_bias else 0 # init linear constraint LHS implied by the conv operation for in_chan_idx in range(this_layer_weight.shape[1]): coeffs = this_layer_weight[out_chan_idx, in_chan_idx, ker_row_min:ker_row_max, ker_col_min:ker_col_max].reshape(-1) gvars = gvars_array[in_chan_idx, in_row_idx_min:in_row_idx_max+1, in_col_idx_min:in_col_idx_max+1].reshape(-1) if solver_pkg == 'gurobi': lin_expr += grb.LinExpr(coeffs, gvars) else: for i in range(len(coeffs)): try: lin_expr += coeffs[i] * gvars[i] except TypeError: lin_expr += coeffs[i] * gvars[i].var # init potential lb and ub, which helps solver to finish faster out_lb = out_lbs[0, out_chan_idx, out_row_idx, out_col_idx] if out_lbs is not None else -float('inf') out_ub = out_ubs[0, out_chan_idx, out_row_idx, out_col_idx] if out_ubs is not None else float('inf') if out_ub - out_lb < EPS: # If the inferred lb and ub are too close, it could lead to floating point disagreement # between solver's inferred lb and ub constraints and the computed ones from ab-crown. # Such disagreement can lead to "infeasible" result from the solver for feasible problem. # To avoid so, we relax the box constraints. # This should not affect the solver's result correctness, # since the tighter lb and ub can be inferred by the solver. out_lb, out_ub = (out_lb + out_ub - EPS) / 2., (out_lb + out_ub + EPS) / 2. 
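# Numeric illustration (hypothetical values): with EPS = 1e-2, an inferred box out_lb = 0.3001,
# out_ub = 0.3002 (gap < EPS) is widened to (0.3001 + 0.3002 - 0.01) / 2 = 0.29515 and
# (0.3001 + 0.3002 + 0.01) / 2 = 0.30515: the midpoint is preserved and the box gets width EPS,
# which avoids spurious "infeasible" results caused by floating point disagreement between
# these bounds and the solver's own inference.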
# add the output var and constraint var = model.addVar(lb=out_lb, ub=out_ub, obj=0, vtype=grb.GRB.CONTINUOUS, name=f'lay{self.name}_{neuron_idx}') model.addConstr(lin_expr == var, name=f'lay{self.name}_{neuron_idx}_eq') neuron_idx += 1 out_row_vars.append(var) out_chan_vars.append(out_row_vars) new_layer_gurobi_vars.append(out_chan_vars) self.solver_vars = new_layer_gurobi_vars model.update() def interval_propagate(self, *v, C=None): if self.is_input_perturbed(1): raise NotImplementedError("Weight perturbation for convolution layers has not been implmented.") norm = Interval.get_perturbation(v[0]) norm = norm[0] h_L, h_U = v[0] weight = v[1][0] bias = v[2][0] if self.has_bias else None if norm == torch.inf: mid = (h_U + h_L) / 2.0 diff = (h_U - h_L) / 2.0 weight_abs = weight.abs() deviation = self.F_conv(diff, weight_abs, None, self.stride, self.padding, self.dilation, self.groups) elif norm > 0: norm, eps = Interval.get_perturbation(v[0]) # L2 norm, h_U and h_L are the same. mid = h_U # TODO: padding assert not isinstance(eps, torch.Tensor) or eps.numel() == 1 deviation = torch.mul(weight, weight).sum((1, 2, 3)).sqrt() * eps deviation = deviation.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) else: # Here we calculate the L0 norm IBP bound using the bound proposed in [Certified Defenses for Adversarial Patches, ICLR 2020] norm, eps, ratio = Interval.get_perturbation(v[0]) mid = h_U k = int(eps) weight_sum = torch.sum(weight.abs(), 1) deviation = torch.sum(torch.topk(weight_sum.view(weight_sum.shape[0], -1), k)[0], dim=1) * ratio if self.has_bias: center = self.F_conv(mid, weight, v[2][0], self.stride, self.padding, self.dilation, self.groups) else: center = self.F_conv(mid, weight, None, self.stride, self.padding, self.dilation, self.groups) ss = center.shape deviation = deviation.repeat(ss[2] * ss[3]).view(-1, ss[1]).t().view(ss[1], ss[2], ss[3]) center = self.F_conv(mid, weight, bias, self.stride, self.padding, self.dilation, self.groups) upper = center + deviation lower = center - deviation return lower, upper def bound_dynamic_forward(self, *x, max_dim=None, offset=0): if self.is_input_perturbed(1) or self.is_input_perturbed(2): raise NotImplementedError("Weight perturbation for convolution layers has not been implmented.") weight = x[1].lb bias = x[2].lb if self.has_bias else None x = x[0] w = x.lw b = x.lb shape = w.shape shape_wconv = [shape[0] * shape[1]] + list(shape[2:]) def conv2d(input, weight, bias, stride, padding, dilation, groups): """ There may be some CUDA error (illegal memory access) when the batch size is too large. Thus split the input into several batches when needed. 
""" max_batch_size = 50 if input.device != torch.device('cpu') and input.shape[0] > max_batch_size: ret = [] for i in range((input.shape[0] + max_batch_size - 1) // max_batch_size): ret.append(self.F_conv( input[i*max_batch_size:(i+1)*max_batch_size], weight, bias, stride, padding, dilation, groups)) return torch.cat(ret, dim=0) else: return self.F_conv(input, weight, bias, stride, padding, dilation, groups) w_new = conv2d( w.reshape(shape_wconv), weight, None, self.stride, self.padding, self.dilation, self.groups) w_new = w_new.reshape(shape[0], -1, *w_new.shape[1:]) b_new = conv2d( b, weight, bias, self.stride, self.padding, self.dilation, self.groups) return LinearBound(w_new, b_new, w_new, b_new, x_L=x.x_L, x_U=x.x_U, tot_dim=x.tot_dim) def bound_forward(self, dim_in, *x): if self.is_input_perturbed(1) or self.is_input_perturbed(2): raise NotImplementedError("Weight perturbation for convolution layers has not been implmented.") weight = x[1].lb bias = x[2].lb if self.has_bias else None x = x[0] mid_w = (x.lw + x.uw) / 2 mid_b = (x.lb + x.ub) / 2 diff_w = (x.uw - x.lw) / 2 diff_b = (x.ub - x.lb) / 2 weight_abs = weight.abs() shape = mid_w.shape shape_wconv = [shape[0] * shape[1]] + list(shape[2:]) deviation_w = self.F_conv( diff_w.reshape(shape_wconv), weight_abs, None, self.stride, self.padding, self.dilation, self.groups) deviation_b = self.F_conv( diff_b, weight_abs, None, self.stride, self.padding, self.dilation, self.groups) center_w = self.F_conv( mid_w.reshape(shape_wconv), weight, None, self.stride, self.padding, self.dilation, self.groups) center_b = self.F_conv( mid_b, weight, bias, self.stride, self.padding, self.dilation, self.groups) deviation_w = deviation_w.reshape(shape[0], -1, *deviation_w.shape[1:]) center_w = center_w.reshape(shape[0], -1, *center_w.shape[1:]) return LinearBound( lw = center_w - deviation_w, lb = center_b - deviation_b, uw = center_w + deviation_w, ub = center_b + deviation_b) def build_gradient_node(self, grad_upstream): node_grad = Conv2dGrad( self, self.inputs[1].param, self.stride, self.padding, self.dilation, self.groups) return [(node_grad, (grad_upstream,), [])] def update_requires_input_bounds(self): self._check_weight_perturbation() class BoundConvTranspose(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) assert (attr['pads'][0] == attr['pads'][2]) assert (attr['pads'][1] == attr['pads'][3]) self.stride = attr['strides'] self.padding = [attr['pads'][0], attr['pads'][1]] self.dilation = attr['dilations'] self.groups = attr['group'] self.output_padding = [attr.get('output_padding', [0, 0])[0], attr.get('output_padding', [0, 0])[1]] assert len(attr['kernel_shape']) == 2 # 2d transposed convolution. 
if len(inputs) == 3: self.has_bias = True else: self.has_bias = False self.mode = options.get("conv_mode", "matrix") assert self.output_padding == [0, 0] assert self.dilation == [1, 1] assert self.stride[0] == self.stride[1] assert self.groups == 1 self.F_convtranspose = F.conv_transpose2d def forward(self, *x): # x[0]: input, x[1]: weight, x[2]: bias if self.has_bias bias = x[2] if self.has_bias else None output = F.conv_transpose2d(x[0], x[1], bias, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups, output_padding=self.output_padding) return output def bound_backward(self, last_lA, last_uA, *x, **kwargs): if self.is_input_perturbed(1): raise NotImplementedError("Weight perturbation for convolution layers has not been implmented.") lA_y = uA_y = lA_bias = uA_bias = None weight = x[1].lower assert weight.size(-1) == weight.size(-2) def _bound_oneside(last_A): if last_A is None: return None, 0 if type(last_A) is OneHotC: # Conv layer does not support the OneHotC fast path. We have to create a dense matrix instead. last_A = onehotc_to_dense(last_A, dtype=weight.dtype) if type(last_A) == Tensor: shape = last_A.size() next_A = F.conv2d(last_A.reshape(shape[0] * shape[1], *shape[2:]), weight, None, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups) next_A = next_A.view(shape[0], shape[1], *next_A.shape[1:]) if self.has_bias: sum_bias = (last_A.sum((3, 4)) * x[2].lower).sum(2) else: sum_bias = 0 return next_A, sum_bias elif type(last_A) == Patches: # Here we build and propagate a Patch object with (patches, stride, padding) assert type(last_A) == Patches if last_A.identity == 0: patches = last_A.patches # FIXME: so far, assume there will be a relu layer in its input. if self.has_bias: # bias is x[2] (lower and upper are the same), and has shape (c,). # Patches either has [out_c, batch, out_h, out_w, c, h, w] or [unstable_size, batch, c, h, w]. sum_bias = torch.einsum('sb...chw,c->sb...', patches, x[2].lower) # sum_bias has shape (out_c, batch, out_h, out_w) or (unstable_size, batch). else: sum_bias = 0 flattened_patches = patches.reshape(-1, patches.size(-3), patches.size(-2), patches.size(-1)) # Merge patches with this layer's weights. Weight must be flipped here; and if stride != 1, we must insert zeros in the input image. # For conv_transpose2d, the weight matrix is in the (in, out, k, k) shape. # pieces = F.conv_transpose2d(flattened_patches, weight.transpose(0,1).flip(-1,-2), stride=self.stride) # pieces = F.conv_transpose2d(flattened_patches, weight.transpose(0,1).flip(-1,-2), stride=last_A.inserted_zeros + 1) # Use padding in conv_transposed2d directly. pieces = F.conv_transpose2d( # Transpose because the weight has in_channel before out_channel. flattened_patches, insert_zeros(weight.transpose(0,1).flip(-1,-2), last_A.inserted_zeros)) # New patch size: (out_c, batch, out_h, out_w, c, h, w) or (unstable_size, batch, c, h, w). pieces = pieces.view(*patches.shape[:-3], pieces.size(-3), pieces.size(-2), pieces.size(-1)) elif last_A.identity == 1: # New patches have size [out_c, batch, out_h, out_w, c, h, w] if it is not sparse. # New patches have size [unstable_size, batch, c, h, w] if it is sparse. 
if last_A.unstable_idx is not None: raise NotImplementedError() else: assert weight.size(0) == last_A.shape[0] pieces = weight.view(weight.size(0), 1, 1, 1, weight.size(1), weight.size(2), weight.size(3)).expand(-1, *last_A.shape[1:4], -1, -1, -1) # The bias (x[2].lower) has shape (out_c,); we need to make it (out_c, batch, out_h, out_w). # Here we should transpose sum_bias to set the batch dim to 1, aiming to keep it consistent with the matrix version sum_bias = x[2].lower.view(-1, 1, 1, 1).expand(-1, *last_A.shape[1:4]) else: raise NotImplementedError() patches_padding = last_A.padding if last_A is not None else (0, 0, 0, 0) # (left, right, top, bottom) output_padding = last_A.output_padding if last_A is not None else (0, 0, 0, 0) # (left, right, top, bottom) inserted_zeros = last_A.inserted_zeros assert self.stride[0] == self.stride[1] # Unify the shape to 4-tuple. output_padding = unify_shape(output_padding) patches_padding = unify_shape(patches_padding) this_stride = unify_shape(self.stride) this_padding = unify_shape(self.padding) # Compute new padding. Due to the shape flip during merging, we need to use the kernel size on dimension 3 - j//2. # TODO: testing for asymmetric shapes. padding = tuple(p * (inserted_zeros + 1) + (weight.size(3 - j//2) - 1) for j, p in enumerate(patches_padding)) # Compute new output padding output_padding = tuple(p * (inserted_zeros + 1) + this_padding[j] for j, p in enumerate(output_padding)) # When we run insert_zeros, it's missing the rightmost column and the bottom row. # padding = (padding[0], padding[1] + inserted_zeros, padding[2], padding[3] + inserted_zeros) # If no transposed conv so far, inserted_zeros is 0. # When a transposed conv is encountered, the stride is multiplied into it. inserted_zeros = (inserted_zeros + 1) * this_stride[0] - 1 # FIXME: disabled patches_to_matrix because not all parameters are supported. if inserted_zeros == 0 and not is_shape_used(output_padding) and pieces.shape[-1] > self.input_shape[-1]: # the patches are too large and from now on, we will use matrix mode instead of patches mode. # This is our desired matrix: the input will be flattened to (batch_size, input_channel*input_x * input_y) and multiplied by this matrix. # After multiplication, the desired output is (batch_size, out_channel*output_x*output_y). # A_matrix has size (batch, out_c*out_h*out_w, in_c*in_h*in_w) assert inserted_zeros == 0 A_matrix = patches_to_matrix(pieces, self.input_shape[1:], last_A.stride, padding, last_A.output_shape, last_A.unstable_idx) if isinstance(sum_bias, Tensor) and last_A.unstable_idx is None: sum_bias = sum_bias.transpose(0, 1) sum_bias = sum_bias.reshape(sum_bias.size(0), -1).transpose(0,1) A_matrix = A_matrix.transpose(0,1) # Spec dimension at the front.
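# Small numeric sketch (hypothetical values, square kernel assumed) of the padding /
# inserted_zeros bookkeeping above when a Patches object is pushed backward through
# this transposed convolution: an existing padding entry p becomes
# p * (inserted_zeros + 1) + (k - 1), and the number of zeros inserted between
# pixels becomes (inserted_zeros + 1) * stride - 1.
k, stride, inserted_zeros_in = 4, 2, 0
patches_padding_in = (1, 1, 1, 1)                       # (left, right, top, bottom)
new_padding = tuple(p * (inserted_zeros_in + 1) + (k - 1) for p in patches_padding_in)
new_inserted_zeros = (inserted_zeros_in + 1) * stride - 1
assert new_padding == (4, 4, 4, 4) and new_inserted_zeros == 1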
return A_matrix, sum_bias new_patches = last_A.create_similar( pieces, padding=padding, inserted_zeros=inserted_zeros, output_padding=output_padding, input_shape=self.input_shape) return new_patches, sum_bias else: raise NotImplementedError() lA_x, lbias = _bound_oneside(last_lA) uA_x, ubias = _bound_oneside(last_uA) return [(lA_x, uA_x), (lA_y, uA_y), (lA_bias, uA_bias)], lbias, ubias def interval_propagate(self, *v, C=None): if self.is_input_perturbed(1): raise NotImplementedError("Weight perturbation for convolution layers has not been implmented.") norm = Interval.get_perturbation(v[0]) norm = norm[0] h_L, h_U = v[0] weight = v[1][0] bias = v[2][0] if self.has_bias else None if norm == torch.inf: mid = (h_U + h_L) / 2.0 diff = (h_U - h_L) / 2.0 weight_abs = weight.abs() deviation = F.conv_transpose2d(diff, weight_abs, None, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups, output_padding=self.output_padding) elif norm > 0: raise NotImplementedError() norm, eps = Interval.get_perturbation(v[0]) # L2 norm, h_U and h_L are the same. mid = h_U # TODO: padding deviation = torch.mul(weight, weight).sum((1, 2, 3)).sqrt() * eps deviation = deviation.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) else: # Here we calculate the L0 norm IBP bound using the bound proposed in [Certified Defenses for Adversarial Patches, ICLR 2020] raise NotImplementedError() center = F.conv_transpose2d(mid, weight, bias, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups, output_padding=self.output_padding) upper = center + deviation lower = center - deviation return lower, upper def bound_forward(self, dim_in, *x): if self.is_input_perturbed(1) or self.is_input_perturbed(2): raise NotImplementedError("Weight perturbation for convolution layers has not been implmented.") weight = x[1].lb bias = x[2].lb if self.has_bias else None x = x[0] mid_w = (x.lw + x.uw) / 2 mid_b = (x.lb + x.ub) / 2 diff_w = (x.uw - x.lw) / 2 diff_b = (x.ub - x.lb) / 2 weight_abs = weight.abs() shape = mid_w.shape shape_wconv = [shape[0] * shape[1]] + list(shape[2:]) deviation_w = self.F_convtranspose( diff_w.reshape(shape_wconv), weight_abs, None, output_padding=self.output_padding, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups) deviation_b = self.F_convtranspose( diff_b, weight_abs, None, output_padding=self.output_padding, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups) center_w = self.F_convtranspose( mid_w.reshape(shape_wconv), weight, output_padding=self.output_padding, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups) center_b = self.F_convtranspose( mid_b, weight, bias, output_padding=self.output_padding, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups) deviation_w = deviation_w.reshape(shape[0], -1, *deviation_w.shape[1:]) center_w = center_w.reshape(shape[0], -1, *center_w.shape[1:]) return LinearBound( lw = center_w - deviation_w, lb = center_b - deviation_b, uw = center_w + deviation_w, ub = center_b + deviation_b) class BoundPad(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) if hasattr(attr, 'pads'): self.padding = attr['pads'][2:4] + attr['pads'][6:8] else: self.padding = [0, 0, 0, 0] self.value = attr.get('value', 0.0) assert self.padding == [0, 0, 0, 0] def forward(self, x, pad, value=0.0): # TODO: padding for 3-D or more 
dimensional inputs. assert x.ndim == 4 # x[1] should be [0,0,pad_top,pad_left,0,0,pad_bottom,pad_right] assert pad[0] == pad[1] == pad[4] == pad[5] == 0 pad = [int(pad[3]), int(pad[7]), int(pad[2]), int(pad[6])] final = F.pad(x, pad, value=value) self.padding, self.value = pad, value return final def interval_propagate(self, *v): l, u = zip(*v) return Interval.make_interval(self.forward(*l), self.forward(*u), v[0]) def bound_backward(self, last_lA, last_uA, *x, **kwargs): # TODO: padding for 3-D or more dimensional inputs. left, right, top, bottom = self.padding def _bound_oneside(last_A): if last_A is None: return None assert type(last_A) is Patches or last_A.ndim == 5 if type(last_A) is Patches: if isinstance(last_A.padding, tuple): new_padding = (last_A.padding[0] + left, last_A.padding[1] + right, last_A.padding[2] + top, last_A.padding[3] + bottom) else: new_padding = (last_A.padding + left, last_A.padding + right, last_A.padding + top, last_A.padding + bottom) return last_A.create_similar(padding=new_padding) else: shape = last_A.size() return last_A[:, :, :, top:(shape[3] - bottom), left:(shape[4] - right)] last_lA = _bound_oneside(last_lA) last_uA = _bound_oneside(last_uA) return [(last_lA, last_uA), (None, None), (None, None)], 0, 0 def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): # e.g., last layer input gurobi vars (3,32,32) gvars_array = np.array(v[0]) # pre_layer_shape (1,3,32,32) pre_layer_shape = np.expand_dims(gvars_array, axis=0).shape # this layer shape (1,3,35,35) this_layer_shape = self.output_shape # v1 = tensor([0, 0, 1, 1, 0, 0, 2, 2]) # [0,0,pad_top,pad_left,0,0,pad_bottom,pad_right] # => [left, right, top, bottom] padding = [int(v[1][3]), int(v[1][7]), int(v[1][2]), int(v[1][6])] left, right, top, bottom = padding assert pre_layer_shape[2] + padding[0] + padding[1] == this_layer_shape[2] assert pre_layer_shape[3] + padding[2] + padding[3] == this_layer_shape[3] new_layer_gurobi_vars = [] neuron_idx = 0 for out_chan_idx in range(this_layer_shape[1]): out_chan_vars = [] for out_row_idx in range(this_layer_shape[2]): out_row_vars = [] row_pad = out_row_idx < left or out_row_idx >= this_layer_shape[2] - right for out_col_idx in range(this_layer_shape[3]): col_pad = out_col_idx < top or out_col_idx >= this_layer_shape[3] - bottom if row_pad or col_pad: v = model.addVar(lb=0, ub=0, obj=0, vtype=grb.GRB.CONTINUOUS, name=f'pad{self.name}_{neuron_idx}') else: v = gvars_array[out_chan_idx, out_row_idx - left, out_col_idx - top] # print(out_chan_idx, out_row_idx, out_col_idx, row_pad, col_pad, v.LB, v.UB) neuron_idx += 1 out_row_vars.append(v) out_chan_vars.append(out_row_vars) new_layer_gurobi_vars.append(out_chan_vars) self.solver_vars = new_layer_gurobi_vars model.update() class Conv2dGrad(Module): def __init__(self, fw_module, weight, stride, padding, dilation, groups): super().__init__() self.weight = weight self.dilation = dilation self.groups = groups self.fw_module = fw_module assert isinstance(stride, list) and stride[0] == stride[1] assert isinstance(padding, list) and padding[0] == padding[1] assert isinstance(dilation, list) and dilation[0] == dilation[1] self.stride = stride[0] self.padding = padding[0] self.dilation = dilation[0] def forward(self, grad_last): output_padding0 = ( int(self.fw_module.input_shape[2]) - (int(self.fw_module.output_shape[2]) - 1) * self.stride + 2 * self.padding - 1 - (int(self.weight.size()[2] - 1) * self.dilation)) output_padding1 = ( int(self.fw_module.input_shape[3]) - 
(int(self.fw_module.output_shape[3]) - 1) * self.stride + 2 * self.padding - 1 - (int(self.weight.size()[3] - 1) * self.dilation)) return Conv2dGradOp.apply( grad_last, self.weight, self.stride, self.padding, self.dilation, self.groups, output_padding0, output_padding1) class Conv2dGradOp(Function): @staticmethod def symbolic(g, x, w, stride, padding, dilation, groups, output_padding0, output_padding1): return g.op( 'grad::Conv2d', x, w, stride_i=stride, padding_i=padding, dilation_i=dilation, groups_i=groups, output_padding0_i=output_padding0, output_padding1_i=output_padding1).setType(x.type()) @staticmethod def forward( ctx, grad_last, w, stride, padding, dilation, groups, output_padding0, output_padding1): grad_shape = grad_last.shape grad = F.conv_transpose2d( grad_last.view(grad_shape[0], *grad_shape[1:]), w, None, stride=stride, padding=padding, dilation=dilation, groups=groups, output_padding=(output_padding0, output_padding1)) grad = grad.view((grad_shape[0], *grad.shape[1:])) return grad class BoundConv2dGrad(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.stride = attr['stride'] self.padding = attr['padding'] self.dilation = attr['dilation'] self.groups = attr['groups'] self.output_padding = [ attr.get('output_padding0', 0), attr.get('output_padding1', 0) ] self.has_bias = len(inputs) == 3 self.mode = options.get('conv_mode', 'matrix') self.patches_start = True def forward(self, *x): # x[0]: input, x[1]: weight, x[2]: bias if self.has_bias return F.conv_transpose2d( x[0], x[1], None, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups, output_padding=self.output_padding) def bound_backward(self, last_lA, last_uA, *x, **kwargs): assert not self.is_input_perturbed(1) lA_y = uA_y = lA_bias = uA_bias = None weight = x[1].lower def _bound_oneside(last_A): if last_A is None: return None, 0 if isinstance(last_A, torch.Tensor): shape = last_A.size() next_A = F.conv2d( last_A.reshape(shape[0] * shape[1], *shape[2:]), weight, None, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups) next_A = next_A.view( shape[0], shape[1], *next_A.shape[1:]) if self.has_bias: sum_bias = (last_A.sum((3, 4)) * x[2].lower).sum(2) else: sum_bias = 0 return next_A, sum_bias elif isinstance(last_A, Patches): # Here we build and propagate a Patch object with # (patches, stride, padding) assert self.stride == 1, 'The patches mode only supports stride = 1' if last_A.identity == 1: # create a identity patch # [out_dim, batch, out_c, out_h, out_w, in_dim, in_c, in_h, in_w] patch_shape = last_A.shape if last_A.unstable_idx is not None: # FIXME Somehow the usage of unstable_idx seems to have # been changed, and the previous code is no longer working. 
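# Standalone check (hypothetical sizes, not part of this module) of the
# output_padding formula computed in Conv2dGrad.forward above.  conv_transpose2d
# produces H_out = (H - 1) * stride - 2 * padding + dilation * (k - 1) + output_padding + 1,
# so the extra padding needed to recover exactly the forward input size is
#   output_padding = H_in - ((H_grad - 1) * stride - 2 * padding + dilation * (k - 1) + 1).
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 10, 10)
w = torch.randn(4, 3, 3, 3)
stride, padding, dilation = 2, 1, 1
y = F.conv2d(x, w, stride=stride, padding=padding, dilation=dilation)   # 5x5 output
out_pad = x.shape[2] - ((y.shape[2] - 1) * stride - 2 * padding
                        + dilation * (w.shape[2] - 1) + 1)              # = 1 here
g = F.conv_transpose2d(torch.ones_like(y), w, stride=stride, padding=padding,
                       dilation=dilation, output_padding=out_pad)
assert g.shape[-2:] == x.shape[-2:]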
raise NotImplementedError( 'Sparse patches for ' 'BoundConv2dGrad is not supported yet.') output_shape = last_A.output_shape patches = torch.eye( patch_shape[0]).to(weight) patches = patches.view([ patch_shape[0], 1, 1, 1, 1, patch_shape[0], 1, 1]) # [out_dim, bsz, out_c, out_h, out_w, out_dim, in_c, in_h, in_w] patches = patches.expand([ patch_shape[0], patch_shape[1], patch_shape[2], output_shape[2], output_shape[3], patch_shape[0], 1, 1]) patches = patches.transpose(0, 1) patches = patches[ :,torch.tensor(list(range(patch_shape[0]))), last_A.unstable_idx[0], last_A.unstable_idx[1], last_A.unstable_idx[2]] patches = patches.transpose(0, 1) else: # out_dim * out_c patches = torch.eye(patch_shape[0]).to(weight) patches = patches.view([ patch_shape[0], 1, 1, 1, patch_shape[0], 1, 1]) patches = patches.expand(patch_shape) else: patches = last_A.patches if self.has_bias: # bias is x[2] (lower and upper are the same), and has # shape (c,). # Patches either has # [out_dim, batch, out_c, out_h, out_w, out_dim, c, h, w] # or [unstable_size, batch, out_dim, c, h, w]. # sum_bias has shape (out_dim, batch, out_c, out_h, out_w) # or (unstable_size, batch). sum_bias = torch.einsum( 'sb...ochw,c->sb...', patches, x[2].lower) else: sum_bias = 0 flattened_patches = patches.reshape( -1, patches.size(-3), patches.size(-2), patches.size(-1)) # Pad to the full size pieces = F.conv2d( flattened_patches, weight, stride=self.stride, padding=weight.shape[2]-1) # New patch size: # (out_c, batch, out_h, out_w, c, h, w) # or (unstable_size, batch, c, h, w). pieces = pieces.view( *patches.shape[:-3], pieces.size(-3), pieces.size(-2), pieces.size(-1)) # (left, right, top, bottom) padding = last_A.padding if last_A is not None else (0, 0, 0, 0) stride = last_A.stride if last_A is not None else 1 if isinstance(padding, int): padding = padding + weight.shape[2] - 1 else: padding = tuple(p + weight.shape[2] - 1 for p in padding) return Patches( pieces, stride, padding, pieces.shape, unstable_idx=last_A.unstable_idx, output_shape=last_A.output_shape), sum_bias else: raise NotImplementedError() lA_x, lbias = _bound_oneside(last_lA) uA_x, ubias = _bound_oneside(last_uA) return [(lA_x, uA_x), (lA_y, uA_y), (lA_bias, uA_bias)], lbias, ubias def interval_propagate(self, *v, C=None): assert not self.is_input_perturbed(1) norm = Interval.get_perturbation(v[0])[0] h_L, h_U = v[0] weight = v[1][0] bias = v[2][0] if self.has_bias else None if norm == torch.inf: mid = (h_U + h_L) / 2.0 diff = (h_U - h_L) / 2.0 weight_abs = weight.abs() deviation = F.conv_transpose2d( diff, weight_abs, None, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups, output_padding=self.output_padding) else: raise NotImplementedError center = F.conv_transpose2d( mid, weight, bias, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups, output_padding=self.output_padding) upper = center + deviation lower = center - deviation return lower, upper ================================================ FILE: auto_LiRPA/operators/cut_ops.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current 
and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """ Cut operators""" from .base import * from .clampmult import multiply_by_A_signs class CutModule(): # store under BoundedModule def __init__(self, relu_nodes=[], general_beta=None, x_coeffs=None, active_cuts=None, cut_bias=None): # all dict, storing cut parameters for each start node # {start node name: (2 (lA, uA), spec (out_c, out_h, out_w), batch, num_cuts)} self.general_beta = general_beta # {start node name: (# active cut constraints,)} self.active_cuts = active_cuts # all dict with tensor, storing coeffs for each relu layer, no grad # coeffs: {relu layername: (num_cuts, flattened_nodes)} self.relu_coeffs, self.arelu_coeffs, self.pre_coeffs = {}, {}, {} for m in relu_nodes: self.relu_coeffs[m.name] = self.arelu_coeffs[m.name] = self.pre_coeffs[m.name] = None # single tensor, always the same, no grad # bias: (num_cuts,) self.cut_bias = cut_bias # x_coeffs: (num_cuts, flattened input dims) self.x_coeffs = x_coeffs def use_patches(self, start_node): # check if we are using patches mode for the start node A = start_node.lA if start_node.lA is not None else start_node.uA return type(A) is Patches def select_active_general_beta(self, start_node, unstable_idx=None): # if one constraint have nodes deeper than start node, we do not count its effect for now # self.general_beta[start_node.name]: (2(0 lower, 1 upper), spec (out_c, out_h, out_w/# fc nodes), batch, num_constrs) # self.active_cuts[start_node.name]: a long() tensor with constraint index that # should be index on current layer with current start node if self.general_beta[start_node.name].ndim == 4: general_beta = self.general_beta[start_node.name][:, :, :, self.active_cuts[start_node.name]] elif self.general_beta[start_node.name].ndim == 6: general_beta = self.general_beta[start_node.name][:, :, :, :, :, self.active_cuts[start_node.name]] else: print("general beta shape not supported!") exit() if unstable_idx is not None: if self.use_patches(start_node): general_beta = general_beta[:, unstable_idx[0], unstable_idx[1], unstable_idx[2], :, :] else: # matrix mode if general_beta.ndim == 6: # conv layers general_beta: (2(0 lower, 1 upper), spec (out_c, out_h, out_w), batch, num_constrs) _, out_c, out_h, out_w, batch, num_constrs = general_beta.shape general_beta = general_beta.view(2, -1, batch, num_constrs) else: # dense layers general_beta: (2(0 lower, 1 upper), spec, batch, num_constrs) pass general_beta = general_beta[:, unstable_idx] else: # unstable_idx is None if general_beta.ndim == 6: # flatten spec layer shape _, out_c, out_h, out_w, batch, num_constrs = general_beta.shape general_beta = general_beta.view(2, -1, batch, num_constrs) return general_beta def general_beta_coeffs_mm(self, unstable_spec_beta, coeffs, A, current_layer_shape): if type(A) is Patches: # lA, uA are patches, we have to unfold beta and coeffs to match lA and uA # coeffs: (num_constrs, current_c, current_h, current_w) # coeffs_unfolded: (num_constrs, out_h, out_w, in_c, H, W) # current_layer_shape = x.lower.size()[1:] coeffs_unfolded = inplace_unfold(coeffs.view(-1, *current_layer_shape), \ kernel_size=A.patches.shape[-2:], padding=A.padding, stride=A.stride) # unstable_coeffs_unfolded: (num_constrs, unstable, in_c, H, W) # A.unstable_idx is the unstable idx for spec layer unstable_coeffs_unfolded = coeffs_unfolded[:, 
A.unstable_idx[1], A.unstable_idx[2], :, :, :] # A.unstable_idx: unstable index on out_c, out_h and out_w # general_beta: (2(0 lower, 1 upper), spec (out_c, out_h, out_w), batch, num_constrs) # unstable_spec_beta: (2(0 lower, 1 upper), unstable, batch, num_constrs) # unstable_spec_beta = general_beta[:, A.unstable_idx[0],\ # A.unstable_idx[1], A.unstable_idx[2], :, :] # beta_mm_coeffs_unfolded: (2(0 lower, 1 upper), unstable, batch, in_c, H, W) beta_mm_coeffs = torch.einsum('sihj,jiabc->sihabc', unstable_spec_beta, unstable_coeffs_unfolded) else: # unstable_spec_beta: (2(0 lower, 1 upper), unstable, batch, num_constrs) # coeffs: (num_constrs, current flattened layer nodes) # beta_mm_coeffs: (2(0 lower, 1 upper), unstable, batch, current flattened layer nodes) beta_mm_coeffs = torch.einsum('sihj,ja->siha', unstable_spec_beta, coeffs) assert beta_mm_coeffs[0].numel() == A.numel(), f"the shape of beta is not initialized correctly! {beta_mm_coeffs[0].shape} v.s. {A.shape}" return beta_mm_coeffs.reshape(2, *A.shape) def general_beta_coeffs_addmm_to_A(self, lA, uA, general_beta, coeffs, current_layer_shape): A = lA if lA is not None else uA # general_beta: (2(0 lower, 1 upper), spec (out_c, out_h, out_w), batch, num_constrs) # coeffs: (num_constrs, current_c, current_h, current_w) # beta_mm_coeffs[0] shape is the same as A # patches mode: (2(0 lower, 1 upper), unstable, batch, in_c, H, W) # not patches: (2(0 lower, 1 upper), unstable, batch, current flattened layer nodes) beta_mm_coeffs = self.general_beta_coeffs_mm(general_beta, coeffs, A, current_layer_shape) assert beta_mm_coeffs[0].shape == A.shape if type(A) is Patches: # lA, uA are patches, we have to unfold beta and coeffs to match lA and uA # lA_patches: (unstable, batch, in_c, H, W) if lA is not None: lA = Patches(lA.patches - beta_mm_coeffs[0], A.stride, A.padding, \ A.patches.shape, unstable_idx=A.unstable_idx, output_shape=A.output_shape) if uA is not None: uA = Patches(uA.patches + beta_mm_coeffs[1], A.stride, A.padding, \ A.patches.shape, unstable_idx=A.unstable_idx, output_shape=A.output_shape) else: # dense layers if lA is not None: lA = lA - beta_mm_coeffs[0] if uA is not None: uA = uA + beta_mm_coeffs[1] return lA, uA def patch_trick(self, start_node, layer_name, A, current_layer_shape): ######## A problem with patches mode for cut constraint start ########## # There are cases that the node that is in the constraint but not selected by the patches for the output node # trick: only count the small patches that have all the split node coeffs[ci].sum() equal to coeffs_unfolded[ci][out_h, out_w, -1].sum() # we should force these beta to be 0 to disable the effect of these constraints # this only apply if current layer uses patches mode; if the target layer is patches but current layer not, we should not use it! 
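# Shape sketch (hypothetical sizes, not part of this module) for the dense-mode
# einsum above: the cut-constraint betas (one per constraint j) are contracted with
# the constraint coefficients over j, producing one correction term per spec neuron,
# batch element and current-layer neuron.
import torch

general_beta = torch.rand(2, 5, 2, 3)   # (lower/upper, spec, batch, num_constrs)
coeffs = torch.rand(3, 7)               # (num_constrs, flattened current-layer neurons)
beta_mm = torch.einsum('sihj,ja->siha', general_beta, coeffs)
assert beta_mm.shape == (2, 5, 2, 7)
# beta_mm[0] is then subtracted from lA and beta_mm[1] added to uA, as done in
# general_beta_coeffs_addmm_to_A above.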
assert type(A) is Patches, "this trick fix only works for patches mode" # unstable_spec_beta stores the current propagation, self.general_beta[start_node.name] selected with active_cuts, spec unstable coeffs = 0 if layer_name != "input": if self.relu_coeffs[layer_name] is not None: coeffs = coeffs + self.relu_coeffs[layer_name] if self.arelu_coeffs[layer_name] is not None: coeffs = coeffs + self.arelu_coeffs[layer_name] if self.pre_coeffs[layer_name] is not None: coeffs = coeffs + self.pre_coeffs[layer_name] else: if self.x_coeffs is not None: coeffs = coeffs + self.x_coeffs coeffs_unfolded = inplace_unfold(coeffs.view(-1, *current_layer_shape), \ kernel_size=A.patches.shape[-2:], padding=A.padding, stride=A.stride) num_constrs, out_h, out_w, in_c, H, W = coeffs_unfolded.shape # make sure the small patch selected include all the nonzero coeffs ####### NOTE: This check could be costly ####### patch_mask_on_beta = (coeffs_unfolded.reshape(num_constrs, out_h, out_w, -1).abs().sum(-1) == \ coeffs.reshape(num_constrs, -1).abs().sum(-1).reshape(num_constrs, 1, 1)) # patch_mask_on_beta: (out_h, out_w, num_constrs) patch_mask_on_beta = patch_mask_on_beta.permute(1, 2, 0) # 2(lower, upper), out_c, out_h, out_w, batch, num_constrs patch_mask_on_beta = patch_mask_on_beta.reshape(1, 1, out_h, out_w, 1, num_constrs) self.general_beta[start_node.name].data = self.general_beta[start_node.name].data * patch_mask_on_beta def relu_cut(self, start_node, layer_name, last_lA, last_uA, current_layer_shape, unstable_idx=None, batch_mask=None): # propagate relu neuron in cut constraints through relu layer # start_node.name in self.general_beta means there are intermediate betas that can optimize this start node separately relu_coeffs = self.relu_coeffs[layer_name] active_cuts = self.active_cuts[start_node.name] # active_cuts.size(0) == 0 means all constraints containing this layer have deep layer nodes if relu_coeffs is None or active_cuts.size(0) == 0: # do nothing return last_lA, last_uA assert start_node.name in self.general_beta # select current relu layer general beta general_beta = self.select_active_general_beta(start_node, unstable_idx) relu_coeffs = relu_coeffs[active_cuts] if batch_mask is not None: general_beta = general_beta[:, :, batch_mask] last_lA, last_uA = self.general_beta_coeffs_addmm_to_A(last_lA, last_uA, general_beta, relu_coeffs, current_layer_shape) return last_lA, last_uA def pre_cut(self, start_node, layer_name, lA, uA, current_layer_shape, unstable_idx=None, batch_mask=None): # propagate prerelu neuron in cut constraints through relu layer # start_node.name in self.general_beta means there are intermediate betas that can optimize this start node separately pre_coeffs = self.pre_coeffs[layer_name] active_cuts = self.active_cuts[start_node.name] # active_cuts.size(0) == 0 means all constraints containing this layer have deep layer nodes if pre_coeffs is None or active_cuts.size(0) == 0: # do nothing return lA, uA general_beta = self.select_active_general_beta(start_node, unstable_idx) pre_coeffs = pre_coeffs[active_cuts] if batch_mask is not None: general_beta = general_beta[:, :, batch_mask] lA, uA = self.general_beta_coeffs_addmm_to_A(lA, uA, general_beta, pre_coeffs, current_layer_shape) return lA, uA @staticmethod @torch.jit.script def jit_arelu_lA(last_lA, lower, upper, beta_mm_coeffs, unstable_or_cut_index, upper_d, I_z1, I_z0): nu_hat_pos = last_lA.clamp(max=0.).abs() # gamma = (-lower.unsqueeze(0) * nu_hat_pos - beta_mm_coeffs[0]) / (upper.unsqueeze(0) - lower.unsqueeze(0) + 
1e-10) pi = (upper.unsqueeze(0) * nu_hat_pos + beta_mm_coeffs[0]) / (upper.unsqueeze(0) - lower.unsqueeze(0) + 1e-10) pi = torch.min(pi, nu_hat_pos)#, torch.min(gamma, nu_hat_pos) pi = pi.clamp(min=0.)#, gamma.clamp(min=0.) pi = nu_hat_pos * I_z1 + pi * (~I_z1 * ~I_z0) new_upper_d = pi / (nu_hat_pos + 1e-10) # need to customize the upper bound slope and lbias for (1) unstable relus and # (2) relus that are used with upper boundary relaxation # original upper bound slope is u/(u-l) also equal to pi/(pi+gamma) if no beta_mm_coeffs[0] # now the upper bound slope should be pi/(pi+gamma) updated with beta_mm_coeffs[0] unstable_upper_bound_index = unstable_or_cut_index.unsqueeze(0).logical_and(last_lA < 0) # conv layer: # upper_d: 1, batch, current_c, current_w, current_h # unstable_upper_bound_index, new_upper_d: spec unstable, batch, current_c, current_w, current_h # dense layer: # upper_d: 1, batch, current flattened nodes # unstable_upper_bound_index, new_upper_d: spec unstable, batch, current flattened nodes # we may need a new mask to filter out the unstable nodes that are not in the current layer new_upper_d = (new_upper_d * unstable_upper_bound_index.to(lower.dtype) + upper_d * (1. - unstable_upper_bound_index.to(lower.dtype))) return nu_hat_pos, pi, new_upper_d, unstable_upper_bound_index @staticmethod @torch.jit.script def jit_arelu_lbias(unstable_or_cut_index, nu_hat_pos, beta_mm_coeffs, lower, upper, lbias, pi, I_z1, I_z0): # if no unstable, following bias should always be 0 if unstable_or_cut_index.sum() > 0: # update lbias with new form, only contribued by unstable relus uC = -upper.unsqueeze(0) * nu_hat_pos lC = -lower.unsqueeze(0) * nu_hat_pos # lbias: (spec unstable, batch, current flattened nodes) same as lA lbias = (pi * lower.unsqueeze(0)) # previous implementation # uC_mask = (beta_mm_coeffs[0] <= uC).to(lbias) # lC_mask = (beta_mm_coeffs[0] >= lC).to(lbias) # complete implementation uC_mask = ((beta_mm_coeffs[0] <= uC) | I_z0).to(lbias) lC_mask = ((beta_mm_coeffs[0] >= lC) | I_z1).to(lbias) default_mask = ((1-uC_mask) * (1-lC_mask)).to(lbias) lbias = - beta_mm_coeffs[0].to(lbias) * lC_mask + lbias * default_mask # lbias[beta_mm_coeffs[0] <= uC] = 0. # lbias[beta_mm_coeffs[0] >= lC] = -beta_mm_coeffs[0][beta_mm_coeffs[0] >= lC].to(lbias) # final lbias: (spec unstable, batch) lbias = (lbias * unstable_or_cut_index.unsqueeze(0).to(lower.dtype)).view(lbias.shape[0], lbias.shape[1], -1).sum(-1) return lbias @staticmethod @torch.jit.script def jit_arelu_uA(last_uA, lower, upper, beta_mm_coeffs, unstable_or_cut_index, upper_d, I_z1, I_z0): nu_hat_pos = (-last_uA).clamp(max=0.).abs() # gamma = (- lower.unsqueeze(0) * nu_hat_pos - beta_mm_coeffs[1]) / (upper.unsqueeze(0) - lower.unsqueeze(0) + 1e-10) pi = (upper.unsqueeze(0) * nu_hat_pos + beta_mm_coeffs[1]) / (upper.unsqueeze(0) - lower.unsqueeze(0) + 1e-10) pi = pi.clamp(min=0.) pi = torch.min(pi, nu_hat_pos) pi = pi * I_z1 + nu_hat_pos * (~I_z1 * ~I_z0) new_upper_d = pi / (nu_hat_pos + 1e-10) # assert ((gamma + pi - nu_hat_pos).abs()*unstable_or_cut_index).max() <= 1e-5, "pi+gamma should always be the same as nu_hat_pos" # unstable_or_cut_index = self.I.logical_or(self.arelu_coeffs.abs().sum(0).view(self.I.shape) != 0) unstable_upper_bound_index = unstable_or_cut_index.unsqueeze(0).logical_and(-last_uA < 0) new_upper_d = new_upper_d * unstable_upper_bound_index.to(lower.dtype) + \ upper_d * (1. 
- unstable_upper_bound_index.to(lower.dtype)) return nu_hat_pos, pi, new_upper_d, unstable_upper_bound_index @staticmethod @torch.jit.script def jit_arelu_ubias(unstable_or_cut_index, nu_hat_pos, beta_mm_coeffs, lower, upper, ubias, pi, I_z1, I_z0): if unstable_or_cut_index.sum() > 0: uC = -upper.unsqueeze(0) * nu_hat_pos lC = -lower.unsqueeze(0) * nu_hat_pos ubias = -(pi * lower.unsqueeze(0)) # uC_mask = (beta_mm_coeffs[1] <= uC).to(ubias) # lC_mask = (beta_mm_coeffs[1] >= lC).to(ubias) uC_mask = ((beta_mm_coeffs[1] <= uC) | I_z0).to(ubias) lC_mask = ((beta_mm_coeffs[1] >= lC) | I_z1).to(ubias) default_mask = ((1-uC_mask) * (1-lC_mask)).to(ubias) ubias = beta_mm_coeffs[1].to(ubias) * lC_mask + ubias * default_mask # ubias[beta_mm_coeffs[1] <= uC] = 0. # ubias[beta_mm_coeffs[1] >= lC] = beta_mm_coeffs[1][beta_mm_coeffs[1] >= lC].to(ubias) ubias = (ubias * unstable_or_cut_index.unsqueeze(0).to(lower.dtype)).view(ubias.shape[0], ubias.shape[1], -1).sum(-1) return ubias def arelu_cut(self, start_node, layer_name, last_lA, last_uA, lower_d, upper_d, lower_b, upper_b, lb_lower_d, ub_lower_d, relu_indicators, x, patch_size, current_layer_shape, unstable_idx=None, batch_mask=None): """ We want to calculate the pi and gamma for the lower bound of the next layer. To make the GCP CROWN complete, we have to consider the case when z is a constant. Now discuss the case when z = 0, z = 1 (constant), and 0 < z < 1 (variable). lbias is h(beta) in the paper. upper_d is the upper bound slope of the current layer. 1. z = 0 -> pi = 0, gamma = nu_hat_pos, tao = 0, mu = (alpha) * nu_hat_neg lbias = 0. upper_d = pi / (pi + gamma) = 0. 2. z = 1 -> pi = nu_hat_pos, gamma = 0, tao = alpha * nu_hat_neg, mu = 0 lbias = - beta_mm_coeffs[0]. upper_d = pi / (pi + gamma) = 1. 3. 0 < z < 1. We do the regular calculation using the closed form solution. lbias = pi * lower, if -upper * nu_hat_pos <= beta_mm_coeffs[0] <= -lower * nu_hat_pos lbias = 0, if beta_mm_coeffs[0] <= -upper * nu_hat_pos lbias = -beta_mm_coeffs[0], if beta_mm_coeffs[0] >= -lower * nu_hat_pos upper_d = pi / (nu_hat_pos). where pi = (upper * nu_hat_pos + beta_mm_coeffs[0]) / (upper - lower), pi = min(pi, nu_hat_pos), pi = max(pi, 0), gamma = (-lower * nu_hat_pos - beta_mm_coeffs[0]) / (upper - lower). gamma = min(gamma, nu_hat_pos), gamma = max(gamma, 0). Thus, we have the following implementation. if z = 0: pi = 0. if z = 1: pi = nu_hat_pos. Otherwise: if -upper * nu_hat_pos <= beta_mm_coeffs[0] <= -lower * nu_hat_pos: pi = (upper * nu_hat_pos + beta_mm_coeffs[0]) / (upper - lower), pi = min(pi, nu_hat_pos), pi = max(pi, 0), lbias = pi * lower, upper_d = pi / (nu_hat_pos). if beta_mm_coeffs[0] <= -upper * nu_hat_pos: lbias = 0. if beta_mm_coeffs[0] >= -lower * nu_hat_pos: lbias = -beta_mm_coeffs[0]. """ # propagate integer var of relu neuron (arelu) in cut constraints through relu layer # I[0]. unstable neuron mask. # I[1]. previous unstable now split on z = 1. # I[2]. previous unstable now split on z = 0. 
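# Scalar sketch (hypothetical numbers, single neuron) of the closed-form solution in
# the docstring above for the 0 < z < 1 case: the modified upper slope pi / nu_hat_pos
# reduces to the usual ReLU slope u / (u - l) when the beta term is zero.
import torch

l, u = torch.tensor(-1.0), torch.tensor(2.0)
nu_hat_pos, beta = torch.tensor(0.5), torch.tensor(0.25)
pi = ((u * nu_hat_pos + beta) / (u - l)).clamp(min=0.)
pi = torch.min(pi, nu_hat_pos)
new_upper_d = pi / (nu_hat_pos + 1e-10)          # modified upper-bound slope
lbias = pi * l                                   # h(beta), since -u*nu <= beta <= -l*nu here
pi_no_beta = (u * nu_hat_pos) / (u - l)
assert torch.isclose(pi_no_beta / nu_hat_pos, u / (u - l))  # recovers u/(u-l) when beta = 0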
unstable_neurons_mask, z_split_to_1_mask, z_split_to_0_mask = relu_indicators arelu_coeffs = self.arelu_coeffs[layer_name] active_cuts = self.active_cuts[start_node.name] # active_cuts.size(0) == 0 means all constraints containing this layer have deep layer nodes if arelu_coeffs is None or active_cuts.size(0) == 0: # do regular propagation without cut uA, ubias = _bound_oneside(last_uA, upper_d, ub_lower_d if lower_d is None else lower_d, upper_b, lower_b, start_node, patch_size) lA, lbias = _bound_oneside(last_lA, lb_lower_d if lower_d is None else lower_d, upper_d, lower_b, upper_b, start_node, patch_size) return lA, uA, lbias, ubias # general_beta: (2(0 lower, 1 upper), spec (out_c, out_h, out_w), batch, num_constrs) general_beta = self.select_active_general_beta(start_node, unstable_idx) # arelu_coeffs: (num_constrs, flattened current layer nodes) arelu_coeffs = arelu_coeffs[active_cuts] if batch_mask is not None: general_beta = general_beta[:, :, batch_mask] A = last_lA if last_lA is not None else last_uA # beta_mm_coeffs[0] shape is the same as A # patches mode: (2(0 lower, 1 upper), unstable, batch, in_c, H, W) # not patches: (2(0 lower, 1 upper), unstable, batch, current flattened layer nodes) beta_mm_coeffs = self.general_beta_coeffs_mm(general_beta, arelu_coeffs, A, current_layer_shape) # unstable_this_layer = torch.logical_and(x.lower < 0, x.upper > 0).unsqueeze(0) # relu_indicator is the unstable index in this relu layer: (batch, *layer shape) # if there is node in cut constraint that is stable, also need to count its effect # self.arelu_coeffs: (num_constrs, flattened current layer) # self.arelu_coeffs do not have a batch dimension - only one cut can be applied to all batch elements. # We will handle the neurons which are unstable or those have cut constraints below, thus creating the mask. unstable_or_cut_index = unstable_neurons_mask.logical_or(arelu_coeffs.abs().sum(0).view(unstable_neurons_mask[0:1].shape) != 0) # Shape of unstable_or_cut_index is (batch, num_neurons). It is a binary mask. if type(A) is Patches: # patches mode, conv layer only # x.lower (always regular shape): batch, current_c, current_h, current_w # x_lower_unfold: unstable, batch, in_C, H, W (same as patches last_lA) x_lower_unfold = _maybe_unfold(x.lower.unsqueeze(0), A) x_upper_unfold = _maybe_unfold(x.upper.unsqueeze(0), A) # first minus upper and lower and then unfold to patch size will save memory x_upper_minus_lower_unfold = _maybe_unfold((x.upper - x.lower).unsqueeze(0), A) ####### be careful with the unstable_this_layer and unstable_idx ####### # unstable_this_layer is the unstable index in this layer # unstable_idx is the unstable index in spec layer # unstable_this_layer: spec unstable, batch, in_C, H, W (same as patches last_lA) # unstable_this_layer = torch.logical_and(x_lower_unfold < 0, x_upper_unfold > 0) # unstable_this_layer = _maybe_unfold(self.I.unsqueeze(0), A) unstable_or_cut_index = _maybe_unfold(unstable_or_cut_index.unsqueeze(0), A) if last_lA is not None: assert beta_mm_coeffs[0].shape == last_lA.shape, f"{beta_mm_coeffs[0].shape} != {last_lA.shape}" # last_lA.patches, nu_hat_pos, gamma, pi: (unstable, batch, in_c, H, W) nu_hat_pos = last_lA.patches.clamp(max=0.).abs() # gamma = (-x_lower_unfold * nu_hat_pos - beta_mm_coeffs[0]) / (x_upper_minus_lower_unfold.clamp(min=1e-10)) pi = (x_upper_unfold * nu_hat_pos + beta_mm_coeffs[0]) / (x_upper_minus_lower_unfold.clamp(min=1e-10)) pi = torch.min(pi, nu_hat_pos).clamp(min=0.) 
pi = nu_hat_pos * z_split_to_1_mask + pi * (~z_split_to_1_mask * ~z_split_to_0_mask) new_upper_d = pi / (nu_hat_pos + 1e-10) # assert ((gamma + pi - nu_hat_pos).abs()*unstable_or_cut_index).max() <= 1e-5, "pi+gamma should always be the same as nu_hat_pos" # unstable_upper_bound_index: spec unstable, batch, in_C, H, W (same as patches last_lA) unstable_upper_bound_index = unstable_or_cut_index.logical_and(last_lA.patches < 0) # upper_d: (spec unstable, 1, in_C, H, W) (unfolded shape, same as patches last_lA) new_upper_d = new_upper_d * unstable_upper_bound_index.to(x_lower_unfold.dtype) + \ upper_d * (1. - unstable_upper_bound_index.to(x_lower_unfold.dtype)) if last_uA is None: uA, ubias = None, 0. # lbias: unstable, batch # lA: unstable, batch, in_C, H, W (same as patches last_lA) lA, lbias = _bound_oneside(last_lA, lb_lower_d if lower_d is None else lower_d, new_upper_d, lower_b, upper_b, start_node, patch_size) # if general_beta[0].sum()!=0: import pdb; pdb.set_trace() # there is any unstable relus in this layer if unstable_or_cut_index.sum() > 0: uC = -x_upper_unfold * nu_hat_pos lC = -x_lower_unfold * nu_hat_pos lbias = (pi * x_lower_unfold) # lbias[beta_mm_coeffs[0] <= uC] = 0. # lbias[beta_mm_coeffs[0] >= lC] = -beta_mm_coeffs[0][beta_mm_coeffs[0] >= lC].to(lbias) lbias[(beta_mm_coeffs[0] <= uC)| z_split_to_0_mask] = 0. lbias[(beta_mm_coeffs[0] >= lC)| z_split_to_1_mask] = -beta_mm_coeffs[0][(beta_mm_coeffs[0] >= lC)| z_split_to_1_mask].to(lbias) # lbias: unstable, batch, in_C, H, W (same as patches last_lA) => lbias: (unstable, batch) lbias = (lbias * unstable_or_cut_index.to(x_lower_unfold.dtype)).view(lbias.shape[0], lbias.shape[1], -1).sum(-1) if last_uA is not None: # get the upper bound nu_hat_pos = (-last_uA.patches).clamp(max=0.).abs() # gamma = (-x_lower_unfold * nu_hat_pos - beta_mm_coeffs[1]) / (x_upper_minus_lower_unfold + 1e-10) pi = (x_upper_unfold * nu_hat_pos + beta_mm_coeffs[1]) / (x_upper_minus_lower_unfold + 1e-10) pi = torch.min(pi, nu_hat_pos).clamp(min=0.) pi = nu_hat_pos * z_split_to_1_mask + pi * (~z_split_to_1_mask * ~z_split_to_0_mask) new_upper_d = pi / (nu_hat_pos + 1e-10) # assert ((gamma + pi - nu_hat_pos).abs()*unstable_or_cut_index).max() <= 1e-5, "pi+gamma should always be the same as nu_hat_pos" unstable_upper_bound_index = unstable_or_cut_index.logical_and((-last_uA.patches) < 0) new_upper_d = new_upper_d * unstable_upper_bound_index.to(x_lower_unfold.dtype) + \ upper_d * (1. - unstable_upper_bound_index.to(x_lower_unfold.dtype)) uA, ubias = _bound_oneside(last_uA, new_upper_d, ub_lower_d if lower_d is None else lower_d, upper_b, lower_b, start_node, patch_size) if last_lA is None: lA, lbias = None, 0. if unstable_or_cut_index.sum() > 0: uC = -x_upper_unfold * nu_hat_pos lC = -x_lower_unfold * nu_hat_pos ubias = -(pi * x_lower_unfold) # ubias[beta_mm_coeffs[1] <= uC] = 0. # ubias[beta_mm_coeffs[1] >= lC] = beta_mm_coeffs[1][beta_mm_coeffs[1] >= lC].to(ubias) ubias[(beta_mm_coeffs[1] <= uC) | z_split_to_0_mask] = 0. 
ubias[(beta_mm_coeffs[1] >= lC) | z_split_to_1_mask] = beta_mm_coeffs[1][(beta_mm_coeffs[1] >= lC) | z_split_to_1_mask].to(ubias) # ubias: unstable, batch, in_C, H, W (same as patches last_uA) => ubias: (unstable, batch) ubias = (ubias * unstable_or_cut_index.to(x_lower_unfold.dtype)).view(ubias.shape[0], ubias.shape[1], -1).sum(-1) else: # dense if last_lA is not None: # ##################### # # C is nu_hat_pos # # last_lA: (spec unstable, batch, current flattened nodes (current_c*current_h*current_w)) # nu_hat_pos = last_lA.clamp(max=0.).abs() # # pi, gamma: spec_unstable, batch, current layer shape (same as last_lA) # # need to customize the upper bound slope and lbias for (1) unstable relus and # # (2) relus that are used with upper boundary relaxation # # original upper bound slope is u/(u-l) also equal to pi/(pi+gamma) if no beta_mm_coeffs[0] # # now the upper bound slope should be pi/(p+gamma) updated with beta_mm_coeffs[0] # # conv layer: # # upper_d: 1, batch, current_c, current_w, current_h # # unstable_upper_bound_index, new_upper_d: spec unstable, batch, current_c, current_w, current_h # # dense layer: # # upper_d: 1, batch, current flattened nodes # # unstable_upper_bound_index, new_upper_d: spec unstable, batch, current flattened nodes nu_hat_pos, pi, new_upper_d, unstable_upper_bound_index = self.jit_arelu_lA(last_lA, x.lower, x.upper, beta_mm_coeffs, unstable_or_cut_index, upper_d, z_split_to_1_mask, z_split_to_0_mask) if last_uA is None: uA, ubias = None, 0. lA, lbias = _bound_oneside(last_lA, lb_lower_d if lower_d is None else lower_d, new_upper_d, lower_b, upper_b, start_node, patch_size) lbias = self.jit_arelu_lbias(unstable_or_cut_index, nu_hat_pos, beta_mm_coeffs, x.lower, x.upper, lbias, pi, z_split_to_1_mask, z_split_to_0_mask) if last_uA is not None: # # C is nu_hat_pos nu_hat_pos, pi, new_upper_d, unstable_upper_bound_index = self.jit_arelu_uA(last_uA, x.lower, x.upper, beta_mm_coeffs, unstable_or_cut_index, upper_d, z_split_to_1_mask, z_split_to_0_mask) # one can test uA by optimize -obj which should have the same obj value uA, ubias = _bound_oneside(last_uA, new_upper_d, ub_lower_d if lower_d is None else lower_d, upper_b, lower_b, start_node, patch_size) if last_lA is None: lA, lbias = None, 0. 
ubias = self.jit_arelu_ubias(unstable_or_cut_index, nu_hat_pos, beta_mm_coeffs, x.lower, x.upper, ubias, pi, z_split_to_1_mask, z_split_to_0_mask) return lA, uA, lbias, ubias def input_cut(self, start_node, lA, uA, current_layer_shape, unstable_idx=None, batch_mask=None): # propagate input neuron in cut constraints through relu layer active_cuts = self.active_cuts[start_node.name] if self.x_coeffs is None or active_cuts.size(0) == 0: return lA, uA if type(lA) is Patches: A = lA if lA is not None else uA self.patch_trick(start_node, "input", A, current_layer_shape) general_beta = self.select_active_general_beta(start_node, unstable_idx) x_coeffs = self.x_coeffs[active_cuts] if batch_mask is not None: general_beta = general_beta[:, :, batch_mask] # general_beta: (2(0 lower, 1 upper), spec, batch, num_constrs) # x_coeffs: (num_constrs, flattened input dims) # beta_bias: (2(0 lower, 1 upper), batch, spec) lA, uA = self.general_beta_coeffs_addmm_to_A(lA, uA, general_beta, x_coeffs, current_layer_shape) return lA, uA def bias_cut(self, start_node, lb, ub, unstable_idx=None, batch_mask=None): active_cuts = self.active_cuts[start_node.name] if self.cut_bias is None or active_cuts.size(0) == 0: return lb, ub bias_coeffs = self.cut_bias[active_cuts] general_beta = self.select_active_general_beta(start_node, unstable_idx) if batch_mask is not None: general_beta = general_beta[:, :, batch_mask] # add bias for the bias term of general cut # general_beta: (2(0 lower, 1 upper), spec, batch, num_constrs) # bias_coeffs: (num_constrs,) # beta_bias: (2(0 lower, 1 upper), batch, spec) beta_bias = torch.einsum('sihj,j->shi', general_beta.to(lb.dtype), bias_coeffs.to(lb.dtype)) lb = lb + beta_bias[0] if lb is not None else None ub = ub - beta_bias[1] if ub is not None else None return lb, ub # Choose upper or lower bounds based on the sign of last_A # this is a copy from activation.py def _bound_oneside(last_A, d_pos, d_neg, b_pos, b_neg, start_node, patch_size): if last_A is None: return None, 0 if type(last_A) == Tensor: A, bias = multiply_by_A_signs(last_A, d_pos, d_neg, b_pos, b_neg, contiguous=True) return A, bias elif type(last_A) == Patches: # if last_A is not an identity matrix assert last_A.identity == 0 if last_A.identity == 0: # last_A shape: [out_c, batch_size, out_h, out_w, in_c, H, W]. Here out_c is the spec dimension. # or (unstable_size, batch_size, in_c, H, W) when it is sparse. patches = last_A.patches patches_shape = patches.shape if len(patches_shape) == 6: patches = patches.view(*patches_shape[:2], -1, *patches_shape[-2:]) if d_pos is not None: d_pos = d_pos.view(*patches_shape[:2], -1, *patches_shape[-2:]) if d_neg is not None: d_neg = d_neg.view(*patches_shape[:2], -1, *patches_shape[-2:]) if b_pos is not None: b_pos = b_pos.view(*patches_shape[:2], -1, *patches_shape[-2:]) if b_neg is not None: b_neg = b_neg.view(*patches_shape[:2], -1, *patches_shape[-2:]) A_prod, bias = multiply_by_A_signs(patches, d_pos, d_neg, b_pos, b_neg) # prod has shape [out_c, batch_size, out_h, out_w, in_c, H, W] or (unstable_size, batch_size, in_c, H, W) when it is sparse. # For sparse patches the return bias size is (unstable_size, batch). # For regular patches the return bias size is (spec, batch, out_h, out_w). if len(patches_shape) == 6: A_prod = A_prod.view(*patches_shape) # Save the patch size, which will be used in init_slope() to determine the number of optimizable parameters. 
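# Sketch (hypothetical shapes) of the sign split that multiply_by_A_signs performs in
# _bound_oneside above, paraphrasing the Tensor branch: the positive part of last_A is
# paired with (d_pos, b_pos), the negative part with (d_neg, b_neg), and the bias
# terms are summed over the neurons.  The patches branch applies the same rule to the
# unfolded patches.
import torch

A = torch.randn(4, 2, 6)                                 # (spec, batch, neurons)
d_pos, d_neg = torch.rand(1, 2, 6), torch.rand(1, 2, 6)
b_pos, b_neg = torch.rand(1, 2, 6), torch.rand(1, 2, 6)
A_new = A.clamp(min=0) * d_pos + A.clamp(max=0) * d_neg
bias = (A.clamp(min=0) * b_pos + A.clamp(max=0) * b_neg).sum(-1)
assert A_new.shape == A.shape and bias.shape == (4, 2)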
if start_node is not None: if last_A.unstable_idx is not None: # Sparse patches, we need to construct the full patch size: (out_c, batch, out_h, out_w, c, h, w). patch_size[start_node.name] = [last_A.output_shape[1], A_prod.size(1), last_A.output_shape[2], last_A.output_shape[3], A_prod.size(-3), A_prod.size(-2), A_prod.size(-1)] else: # Regular patches. patch_size[start_node.name] = A_prod.size() return Patches(A_prod, last_A.stride, last_A.padding, A_prod.shape, unstable_idx=last_A.unstable_idx, output_shape=last_A.output_shape), bias # In patches mode, we need to unfold lower and upper slopes. In matrix mode we simply return. # this is a copy from activation.py def _maybe_unfold(d_tensor, last_A): # d_tensor (out_c, current_c, current_h, current_w): out_c shared the same alpha for spec layer if d_tensor is None: return None # if mode == "matrix" or d_tensor is None or last_A is None: if type(last_A) is not Patches or d_tensor is None or last_A is None: return d_tensor # Input are slopes with shape (spec, batch, input_c, input_h, input_w) # Here spec is the same as out_c. # assert d_tensor.ndim == 5 origin_d_shape = d_tensor.shape if d_tensor.ndim == 6: d_tensor = d_tensor.view(*origin_d_shape[:2], -1, *origin_d_shape[-2:]) d_shape = d_tensor.size() # Reshape to 4-D tensor to unfold. d_tensor = d_tensor.view(-1, *d_tensor.shape[-3:]) # unfold the slope matrix as patches. Patch shape is [spec * batch, out_h, out_w, in_c, H, W). d_unfolded = inplace_unfold(d_tensor, kernel_size=last_A.patches.shape[-2:], stride=last_A.stride, padding=last_A.padding) # Reshape to (spec, batch, out_h, out_w, in_c, H, W); here spec_size is out_c. d_unfolded_r = d_unfolded.view(*d_shape[:-3], *d_unfolded.shape[1:]) if last_A.unstable_idx is not None: if d_unfolded_r.size(0) == 1: if len(last_A.unstable_idx) == 3: # Broadcast the spec shape, so only need to select the reset dimensions. # Change shape to (out_h, out_w, batch, in_c, H, W) or (out_h, out_w, in_c, H, W). d_unfolded_r = d_unfolded_r.squeeze(0).permute(1, 2, 0, 3, 4, 5) d_unfolded_r = d_unfolded_r[last_A.unstable_idx[1], last_A.unstable_idx[2]] elif len(last_A.unstable_idx) == 4: # [spec, batch, output_h, output_w, input_c, H, W] # to [output_h, output_w, batch, in_c, H, W] d_unfolded_r = d_unfolded_r.squeeze(0).permute(1, 2, 0, 3, 4, 5) d_unfolded_r = d_unfolded_r[last_A.unstable_idx[2], last_A.unstable_idx[3]] else: raise NotImplementedError() # output shape: (unstable_size, batch, in_c, H, W). else: d_unfolded_r = d_unfolded_r[last_A.unstable_idx[0], :, last_A.unstable_idx[1], last_A.unstable_idx[2]] # For sparse patches, the shape after unfold is (unstable_size, batch_size, in_c, H, W). # For regular patches, the shape after unfold is (spec, batch, out_h, out_w, in_c, H, W). if d_unfolded_r.ndim != last_A.patches.ndim: d_unfolded_r = d_unfolded_r.unsqueeze(2).unsqueeze(-4) return d_unfolded_r ================================================ FILE: auto_LiRPA/operators/dropout.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. 
## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### from .base import * class BoundDropout(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) if 'ratio' in attr: self.ratio = attr['ratio'] self.dynamic = False else: self.ratio = None self.dynamic = True self.clear() def clear(self): self.mask = None def forward(self, *inputs): x = inputs[0] if not self.training: return x if self.dynamic: # Inputs: data, ratio (optional), training_mode (optional) # We assume ratio must exist in the inputs. # We ignore training_mode, but will use self.training which can be # changed after BoundedModule is built. assert (inputs[1].dtype == torch.float32 or inputs[1].dtype == torch.float64) self.ratio = inputs[1] if self.ratio >= 1: raise ValueError('Ratio in dropout should be less than 1') self.mask = torch.rand(x.shape, device=self.ratio.device) > self.ratio return x * self.mask / (1 - self.ratio) def _check_forward(self): """ If in the training mode, a forward pass should have been called.""" if self.training and self.mask is None: raise RuntimeError('For a model with dropout in the training mode, '\ 'a clean forward pass must be called before bound computation') def bound_backward(self, last_lA, last_uA, *args, **kwargs): empty_A = [(None, None)] * (len(args) -1) if not self.training: return [(last_lA, last_uA), *empty_A], 0, 0 self._check_forward() def _bound_oneside(last_A): if last_A is None: return None return last_A * self.mask / (1 - self.ratio) lA = _bound_oneside(last_lA) uA = _bound_oneside(last_uA) return [(lA, uA), *empty_A], 0, 0 def bound_forward(self, dim_in, x, *args): if not self.training: return x self._check_forward() lw = x.lw * self.mask.unsqueeze(1) / (1 - self.ratio) lb = x.lb * self.mask / (1 - self.ratio) uw = x.uw * self.mask.unsqueeze(1) / (1 - self.ratio) ub = x.ub * self.mask / (1 - self.ratio) return LinearBound(lw, lb, uw, ub) def interval_propagate(self, *v): if not self.training: return v[0] self._check_forward() h_L, h_U = v[0] lower = h_L * self.mask / (1 - self.ratio) upper = h_U * self.mask / (1 - self.ratio) return lower, upper ================================================ FILE: auto_LiRPA/operators/dtype.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### from .base import * from ..utils import Patches class BoundCast(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.to = attr['to'] # See values of enum DataType in TensorProto. # Unsupported: str, uint16, uint32, uint64. 
self.data_types = [ None, torch.float, torch.uint8, torch.int8, None, torch.int16, torch.int32, torch.int64, None, torch.bool, torch.float16, torch.float64, None, None, torch.complex64, torch.complex128 ] self.type = self.data_types[self.to] assert self.type is not None, "Unsupported type conversion." self.use_default_ibp = True def forward(self, x): self.type_in = x.dtype return x.to(self.type) def bound_backward(self, last_lA, last_uA, x, **kwargs): if type(last_lA) == Tensor or type(last_uA) == Tensor: lA = last_lA.to(self.type_in) if last_lA is not None else None uA = last_uA.to(self.type_in) if last_uA is not None else None else: # Define lA and uA even when one side is None (mirroring the tensor branch above), so the return statement never references an unbound name. lA = Patches(last_lA.patches.to(self.type_in), last_lA.stride, last_lA.padding, last_lA.shape, last_lA.identity, last_lA.unstable_idx, last_lA.output_shape) if last_lA is not None else None uA = Patches(last_uA.patches.to(self.type_in), last_uA.stride, last_uA.padding, last_uA.shape, last_uA.identity, last_uA.unstable_idx, last_uA.output_shape) if last_uA is not None else None return [(lA, uA)], 0, 0 def bound_forward(self, dim_in, x): return LinearBound( x.lw.to(self.type), x.lb.to(self.type), x.uw.to(self.type), x.ub.to(self.type)) def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): self.solver_vars = self.forward(v[0]) ================================================ FILE: auto_LiRPA/operators/gelu.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory.
## ## ## ######################################################################### import math import torch import torch.nn as nn import torch.nn.functional as F from .s_shaped import BoundTanh from .base import logger # FIXME resolve duplicate code with BoundTanh class BoundGelu(BoundTanh): sqrt_2 = math.sqrt(2) def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options, precompute=False) self.ibp_intermediate = False self.act_func = F.gelu def d_act_func(x): return (0.5 * (1 + torch.erf(x / self.sqrt_2)) + x * torch.exp(-0.5 * x ** 2) / math.sqrt(2 * torch.pi)) self.d_act_func = d_act_func def d2_act_func(x): return (2 * torch.exp(-0.5 * x ** 2) / math.sqrt(2 * torch.pi) - x ** 2 * torch.exp(-0.5 * x ** 2) / math.sqrt(2 * torch.pi)) self.d2_act_func = d2_act_func self.precompute_relaxation(self.act_func, self.d_act_func) def _init_masks(self, x): lower = x.lower upper = x.upper self.mask_left_pos = torch.logical_and(lower >= -self.sqrt_2, upper <= 0) self.mask_left_neg = upper <= -self.sqrt_2 self.mask_left = torch.logical_xor(upper <= 0, torch.logical_or(self.mask_left_pos, self.mask_left_neg)) self.mask_right_pos = lower >= self.sqrt_2 self.mask_right_neg = torch.logical_and(upper <= self.sqrt_2, lower >= 0) self.mask_right = torch.logical_xor(lower >= 0, torch.logical_or(self.mask_right_pos, self.mask_right_neg)) self.mask_2 = torch.logical_and(torch.logical_and(upper > 0, upper <= self.sqrt_2), torch.logical_and(lower < 0, lower >= -self.sqrt_2)) self.mask_left_3 = torch.logical_and(lower < -self.sqrt_2, torch.logical_and( upper > 0, upper <= self.sqrt_2)) self.mask_right_3 = torch.logical_and(upper > self.sqrt_2, torch.logical_and( lower < 0, lower >= -self.sqrt_2)) self.mask_4 = torch.logical_and(lower < -self.sqrt_2, upper > self.sqrt_2) self.mask_both = torch.logical_or(self.mask_2, torch.logical_or(self.mask_4, torch.logical_or(self.mask_left_3, self.mask_right_3))) @torch.no_grad() def precompute_relaxation(self, func, dfunc, x_limit=1000): """ This function precomputes the tangent lines that will be used as lower/upper bounds for S-shapes functions. """ self.x_limit = x_limit self.step_pre = 0.01 self.num_points_pre = int(self.x_limit / self.step_pre) max_iter = 100 logger.debug('Precomputing relaxation for GeLU (pre-activation limit: %f)', x_limit) def check_lower(upper, d): """Given two points upper, d (d <= upper), check if the slope at d will be less than f(upper) at upper.""" k = dfunc(d) # Return True if the slope is a lower bound. return k * (upper - d) + func(d) <= func(upper) def check_upper(lower, d): """Given two points lower, d (d >= lower), check if the slope at d will be greater than f(lower) at lower.""" k = dfunc(d) # Return True if the slope is a upper bound. return k * (lower - d) + func(d) >= func(lower) # Given an upper bound point (>=0), find a line that is guaranteed to # be a lower bound of this function. upper = self.step_pre * torch.arange( 0, self.num_points_pre + 5, device=self.device) + self.sqrt_2 r = torch.ones_like(upper) # Initial guess, the tangent line is at -1. l = -torch.ones_like(upper) while True: # Check if the tangent line at the guessed point is an lower bound at f(upper). checked = check_lower(upper, l).int() # If the initial guess is not smaller enough, then double it (-2, -4, etc). l = checked * l + (1 - checked) * (l * 2) if checked.sum() == l.numel(): break # Now we have starting point at l, its tangent line is guaranteed to # be an lower bound at f(upper). 
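# Added note (sketch of the bisection invariant, inferred from the loop below; not
# part of the original comments): `l` always holds a tangent point whose tangent
# line has been verified by check_lower() to stay below f at `upper`, while `r` is
# the candidate closer to the optimum that may still be infeasible. Each iteration
# tests the midpoint m and preserves this invariant, so after max_iter halvings the
# gap |r - l| has shrunk by a factor of 2**max_iter and the stored tangent point is
# (numerically) the tightest feasible one found.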
# We want to further tighten this bound by moving it closer to 0. for _ in range(max_iter): # Binary search. m = (l + r) / 2 checked = check_lower(upper, m).int() l = checked * m + (1 - checked) * l r = checked * r + (1 - checked) * m # At upper, a line with slope l is guaranteed to lower bound the function. self.d_lower_right = l.clone() # Do the same again: # Given an lower bound point (<=0), find a line that is guaranteed to # be an upper bound of this function. lower = ( -self.step_pre * torch.arange( 0, self.num_points_pre + 5, device=self.device ) + self.sqrt_2).clamp(min=0.01) l = torch.zeros_like(upper) + self.sqrt_2 r = torch.zeros_like(upper) + x_limit while True: checked = check_upper(lower, r).int() r = checked * r + (1 - checked) * (r * 2) if checked.sum() == l.numel(): break for _ in range(max_iter): m = (l + r) / 2 checked = check_upper(lower, m).int() l = (1 - checked) * m + checked * l r = (1 - checked) * r + checked * m self.d_upper_right = r.clone() upper = -self.step_pre * torch.arange( 0, self.num_points_pre + 5, device=self.device) - self.sqrt_2 r = torch.zeros_like(upper) - 0.7517916 # Initial guess, the tangent line is at -1. l = torch.zeros_like(upper) - self.sqrt_2 while True: checked = check_lower(upper, r).int() r = checked * r + (1 - checked) * (r * 2) if checked.sum() == l.numel(): break # Now we have starting point at l, its tangent line is guaranteed to be # an lower bound at f(upper). # We want to further tighten this bound by moving it closer to 0. for _ in range(max_iter): # Binary search. m = (l + r) / 2 checked = check_lower(upper, m).int() l = (1 - checked) * m + checked * l r = (1 - checked) * r + checked * m # At upper, a line with slope l is guaranteed to lower bound the function. self.d_lower_left = r.clone() # Do the same again: # Given an lower bound point (<=0), find a line that is guaranteed to # be an upper bound of this function. 
lower = ( self.step_pre * torch.arange( 0, self.num_points_pre + 5, device=self.device ) - self.sqrt_2).clamp(max=0) l = torch.zeros_like(upper) - x_limit r = torch.zeros_like(upper) - self.sqrt_2 while True: checked = check_upper(lower, l).int() l = checked * l + (1 - checked) * (l * 2) if checked.sum() == l.numel(): break for _ in range(max_iter): m = (l + r) / 2 checked = check_upper(lower, m).int() l = (1 - checked) * m + checked * l r = (1 - checked) * r + checked * m self.d_upper_left = r.clone() logger.debug('Done') def opt_init(self): super().opt_init() self.tp_right_lower_init = {} self.tp_right_upper_init = {} self.tp_left_lower_init = {} self.tp_left_upper_init = {} self.tp_both_lower_init = {} def _init_opt_parameters_impl(self, size_spec, name_start): """Implementation of init_opt_parameters for each start_node.""" l, u = self.inputs[0].lower, self.inputs[0].upper shape = [size_spec] + list(l.shape) alpha = torch.empty(14, *shape, device=l.device) alpha.data[:4] = ((l + u) / 2).unsqueeze(0).expand(4, *shape) alpha.data[4:6] = self.tp_right_lower_init[name_start].expand(2, *shape) alpha.data[6:8] = self.tp_right_upper_init[name_start].expand(2, *shape) alpha.data[8:10] = self.tp_left_lower_init[name_start].expand(2, *shape) alpha.data[10:12] = self.tp_left_upper_init[name_start].expand(2, *shape) alpha.data[12:14] = self.tp_both_lower_init[name_start].expand(2, *shape) return alpha def forward(self, x): return F.gelu(x) def bound_relax_impl(self, x, func, dfunc): lower, upper = x.lower, x.upper y_l, y_u = func(lower), func(upper) # k_direct is the slope of the line directly connect # (lower, func(lower)), (upper, func(upper)). k_direct = k = (y_u - y_l) / (upper - lower).clamp(min=1e-8) # Fixed bounds that cannot be optimized. self.mask_neg are the masks # for neurons with upper bound <= 0. # Upper bound for the case of input lower bound <= 0, is always the direct line. self.add_linear_relaxation( mask=torch.logical_or( torch.logical_or(self.mask_left_pos, self.mask_right_neg), self.mask_both ), type='upper', k=k_direct, x0=lower, y0=y_l) # Lower bound for the case of input upper bound >= 0, is always the direct line. self.add_linear_relaxation( mask=torch.logical_or(self.mask_left_neg, self.mask_right_pos), type='lower', k=k_direct, x0=lower, y0=y_l) # Indices of neurons with input upper bound >= sqrt(2), # whose optimal slope to lower bound on the right side was pre-computed. d_lower_right = self.retrieve_from_precompute( self.d_lower_right, upper - self.sqrt_2, lower) # Indices of neurons with input lower bound <= -sqrt(2), # whose optimal slope to lower bound on the left side was pre-computed. d_lower_left = self.retrieve_from_precompute( self.d_lower_left, -lower - self.sqrt_2, upper) # Indices of neurons with input lower bound <= sqrt(2), # whose optimal slope to upper bound on the right side was pre-computed. d_upper_right = self.retrieve_from_precompute( self.d_upper_right, -lower + self.sqrt_2, upper) # Indices of neurons with input lower bound <= sqrt(2), # whose optimal slope to upper bound on the right side was pre-computed. d_upper_left = self.retrieve_from_precompute( self.d_upper_left, -lower - self.sqrt_2, upper) if self.opt_stage in ['opt', 'reuse']: if not hasattr(self, 'alpha'): # Raise an error if alpha is not created. 
self._no_bound_parameters() ns = self._start # Clipping is done here rather than after `opt.step()` call # because it depends on pre-activation bounds self.alpha[ns].data[0:2] = torch.max( torch.min(self.alpha[ns][0:2], upper), lower) self.alpha[ns].data[2:4] = torch.max( torch.min(self.alpha[ns][2:4], upper), lower) self.alpha[ns].data[4:6] = torch.max( torch.min(self.alpha[ns][4:6], d_lower_right), lower) self.alpha[ns].data[6:8] = torch.max( self.alpha[ns][6:8], d_upper_right) self.alpha[ns].data[8:10] = torch.min( torch.max(self.alpha[ns][8:10], d_lower_left), upper) self.alpha[ns].data[10:12] = torch.min( self.alpha[ns][10:12], d_upper_left) self.alpha[ns].data[12:14] = torch.min( torch.max(self.alpha[ns][12:14], d_lower_left), d_lower_right) # shape [2, out_c, n, c, h, w]. tp_pos = self.alpha[ns][0:2] # For upper bound relaxation tp_neg = self.alpha[ns][2:4] # For lower bound relaxation tp_right_lower = self.alpha[ns][4:6] tp_right_upper = self.alpha[ns][6:8] tp_left_lower = self.alpha[ns][8:10] tp_left_upper = self.alpha[ns][10:12] tp_both_lower = self.alpha[ns][12:14] # No need to use tangent line, when the tangent point is at the left # side of the preactivation lower bound. Simply connect the two sides. mask_direct = torch.logical_and(self.mask_right, k_direct < dfunc(lower)) self.add_linear_relaxation( mask=mask_direct, type='lower', k=k_direct, x0=lower, y0=y_l) self.add_linear_relaxation( mask=torch.logical_or(self.mask_right_3, torch.logical_xor(self.mask_right, mask_direct)), type='lower', k=dfunc(tp_right_lower), x0=tp_right_lower) mask_direct = torch.logical_and(self.mask_left, k_direct > dfunc(upper)) self.add_linear_relaxation( mask=mask_direct, type='lower', k=k_direct, x0=lower, y0=y_l) self.add_linear_relaxation( mask=torch.logical_or(self.mask_left_3, torch.logical_xor(self.mask_left, mask_direct)), type='lower', k=dfunc(tp_left_lower), x0=tp_left_lower) mask_direct = torch.logical_and(self.mask_right, k_direct < dfunc(upper)) self.add_linear_relaxation( mask=mask_direct, type='upper', k=k_direct, x0=lower, y0=y_l) self.add_linear_relaxation( mask=torch.logical_xor(self.mask_right, mask_direct), type='upper', k=dfunc(tp_right_upper), x0=tp_right_upper) mask_direct = torch.logical_and(self.mask_left, k_direct > dfunc(lower)) self.add_linear_relaxation( mask=mask_direct, type='upper', k=k_direct, x0=lower, y0=y_l) self.add_linear_relaxation( mask=torch.logical_xor(self.mask_left, mask_direct), type='upper', k=dfunc(tp_left_upper), x0=tp_left_upper) self.add_linear_relaxation( mask=self.mask_4, type='lower', k=dfunc(tp_both_lower), x0=tp_both_lower) self.add_linear_relaxation( mask=torch.logical_or(torch.logical_or(self.mask_left_pos, self.mask_right_neg), self.mask_2), type='lower', k=dfunc(tp_neg), x0=tp_neg) self.add_linear_relaxation( mask=torch.logical_or(self.mask_right_pos, self.mask_left_neg), type='upper', k=dfunc(tp_pos), x0=tp_pos) else: if self.opt_stage == 'init': # Initialize optimizable slope. tp_right_lower_init = d_lower_right.detach() tp_right_upper_init = d_upper_right.detach() tp_left_lower_init = d_lower_left.detach() tp_left_upper_init = d_upper_left.detach() tp_both_lower_init = d_lower_right.detach() ns = self._start self.tp_right_lower_init[ns] = tp_right_lower_init self.tp_right_upper_init[ns] = tp_right_upper_init self.tp_left_lower_init[ns] = tp_left_lower_init self.tp_left_upper_init[ns] = tp_left_upper_init self.tp_both_lower_init[ns] = tp_both_lower_init # Not optimized (vanilla CROWN bound). 
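# Added worked example (hand-computed sketch, not from the original source; phi/Phi
# denote the standard normal pdf/cdf): take pre-activation bounds l = -1, u = 1,
# which fall into mask_2 since -sqrt(2) <= l < 0 < u <= sqrt(2). GELU is convex on
# this whole segment because gelu''(x) = phi(x) * (2 - x**2) >= 0 for |x| <= sqrt(2), so
#   * the upper bound is the chord through the endpoints:
#       k_direct = (gelu(1) - gelu(-1)) / 2 ~= (0.8413 + 0.1587) / 2 = 0.5,
#       i.e. gelu(x) <= 0.5 * (x + 1) - 0.1587 on [-1, 1];
#   * the (unoptimized) lower bound below is the tangent at the midpoint m = 0:
#       gelu'(0) = Phi(0) = 0.5 and gelu(0) = 0, i.e. gelu(x) >= 0.5 * x.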
# Use the middle point slope as the lower/upper bound. Not optimized. m = (lower + upper) / 2 y_m = func(m) k = dfunc(m) # Lower bound is the middle point slope for the case input upper bound <= 0. # Note that the upper bound in this case is the direct line between # (lower, func(lower)) and (upper, func(upper)). self.add_linear_relaxation( mask=torch.logical_or( torch.logical_or(self.mask_left_pos, self.mask_right_neg), self.mask_2 ), type='lower', k=k, x0=m, y0=y_m) # Upper bound is the middle point slope for the case input lower bound >= 0. # Note that the lower bound in this case is the direct line between # (lower, func(lower)) and (upper, func(upper)). self.add_linear_relaxation(mask=torch.logical_or(self.mask_right_pos, self.mask_left_neg), type='upper', k=k, x0=m, y0=y_m) # Now handle the case where input lower bound <=0 and upper bound >= 0. # A tangent line starting at d_lower is guaranteed to be a lower bound # given the input upper bound. mask_direct = torch.logical_and(self.mask_right, k_direct < dfunc(lower)) self.add_linear_relaxation(mask=mask_direct, type='lower', k=k_direct, x0=lower, y0=y_l) # Otherwise we do not use the direct line, we use the d_lower slope. self.add_linear_relaxation( mask=torch.logical_or(torch.logical_or(self.mask_right_3, self.mask_4), torch.logical_xor(self.mask_right, mask_direct)), type='lower', k=dfunc(d_lower_right), x0=d_lower_right) mask_direct = torch.logical_and(self.mask_left, k_direct > dfunc(upper)) self.add_linear_relaxation(mask=mask_direct, type='lower', k=k_direct, x0=lower, y0=y_l) self.add_linear_relaxation( mask=torch.logical_or(self.mask_left_3, torch.logical_xor(self.mask_left, mask_direct)), type='lower', k=dfunc(d_lower_left), x0=d_lower_left) mask_direct = torch.logical_and(self.mask_right, k_direct < dfunc(upper)) self.add_linear_relaxation( mask=mask_direct, type='upper', k=k_direct, x0=lower, y0=y_l) self.add_linear_relaxation( mask=torch.logical_xor(self.mask_right, mask_direct), type='upper', k=dfunc(d_upper_right), x0=d_upper_right) mask_direct = torch.logical_and(self.mask_left, k_direct > dfunc(lower)) self.add_linear_relaxation( mask=mask_direct, type='upper', k=k_direct, x0=lower, y0=y_l) self.add_linear_relaxation( mask=torch.logical_xor(self.mask_left, mask_direct), type='upper', k=dfunc(d_upper_left), x0=d_upper_left) def bound_relax(self, x, init=False, dim_opt=None): if init: self.init_linear_relaxation(x, dim_opt) self.bound_relax_impl(x, self.act_func, self.d_act_func) def interval_propagate(self, *v): pl, pu = self.forward(v[0][0]), self.forward(v[0][1]) pl, pu = torch.min(pl, pu), torch.max(pl, pu) min_global = self.forward(torch.tensor(-0.7517916)) pl, pu = torch.min(min_global, torch.min(pl, pu)), torch.max(pl, pu) return pl, pu class GELUOp(torch.autograd.Function): sqrt_2 = math.sqrt(2) sqrt_2pi = math.sqrt(2 * math.pi) @staticmethod def symbolic(g, x): return g.op('custom::Gelu', x) @staticmethod def forward(ctx, x): ctx.save_for_backward(x) return torch.nn.functional.gelu(x) @staticmethod def backward(ctx, grad_output): x, = ctx.saved_tensors grad_input = grad_output.clone() grad = (0.5 * (1 + torch.erf(x / GELUOp.sqrt_2)) + x * torch.exp(-0.5 * x ** 2) / GELUOp.sqrt_2pi) return grad_input * grad class GELU(nn.Module): def forward(self, x): return GELUOp.apply(x) ================================================ FILE: auto_LiRPA/operators/indexing.py ================================================ ######################################################################### ## This file is part of the 
auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### from .base import * from ..patches import Patches, patches_to_matrix from torch.nn import Module class BoundGather(Bound): def __init__(self, attr, x, output_index, options): super().__init__(attr, x, output_index, options) self.axis = attr['axis'] if 'axis' in attr else 0 def forward(self, x, indices): self.indices = indices if self.axis == -1: self.axis = len(x.shape) - 1 # BoundShape.shape() will return values on cpu only x = x.to(self.indices.device) if indices.ndim == 0: if indices == -1: self.indices = x.shape[self.axis] + indices return torch.index_select(x, dim=self.axis, index=self.indices).squeeze(self.axis) elif indices.ndim == 1: if self.axis == 0: assert not self.perturbed # `index_select` requires `indices` to be a 1-D tensor return torch.index_select(x, dim=self.axis, index=indices) raise ValueError('Unsupported shapes in Gather: ' f'data {x.shape}, indices {indices.shape}, ' f'axis {self.axis}') def bound_backward(self, last_lA, last_uA, *args, **kwargs): assert self.from_input def _expand_A_with_zeros(A, axis, idx, max_axis_size): # Need to recreate A with three parts: before the gathered element, gathered element, and after gathered element. tensors = [] if idx < 0: idx = max_axis_size + idx if idx > 0: shape_pre = list(A.shape) shape_pre[axis] *= idx # Create the same shape as A, except for the dimension to be gathered. tensors.append(torch.zeros(shape_pre, device=A.device)) # The gathered element itself, in the middle. tensors.append(A) if max_axis_size - idx - 1 > 0: shape_next = list(A.shape) shape_next[axis] *= max_axis_size - idx - 1 # Create the rest part of A. tensors.append(torch.zeros(shape_next, device=A.device)) # Concatenate all three parts together. return torch.cat(tensors, dim=axis) def _bound_oneside(A): if A is None: return None if isinstance(A, torch.Tensor): if self.indices.ndim == 0: A = A.unsqueeze(self.axis + 1) idx = int(self.indices) return _expand_A_with_zeros(A, self.axis + 1, idx, self.input_shape[self.axis]) else: shape = list(A.shape) final_A = torch.zeros(*shape[:self.axis + 1], self.input_shape[self.axis], *shape[self.axis + 2:], device=A.device) idx = self.indices.view([*[1]*(self.axis+1), -1, *[1]*len(shape[self.axis + 2:])]) idx = idx.repeat([*A.shape[:self.axis+1], 1, *A.shape[self.axis+2:]]) final_A.scatter_add_(dim=self.axis+1, index=idx, src=A) return final_A elif isinstance(A, Patches): if self.indices.ndim == 0: idx = int(self.indices) assert len(self.input_shape) == 4 and self.axis == 1, "Gather is only supported on the channel dimension for Patches mode." # For gather in the channel dimension, we only need to deal with the in_c dimension (-3) in patches. patches = A.patches # -3 is the in_c dimension. 
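# Added shape sketch (illustrative; the exact sizes depend on the caller): with an
# input of shape (N, C, H, W) and a scalar channel index `idx`, the patches at this
# point carry a size-1 channel slot at dim -3, e.g. (out_c, batch, out_h, out_w, 1,
# h, w). _expand_A_with_zeros below concatenates idx zero-channels before it and
# C - idx - 1 zero-channels after it along dim -3, so the padded patches line up
# with all C channels of the pre-Gather layer again.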
new_patches = _expand_A_with_zeros(patches, axis=-3, idx=idx, max_axis_size=self.input_shape[self.axis]) return A.create_similar(new_patches) else: raise NotImplementedError else: raise ValueError(f'Unknown last_A type {type(A)}') return [(_bound_oneside(last_lA), _bound_oneside(last_uA)), (None, None)], 0, 0 def bound_forward(self, dim_in, x, indices): assert self.indices.numel() == 1 and self.indices.ndim <= 1 and (self.indices >= 0).all() if isinstance(x, torch.Size): lw = uw = torch.zeros(dim_in, device=self.device) lb = ub = torch.index_select( torch.tensor(x, device=self.device), dim=self.axis, index=self.indices).squeeze(self.axis) else: axis = self.axis + 1 lw = torch.index_select(x.lw, dim=self.axis + 1, index=self.indices) uw = torch.index_select(x.uw, dim=self.axis + 1, index=self.indices) lb = torch.index_select(x.lb, dim=self.axis, index=self.indices) ub = torch.index_select(x.ub, dim=self.axis, index=self.indices) if self.indices.ndim == 0: lw = lw.squeeze(axis) uw = uw.squeeze(axis) lb = lb.squeeze(self.axis) ub = ub.squeeze(self.axis) return LinearBound(lw, lb, uw, ub) def interval_propagate(self, *v): assert not self.is_input_perturbed(1) return self.forward(v[0][0], v[1][0]), self.forward(v[0][1], v[1][0]) def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): self.solver_vars = self.forward(v[0], v[1]) def build_gradient_node(self, grad_upstream): return [(GatherGrad(self.axis, self.indices, self.input_shape), (grad_upstream,), []), None] class GatherGrad(Module): def __init__(self, axis, indices, input_shape): super().__init__() self.axis = axis self.indices = indices self.input_shape = input_shape def forward(self, grad_last): # TODO: It's better to use scatter_add_ instead of cat. # This is a workaround for the fact that scatter_add_ does not support negative indices. # Scalar indices case (ndim == 0) if self.indices.ndim == 0: grad_unsq = grad_last.unsqueeze(self.axis) # Get the scalar index and adjust if negative. idx = int(self.indices) if idx < 0: idx = self.input_shape[self.axis] + idx # Build the gradient by concatenating three parts along self.axis: tensors = [] # 1. Zeros block before the gathered element (if idx > 0) if idx > 0: shape_pre = list(grad_unsq.shape) shape_pre[self.axis] = idx # pre-block has size idx along self.axis zeros_pre = torch.zeros(shape_pre, dtype=grad_last.dtype, device=grad_last.device) tensors.append(zeros_pre) # 2. The gathered gradient slice (already in grad_unsq) tensors.append(grad_unsq) # 3. Zeros block after the gathered element num_after = self.input_shape[self.axis] - idx - 1 if num_after > 0: shape_post = list(grad_unsq.shape) shape_post[self.axis] = num_after zeros_post = torch.zeros(shape_post, dtype=grad_last.dtype, device=grad_last.device) tensors.append(zeros_post) # Concatenate all parts along self.axis to form the full gradient tensor. grad_input = torch.cat(tensors, dim=self.axis) return grad_input # 1-D indices case (ndim == 1) elif self.indices.ndim == 1: grad_slices = [] # Iterate over each position in the original input along self.axis. for i in range(self.input_shape[self.axis]): # matching: tensor of indices (in grad_last) where the gathered index equals i. matching = (self.indices == i).nonzero(as_tuple=False).squeeze(-1) if matching.numel() == 0: # No matching index: create a zeros slice with the same shape as one slice of grad_last. 
slice_shape = list(grad_last.shape) slice_shape[self.axis] = 1 # single slice along self.axis grad_slice = torch.zeros(slice_shape, dtype=grad_last.dtype, device=grad_last.device) else: # There are one or more matching positions. # For each matching index j, extract the corresponding slice from grad_last. slice_list = [] for j in matching.tolist(): # Build slicing object:select all elements, but at self.axis take index j. slicer = [slice(None)] * grad_last.dim() slicer[self.axis] = j # Extract the slice and add back the missing dimension. slice_j = grad_last[tuple(slicer)].unsqueeze(self.axis) slice_list.append(slice_j) # Concatenate all slices along self.axis; if there are duplicates, sum them. cat_slices = torch.cat(slice_list, dim=self.axis) # Sum along self.axis to accumulate contributions from duplicate indices. grad_slice = cat_slices.sum(dim=self.axis, keepdim=True) # Append the slice corresponding to position i. grad_slices.append(grad_slice) # Concatenate all slices in order along self.axis to form the final gradient tensor. grad_input = torch.cat(grad_slices, dim=self.axis) return grad_input else: raise ValueError("Unsupported indices dimensions in gradient for Gather") class BoundGatherElements(Bound): def __init__(self, attr, input, output_index, options): super().__init__(attr, input, output_index, options) self.axis = attr['axis'] def forward(self, x, index): self.index = index return torch.gather(x, dim=self.axis, index=index) def bound_backward(self, last_lA, last_uA, x, index, **kwargs): assert self.from_input dim = self._get_dim() def _bound_oneside(last_A): if last_A is None: return None A = torch.zeros( last_A.shape[0], last_A.shape[1], *x.output_shape[1:], device=last_A.device) A.scatter_( dim=dim + 1, index=self.index.unsqueeze(0).repeat(A.shape[0], *([1] * (A.ndim - 1))), src=last_A) return A return [(_bound_oneside(last_lA), _bound_oneside(last_uA)), (None, None)], 0, 0 def interval_propagate(self, *v): assert not self.is_input_perturbed(1) return self.forward(v[0][0], v[1][0]), \ self.forward(v[0][1], v[1][1]) def bound_forward(self, dim_in, x, index): assert self.axis != 0 dim = self._get_dim() return LinearBound( torch.gather(x.lw, dim=dim + 1, index=self.index.unsqueeze(1).repeat(1, dim_in, 1)), torch.gather(x.lb, dim=dim, index=self.index), torch.gather(x.uw, dim=dim + 1, index=self.index.unsqueeze(1).repeat(1, dim_in, 1)), torch.gather(x.ub, dim=dim, index=self.index)) def _get_dim(self): dim = self.axis if dim < 0: dim = len(self.output_shape) + dim return dim ================================================ FILE: auto_LiRPA/operators/jacobian.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### import torch from torch.nn import Module from .base import Bound from ..utils import prod class JacobianOP(torch.autograd.Function): @staticmethod def symbolic(g, output, input): return g.op('grad::jacobian', output, input).setType(output.type()) @staticmethod def forward(ctx, output, input): output_ = output.flatten(1) return torch.zeros( output.shape[0], output_.shape[-1], *input.shape[1:], device=output.device) class BoundJacobianOP(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) def forward(self, output, input): return JacobianOP.apply(output, input) class BoundJacobianInit(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.never_perturbed = True def forward(self, x): dim = prod(x.shape[1:]) eye = torch.eye(dim, device=x.device, requires_grad=x.requires_grad) eye = eye.unsqueeze(0).expand( x.shape[0], -1, -1 ).view(x.shape[0], dim, *x.shape[1:]) return eye class GradNorm(Module): def __init__(self, norm=1): super().__init__() self.norm = norm def forward(self, grad): grad = grad.view(grad.size(0), -1) if self.norm == 1: # torch.norm is not supported in auto_LiRPA yet # use simpler operators for now return grad.abs().sum(dim=-1, keepdim=True) elif self.norm == 2: return (grad * grad).sum(dim=-1) else: raise NotImplementedError(self.norm) ================================================ FILE: auto_LiRPA/operators/leaf.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """ Leaf nodes (indepedent nodes in the auto_LiRPA paper). Including input, parameter, buffer, etc.""" from itertools import chain from .base import * class BoundInput(Bound): def __init__(self, ori_name, value, perturbation=None, input_index=None, options=None, attr=None): super().__init__(options=options, attr=attr) self.ori_name = ori_name self.value = value self.perturbation = perturbation self.from_input = True self.input_index = input_index self.no_jacobian = True def __setattr__(self, key, value): super().__setattr__(key, value) # Update perturbed property based on the perturbation set. 
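# Added usage sketch (hedged; PerturbationLpNorm is the perturbation class defined
# elsewhere in this library): the hook below is what marks an input as perturbed,
#   node.perturbation = PerturbationLpNorm(norm=float('inf'), eps=0.1)  # -> node.perturbed == True
#   node.perturbation = None                                            # -> node.perturbed == False
# and downstream bound computation uses this flag to decide whether the node needs
# relaxation or can be treated as a constant.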
if key == "perturbation": if self.perturbation is not None: self.perturbed = True else: self.perturbed = False def forward(self): return self.value def bound_forward(self, dim_in): assert 0 def bound_backward(self, last_lA, last_uA, **kwargs): raise ValueError('{} is a BoundInput node and should not be visited here'.format( self.name)) def interval_propagate(self, *v): raise ValueError('{} is a BoundInput node and should not be visited here'.format( self.name)) class BoundParams(BoundInput): def __init__(self, ori_name, value, perturbation=None, options=None, attr=None): super().__init__(ori_name, None, perturbation, attr=attr) self.register_parameter('param', value) if options is None: options = {} self.auto_requires_grad = options.get("param", {}).get("auto_requires_grad", True) self.from_input = False def register_parameter(self, name, param): """Override register_parameter() hook to register only needed parameters.""" if name == 'param': return super().register_parameter(name, param) else: # Just register it as a normal property of class. object.__setattr__(self, name, param) def init(self, initializing=False): self.initializing = initializing def forward(self): param = self.param if self.auto_requires_grad: param = param.requires_grad_(self.training) return param class BoundBuffers(BoundInput): def __init__(self, ori_name, value, perturbation=None, options=None, attr=None): super().__init__(ori_name, None, perturbation, attr=attr) self.register_buffer('buffer', value.clone().detach()) # BoundBuffers are like constants and they are by default not from inputs. # The "has_batchdim" was a hack that will forcibly set BoundBuffer to be # from inputs, to workaround buffers with a batch size dimension. This is # not needed in most cases now. if 'buffers' in options and 'has_batchdim' in options['buffers']: warnings.warn('The "has_batchdim" option for BoundBuffers is deprecated.' ' It may be removed from the next release.') self.from_input = options.get('buffers', {}).get('has_batchdim', False) def forward(self): return self.buffer ================================================ FILE: auto_LiRPA/operators/linear.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### """ Linear (possibly with weight perturbation) or Dot product layers """ from torch import Tensor from torch.nn import Module from typing import Tuple, List from .activation_base import BoundOptimizableActivation from .base import * from .bivariate import BoundMul, MulHelper from .leaf import BoundParams, BoundBuffers from ..patches import Patches, inplace_unfold from .solver_utils import grb from .clampmult import multiply_by_A_signs EPS = 1e-2 class BoundLinear(BoundOptimizableActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None): # Gemm: # A = A if transA == 0 else A.T # B = B if transB == 0 else B.T # C = C if C is not None else np.array(0) # Y = alpha * np.dot(A, B) + beta * C # return Y super().__init__(attr, inputs, output_index, options) # Defaults in ONNX self.transA = 0 self.transB = 0 self.alpha_linear = 1.0 self.beta_linear = 1.0 if attr is not None: self.transA = attr['transA'] if 'transA' in attr else self.transA self.transB = attr['transB'] if 'transB' in attr else self.transB self.alpha_linear = attr['alpha'] if 'alpha' in attr else self.alpha_linear self.beta_linear = attr['beta'] if 'beta' in attr else self.beta_linear options = options or {} self.opt_matmul = options.get('matmul') self.splittable = False self.mul_helper = MulHelper() self.use_seperate_weights_for_lower_and_upper_bounds = False self.batched_weight_and_bias = False self.share_alphas = options.get('matmul', {}).get('share_alphas', False) self.mul_middle = options.get('mul', {}).get('middle', False) # For MatMul, it's possible that only the second input is perturbed. # In this case, we swap the roles of x and weight. self.swap_x_and_weight = False def _preprocess(self, a, b, c=None): """Handle tranpose and linear coefficients.""" if self.transA and isinstance(a, Tensor): a = a.transpose(-2,-1) if self.alpha_linear != 1.0: a = self.alpha_linear * a if not self.transB and isinstance(b, Tensor): # our code assumes B is transposed (common case), so we transpose B # only when it is not transposed in gemm. b = b.transpose(-2, -1) if c is not None: if self.beta_linear != 1.0: c = self.beta_linear * c return a, b, c def init_opt_parameters(self, start_nodes): shared_alpha_dims = [] if self.share_alphas: # TODO Temporarily an adhoc check for alpha sharing. count_matmul = len([item for item in self._all_optimizable_activations if isinstance(item, BoundLinear)]) if count_matmul >= 6: shared_alpha_dims = [1, 2, 3] elif count_matmul >= 4: shared_alpha_dims = [1, 2] input_lb = [xi.lower for xi in self.inputs] input_ub = [xi.upper for xi in self.inputs] input_lb = self._preprocess(*input_lb) input_ub = self._preprocess(*input_ub) x_l, x_u, y_l, y_u = self._reshape(input_lb[0], input_ub[0], input_lb[1], input_ub[1]) assert x_l.ndim == y_l.ndim shape = [1 if i in shared_alpha_dims else max(x_l.shape[i], y_l.shape[i]) for i in range(x_l.ndim)] for start_node in start_nodes: ns, size_s = start_node[:2] # start_node[3] == False means that this start node is not the final node # if not start_node[3]: # # NOTE Experimental code. Please check how it will impact the results. 
# size_s = 1 if isinstance(size_s, torch.Size): # TODO do not give torch.Size size_s = prod(size_s) elif isinstance(size_s, (list, tuple)): size_s = size_s[0] self.alpha[ns] = torch.ones(4, size_s, *shape, device=x_l.device) def forward(self, x, w, b=None): x, w, b = self._preprocess(x, w, b) self.input_shape = self.x_shape = x.shape self.y_shape = w.t().shape res = x.matmul(w.t()) if b is not None: res += b return res def onehot_mult(self, weight, bias, C, batch_size): """Multiply weight matrix with a diagonal matrix with selected rows.""" if C is None: return None, 0.0 new_weight = None new_bias = 0.0 if C.index.ndim == 2: # Shape is [spec, batch] index = C.index.transpose(0, 1) coeffs = C.coeffs.transpose(0, 1) else: index = C.index coeffs = C.coeffs if C.index.ndim == 1: # Every element in the batch shares the same rows. if weight is not None: new_weight = self.non_deter_index_select( weight, dim=0, index=index ).unsqueeze(1).expand( [-1, batch_size] + [-1] * (weight.ndim - 1)) if bias is not None: new_bias = self.non_deter_index_select( bias, dim=0, index=index ).unsqueeze(1).expand(-1, batch_size) elif C.index.ndim == 2: # Every element in the batch has different rows, but the number of # rows are the same. This essentially needs a batched index_select function. if weight is not None: new_weight = batched_index_select( weight.unsqueeze(0), dim=1, index=index) if bias is not None: new_bias = batched_index_select( bias.unsqueeze(0), dim=1, index=index) if C.coeffs is not None: if weight is not None: new_weight = new_weight * coeffs.unsqueeze(-1) if bias is not None: new_bias = new_bias * coeffs if C.index.ndim == 2: # Eventually, the shape of A is [spec, batch, *node] so need a transpose. new_weight = new_weight.transpose(0, 1) new_bias = new_bias.transpose(0, 1) return new_weight, new_bias def bound_backward(self, last_lA, last_uA, *x, start_node=None, reduce_bias=True, **kwargs): assert len(x) == 2 or len(x) == 3 if start_node is not None: self._start = start_node.name has_bias = len(x) == 3 # x[0]: input node, x[1]: weight, x[2]: bias input_lb = [xi.lower for xi in x] input_ub = [xi.upper for xi in x] if self.swap_x_and_weight: input_lb = [input_lb[1].transpose(-1, -2) if input_lb[1] is not None else None, input_lb[0].transpose(-1, -2) if input_lb[0] is not None else None, input_lb[2:]] input_ub = [input_ub[1].transpose(-1, -2) if input_ub[1] is not None else None, input_ub[0].transpose(-1, -2) if input_ub[0] is not None else None, input_ub[2:]] if last_lA is not None: if isinstance(last_lA, torch.Tensor): last_lA = last_lA.transpose(-1, -2) elif isinstance(last_lA, eyeC): last_lA = last_lA._replace(shape=last_lA.shape[:-2] + (last_lA.shape[-1], last_lA.shape[-2])) else: raise NotImplementedError( f"last_lA's type {type(last_lA)} is not supported for transpose in the case of swapping x and weight.") if last_uA is not None: if isinstance(last_uA, torch.Tensor): last_uA = last_uA.transpose(-1, -2) elif isinstance(last_uA, eyeC): last_uA = last_uA._replace(shape=last_uA.shape[:-2] + (last_uA.shape[-1], last_uA.shape[-2])) else: raise NotImplementedError( f"last_uA's type {type(last_uA)} is not supported for transpose in the case of swapping x and weight.") # transpose and scale each term if necessary. 
input_lb = self._preprocess(*input_lb) input_ub = self._preprocess(*input_ub) lA_y = uA_y = lA_bias = uA_bias = None lbias = ubias = 0 batch_size = last_lA.shape[1] if last_lA is not None else last_uA.shape[1] weight = input_lb[1] bias = input_lb[2] if has_bias else None def _bound_oneside(last_A, weight_override=None): # For most applications, weight_override should be left as None # This will cause used_weight to be set to weight, which is the weight # assigned to input_lb[1]. The only reason provide an override weight # is if this layer has different weights for it's lower and upper bounds. # That is currently only the case for the implementation of output # constraints, where lower and upper bounds use distinct gammas. if weight_override is None: used_weight = weight else: used_weight = weight_override if last_A is None: return None, 0 if isinstance(last_A, torch.Tensor): # Matrix mode. # Just multiply this layer's weight into bound matrices, and produce biases. if self.batched_weight_and_bias: # last_A is the A at the current layer (self) # next_A is the A for the layer consumed by the current (self) one # "next_A" makes sense because we're backpropagating. However, the below shapes # will refer to "prev_layer", which also is the layer that is consumed by # the current (self) one. That's because they should match the documentation in # output_constraints.py, which is written from a "forward facing" point of view. # We have: last_A.shape = (unstable_neurons, batch_size, this_layer_neurons) # We want: next_A.shape = (unstable_neurons, batch_size, prev_layer_neurons) # We also have # used_weight.shape = (batch_size, this_layer_neurons, prev_layer_neurons) mod_last_A = last_A.unsqueeze(2) mod_used_weight = used_weight.unsqueeze(0) # mod_last_A.shape = (unstable_neurons, batch_size, 1, this_layer_neurons) # mod_used_weight.shape = (1, batch_size, this_layer_neurons, prev_layer_neurons) mod_next_A = mod_last_A.to(mod_used_weight).matmul(mod_used_weight) # mod_next_A.shape = (unstable_neurons, batch_size, 1, prev_layer_neurons) next_A = mod_next_A.squeeze(2) # next_A.shape = (unstable_neurons, batch_size, prev_layer_neurons) if has_bias: # bias.shape = (batch_size, this_layer_neurons) mod_bias = bias.unsqueeze(0).unsqueeze(3) # mod_bias.shape = (1, batch_size, this_layer_neurons, 1) # mod_last_A.shape = (unstable_neurons, batch_size, 1, this_layer_neurons) mod_sum_bias = mod_last_A.to(mod_bias).matmul(mod_bias) # mod_sum_bias.shape = (unstable_neurons, batch_size, 1, 1) sum_bias = mod_sum_bias.squeeze(3).squeeze(2) # sum_bias.shape = (unstable_neurons, batch_size) else: next_A = last_A.to(used_weight).matmul(used_weight) sum_bias = (last_A.to(bias).matmul(bias) if has_bias else 0.0) else: assert isinstance(last_A, Patches) assert not self.batched_weight_and_bias # Patches mode. After propagating through this layer, it will become a matrix. # Reshape the weight matrix as a conv image. # Weight was in (linear_output_shape, linear_input_shape) # Reshape it to (linear_input_shape, c, h, w) reshaped_weight = used_weight.transpose(0, 1).view( -1, *last_A.input_shape[1:]) # After unfolding the shape is # (linear_input_shape, output_h, output_w, in_c, patch_h, patch_w) unfolded_weight = inplace_unfold( reshaped_weight, kernel_size=last_A.patches.shape[-2:], stride=last_A.stride, padding=last_A.padding, inserted_zeros=last_A.inserted_zeros, output_padding=last_A.output_padding) if has_bias: # Do the same for the bias. 
reshaped_bias = bias.view(*last_A.input_shape[1:]).unsqueeze(0) # After unfolding the bias shape is (1, output_h, output_w, in_c, patch_h, patch_w) unfolded_bias = inplace_unfold( reshaped_bias, kernel_size=last_A.patches.shape[-2:], stride=last_A.stride, padding=last_A.padding, inserted_zeros=last_A.inserted_zeros, output_padding=last_A.output_padding) if last_A.unstable_idx is not None: # In this case, the last_A shape is (num_unstable, batch, out_c, patch_h, patch_w) # Reshape our weight to (output_h, output_w, 1, in_c, patch_h, patch_w, linear_input_shape), 1 is the inserted batch dim. unfolded_weight_r = unfolded_weight.permute(1, 2, 3, 4, 5, 0).unsqueeze(2) # for sparse patches the shape is (unstable_size, batch, in_c, patch_h, patch_w). Batch size is 1 so no need to select here. # We select in the (output_h, out_w) dimension. selected_weight = unfolded_weight_r[last_A.unstable_idx[1], last_A.unstable_idx[2]] next_A = torch.einsum('sbchw,sbchwi->sbi', last_A.patches, selected_weight) if has_bias: # Reshape our bias to (output_h, output_w, 1, in_c, patch_h, patch_w). We already have the batch dim. unfolded_bias_r = unfolded_bias.permute(1, 2, 0, 3, 4, 5) selected_bias = unfolded_bias_r[last_A.unstable_idx[1], last_A.unstable_idx[2]] sum_bias = torch.einsum('sbchw,sbchw->sb', last_A.patches, selected_bias) else: # Reshape our weight to (1, 1, output_h, output_w, in_c, patch_h, patch_w, linear_input_shape), 1 is the spec and batch. selected_weight = unfolded_weight.permute(1, 2, 3, 4, 5, 0).unsqueeze(0).unsqueeze(0) next_A_r = torch.einsum('sbpqchw,sbpqchwi->spqbi', last_A.patches, selected_weight) # We return a matrix with flattened spec dimension (corresponding to out_c * out_h * out_w). next_A = next_A_r.reshape(-1, next_A_r.size(-2), next_A_r.size(-1)) if has_bias: # Reshape our bias to (1, 1, output_h, output_w, in_c, patch_h, patch_w) selected_bias = unfolded_bias.unsqueeze(0) sum_bias_r = torch.einsum('sbpqchw,sbpqchw->spqb', last_A.patches, selected_bias) sum_bias = sum_bias_r.reshape(-1, sum_bias_r.size(-1)) return next_A, sum_bias if has_bias else 0.0 # Case #1: No weight/bias perturbation, only perturbation on input. if ((not self.is_input_perturbed(0) or not self.is_input_perturbed(1)) and (not has_bias or not self.is_input_perturbed(2))): # If last_lA and last_uA are indentity matrices. # FIXME (12/28): we should check last_lA and last_uA separately. # Same applies to the weight perturbed, bias perturbed settings. def multiply_with_weight(weight, set_l: bool, set_u: bool): lA_x = uA_x = None lbias = ubias = 0. if isinstance(last_lA, eyeC) and isinstance(last_uA, eyeC): # Use this layer's W as the next bound matrices. # Shape of inputs: (B, s_k, s_{k-1}, ..., s_1, m, n) @ (s_l, s_{l-1}, ..., s_1, n, p) # or (B, s_k, s_{k-1}, ..., s_1, m, n) @ (B, s_k, s_{k-1}, ..., s_1, n, p) # Shape of output: (B, s_k, ..., s_1, m, p) # last_lA: (specs, B, s_k, ..., s_1, m, p) # weight: (s_l, ..., s_1, p, n) where l <= k, or (B, s_k, ..., s_1, p, n) if len(last_lA.shape) == 3: # input x is a vector m = 1 p = last_lA.shape[-1] else: # general input shape m, p = last_lA.shape[-2:] n = weight.size(-1) assert last_lA.shape == last_uA.shape # shape of "broadcast dimensions" \prod_{i=1...k} s_i shape_broadcast = last_lA.shape[2:-2] prod_broadcast = prod(shape_broadcast) ndim_broadcast = len(shape_broadcast) assert weight.ndim - 3 <= ndim_broadcast, "Broadcasting on input 'x' is not supported." 
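# Added note (sketch of the simplest case, assuming no extra broadcast dims and a
# 2-D input): for a plain nn.Linear(n, p) applied to x of shape (B, n), an eyeC
# last_lA/last_uA means the backward pass starts from an identity over the p
# outputs, so the propagated coefficient tensor is just this layer's weight
# replicated over the batch,
#   last_lA: eyeC of shape (p, B, p)  ->  lA_x of shape (p, B, n), equal to W,
# and the accumulated bias term is b repeated over the batch, of shape (p, B).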
weight_has_batch = weight.ndim - 3 == ndim_broadcast # A_identity: (s_k, ...s_1, m, 1, s_k, ..., s_1, m, 1) where two 1s are for the two "matmul dimensions" A_identity = torch.eye( prod_broadcast * m, device=weight.device, dtype=weight.dtype ).view(*shape_broadcast, m, 1, *shape_broadcast, m, 1) # Assert specs = {product of shape of output} = \prod s_i * m * p assert last_lA.shape[0] == prod_broadcast * m * p if not weight_has_batch: # Pad the "broadcast dimensions" of weight according to shape of input # (s_l, ..., s_1, p, n) -> (1, ..., 1, s_l, ..., s_1, p, n) where there are (k-l) 1s w_padding = weight.reshape(*[1] * (ndim_broadcast + 2 - len(weight.shape)), *weight.shape) # Duplicate the "broadcast dimensions" to match both sides of A_identity # (*broadcast_dims, p, n) -> (*broadcast_dims, p, *broadcast_dims, n) w_eye_mask = torch.eye(prod_broadcast, device=weight.device, dtype=weight.dtype).reshape(*shape_broadcast, 1, *shape_broadcast, 1) w = w_eye_mask * w_padding.reshape(*w_padding.shape[:-1], *[1] * (len(w_padding.shape) - 2), w_padding.size(-1)) # Add two slots for the "m" dimension in A_identity # (*broadcast_dims, p, *broadcast_dims, n) -> (*broadcast_dims, 1, p, *broadcast_dims, 1, n) w = w.view(*w.shape[:ndim_broadcast], 1, p, *w.shape[:ndim_broadcast], 1, n) w = w * A_identity # (*broadcast_dims, m, p, *broadcast_dims, m, n) # expand the batch_size dim # (*broadcast_dims, m, p, *broadcast_dims, m, n) -> (Prod(broadcast_dims)*m*p, B, *broadcast_dims, m, n) tmp_A_x = w.reshape(last_lA.shape[0], 1, *last_lA.shape[2:-1], weight.size(-1)).expand(last_lA.shape[0], *last_lA.shape[1:-1], weight.size(-1)) else: # There's no need to pad the weight tensor if it has a batch dimension. # Duplicate the "broadcast dimensions" to match both sides of A_identity # (B, *broadcast_dims, p, n) -> (B, *broadcast_dims, p, *broadcast_dims, n) w_eye_mask = torch.eye(prod_broadcast, device=weight.device, dtype=weight.dtype).reshape(*shape_broadcast, 1, *shape_broadcast, 1) w = w_eye_mask * weight.reshape(*weight.shape[:-1], *[1] * (len(weight.shape) - 3), weight.size(-1)) # Add two slots for the "m" dimension in A_identity # (B, *broadcast_dims, p, *broadcast_dims, n) -> (B, *broadcast_dims, 1, p, *broadcast_dims, 1, n) w = w.view(w.shape[0], *w.shape[1:ndim_broadcast+1], 1, p, *w.shape[1:ndim_broadcast+1], 1, n) w = w * A_identity # (B, *broadcast_dims, m, p, *broadcast_dims, m, n) # (B, *broadcast_dims, m, p, *broadcast_dims, m, n) -> (Prod(broadcast_dims)*m*p, B, *broadcast_dims, m, n) tmp_A_x = w.reshape(w.shape[0], last_lA.shape[0], *last_lA.shape[2:-1], weight.size(-1)).transpose(0, 1) if set_l: lA_x = tmp_A_x if set_u: uA_x = tmp_A_x if has_bias: tmp_bias = bias.unsqueeze(1).repeat(1, batch_size) if set_l: lbias = tmp_bias if set_u: ubias = tmp_bias elif isinstance(last_lA, OneHotC) or isinstance(last_uA, OneHotC): # We need to select several rows from the weight matrix # (its shape is output_size * input_size). 
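# Added note (hedged summary of the OneHotC branch below): a OneHotC specification
# stores only the indices of the selected output rows (plus optional per-row
# coefficients) rather than a dense C matrix, so onehot_mult() can index_select()
# the corresponding rows of W and b directly. For example, when bounding a chosen
# subset of this layer's own neurons, each specification touches exactly one row
# of W, and only those rows are ever materialized.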
if set_l: lA_x, lbias = self.onehot_mult(weight, bias, last_lA, batch_size) if last_lA is last_uA and set_l and set_u: uA_x = lA_x ubias = lbias elif set_u: uA_x, ubias = self.onehot_mult(weight, bias, last_uA, batch_size) else: if set_l: lA_x, lbias = _bound_oneside(last_lA, weight_override=weight) if set_u: uA_x, ubias = _bound_oneside(last_uA, weight_override=weight) return lA_x, uA_x, lbias, ubias if self.use_seperate_weights_for_lower_and_upper_bounds: lA_x, _, lbias, _ = multiply_with_weight(input_lb[1], set_l=True, set_u=False) _, uA_x, _, ubias = multiply_with_weight(input_ub[1], set_l=False, set_u=True) else: lA_x, uA_x, lbias, ubias = multiply_with_weight(weight, set_l=True, set_u=True) # Case #2: weight is perturbed. bias may or may not be perturbed. elif self.is_input_perturbed(1): assert not self.use_seperate_weights_for_lower_and_upper_bounds # Obtain relaxations for matrix multiplication. [(lA_x, uA_x), (lA_y, uA_y)], lbias, ubias = self.bound_backward_with_weight( last_lA, last_uA, input_lb, input_ub, x[0], x[1], reduce_bias=reduce_bias, **kwargs) if has_bias: assert reduce_bias if x[2].perturbation is not None: # Bias is also perturbed. Since bias is directly added to the # output, in backward mode it is treated as an input with # last_lA and last_uA as associated bounds matrices. # It's okay if last_lA or last_uA is eyeC, as it will be # handled in the perturbation object. lA_bias = last_lA uA_bias = last_uA else: # Bias not perturbed, so directly adding the bias of this # layer to the final bound bias term. if isinstance(last_lA, eyeC) and isinstance(last_uA, eyeC): # Bias will be directly added to output. lbias += input_lb[2].unsqueeze(1).repeat(1, batch_size) ubias += input_lb[2].unsqueeze(1).repeat(1, batch_size) else: if last_lA is not None: lbias += last_lA.matmul(input_lb[2]) if last_uA is not None: ubias += last_uA.matmul(input_lb[2]) # If not has_bias, no need to compute lA_bias and uA_bias # Case 3: Only bias is perturbed, weight is not perturbed. elif not self.is_input_perturbed(1) and has_bias and self.is_input_perturbed(2): assert not self.use_seperate_weights_for_lower_and_upper_bounds assert reduce_bias if isinstance(last_lA, eyeC) and isinstance(last_uA, eyeC): # Use this layer's W as the next bound matrices. Duplicate the # batch dimension. Other dimensions are kept 1. lA_x = uA_x = input_lb[1].unsqueeze(1).repeat( [1, batch_size] + [1] * (input_lb[1].ndim - 1)) else: lA_x = last_lA.matmul(input_lb[1]) uA_x = last_uA.matmul(input_lb[1]) # It's okay if last_lA or last_uA is eyeC, as it will be handled in the perturbation object. 
lA_bias = last_lA uA_bias = last_uA else: assert not self.use_seperate_weights_for_lower_and_upper_bounds if self.swap_x_and_weight: return [(None, None), (lA_x.transpose(-1, -2) if lA_x is not None else None, uA_x.transpose(-1, -2) if uA_x is not None else None), (lA_bias, uA_bias)], lbias, ubias return [(lA_x, uA_x), (lA_y, uA_y), (lA_bias, uA_bias)], lbias, ubias def _reshape(self, x_l, x_u, y_l, y_u): x_shape, y_shape = self.input_shape, self.y_shape # (x_1, x_2, ..., x_{n-1}, -1, x_n) # FIXME x_l = x_l.unsqueeze(-2) x_u = x_u.unsqueeze(-2) # FIXME merge these two cases if len(x_shape) == len(y_shape): # (x_1, x_2, ..., -1, y_n, y_{n-1}) y_l = y_l.unsqueeze(-3) y_u = y_u.unsqueeze(-3) elif len(y_shape) == 2: # (x_1, x_2, ..., -1, y_2, y_1) y_l = y_l.reshape(*([1] * (len(x_shape) - 2)), *y_shape).unsqueeze(-3) y_u = y_u.reshape(*([1] * (len(x_shape) - 2)), *y_shape).unsqueeze(-3) else: raise ValueError(f'Unsupported shapes: x_shape {x_shape}, y_shape {y_shape}') return x_l, x_u, y_l, y_u @staticmethod # @torch.jit.script def propagate_A_xy(last_A: Tensor, alpha_pos: Tensor, alpha_neg: Tensor, beta_pos: Tensor, beta_neg: Tensor, dim_y: List[int]) -> Tuple[Tensor, Tensor]: # last_uA has size (batch, spec, output) last_A_pos = last_A.clamp(min=0).unsqueeze(-1) last_A_neg = last_A.clamp(max=0).unsqueeze(-1) # alpha_u has size (batch, spec, output, input) # uA_x has size (batch, spec, input). A_x = (alpha_pos.transpose(-1, -2).matmul(last_A_pos) + alpha_neg.transpose(-1, -2).matmul(last_A_neg)).squeeze(-1) # beta_u has size (batch, spec, output, input) # uA_y is for weight matrix, with parameter size (output, input) # uA_y has size (batch, spec, output, input). This is an element-wise multiplication. # TODO (for zhouxing/qirui): generalize multiply_by_A_signs() to calculate A_x, # so last_A_pos and last_A_neg are not needed. This saves memory. A_y, _ = multiply_by_A_signs(last_A.unsqueeze(-1), beta_pos, beta_neg, None, None) if len(dim_y) != 0: A_y = torch.sum(A_y, dim=dim_y) return A_x, A_y def bound_backward_with_weight(self, last_lA, last_uA, input_lb, input_ub, x, y, reduce_bias=True, **kwargs): # FIXME This is nonlinear. Move to `bivariate.py`. # Note: x and y are not tranposed or scaled, and we should avoid using them directly. # Use input_lb and input_ub instead. (alpha_l, beta_l, gamma_l, alpha_u, beta_u, gamma_u) = self.mul_helper.get_relaxation( *self._reshape(input_lb[0], input_ub[0], input_lb[1], input_ub[1]), self.opt_stage, getattr(self, 'alpha', None), getattr(self, '_start', None), middle=self.mul_middle) x_shape = input_lb[0].size() if reduce_bias: gamma_l = torch.sum(gamma_l, dim=-1) gamma_u = torch.sum(gamma_u, dim=-1) if len(x.output_shape) != 2 and len(x.output_shape) == len(y.output_shape): dim_y = [-3] elif len(y.output_shape) == 2: dim_y = list(range(2, 2 + len(x_shape) - 2)) else: raise NotImplementedError def _bound_oneside(last_A, alpha_pos, beta_pos, gamma_pos, alpha_neg, beta_neg, gamma_neg): if last_A is None: return None, None, 0 if isinstance(last_A, eyeC): # FIXME (12/28): Handle the OneHotC case. 
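# Added note (paraphrase of the standard bilinear/McCormick relaxation whose
# coefficients MulHelper.get_relaxation() produced above; the names alpha/beta/gamma
# follow that call): for z = x * y with x in [x_l, x_u] and y in [y_l, y_u],
#   z >= y_l * x + x_l * y - x_l * y_l   (a valid lower plane)
#   z <= y_u * x + x_l * y - x_l * y_u   (a valid upper plane)
# i.e. each plane is linear in x (alpha), linear in y (beta), plus a constant
# (gamma); propagate_A_xy() then routes the positive/negative parts of last_A to
# the appropriate plane.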
#FIXME previous implementation is incorrect # expanding eyeC for now last_A = (torch.eye(last_A.shape[0], device=last_A.device) .view(last_A.shape[0], 1, *last_A.shape[2:]).expand(last_A.shape)) A_x, A_y = BoundLinear.propagate_A_xy( last_A, alpha_pos, alpha_neg, beta_pos, beta_neg, dim_y) if reduce_bias: # last_uA has size (batch, spec, output) # gamma_u has size (batch, output, 1) # ubias has size (batch, spec, 1) if self.opt_stage in ['opt', 'reuse']: bias = (torch.einsum('sb...,sb...->sb', last_A.clamp(min=0), gamma_pos) + torch.einsum('sb...,sb...->sb', last_A.clamp(max=0), gamma_neg)) else: bias = ( self.get_bias(last_A.clamp(min=0), gamma_pos) + self.get_bias(last_A.clamp(max=0), gamma_neg) ) else: assert self.batch_dim == 0 assert self.opt_stage not in ['opt', 'reuse'] assert dim_y == [-3] bias = (last_A.unsqueeze(-1).clamp(min=0) * gamma_pos + last_A.unsqueeze(-1).clamp(max=0) * gamma_neg) bias_x = bias.sum(dim=-2) bias_y = bias.sum(dim=-3) bias = (bias_x, bias_y) return A_x, A_y, bias if self.opt_stage in ['opt', 'reuse']: lA_x, lA_y, lbias = _bound_oneside( last_lA, alpha_l[0], beta_l[0], gamma_l[0], alpha_u[0], beta_u[0], gamma_u[0]) uA_x, uA_y, ubias = _bound_oneside( last_uA, alpha_u[1], beta_u[1], gamma_u[1], alpha_l[1], beta_l[1], gamma_l[1]) else: lA_x, lA_y, lbias = _bound_oneside( last_lA, alpha_l, beta_l, gamma_l, alpha_u, beta_u, gamma_u) uA_x, uA_y, ubias = _bound_oneside( last_uA, alpha_u, beta_u, gamma_u, alpha_l, beta_l, gamma_l) return [(lA_x, uA_x), (lA_y, uA_y)], lbias, ubias @staticmethod def _propagate_Linf(x, w): h_L, h_U = x mid = (h_L + h_U) / 2 diff = (h_U - h_L) / 2 w_abs = w.abs() if mid.ndim == 2 and w.ndim == 3: center = torch.bmm(mid.unsqueeze(1), w.transpose(-1, -2)).squeeze(1) deviation = torch.bmm(diff.unsqueeze(1), w_abs.transpose(-1, -2)).squeeze(1) else: center = mid.matmul(w.transpose(-1, -2)) deviation = diff.matmul(w_abs.transpose(-1, -2)) return center, deviation def interval_propagate(self, *v, C=None, w=None): has_bias = self is not None and len(v) == 3 if self is not None: # This will convert an Interval object to tuple. # We need to add perturbation property later. v_lb, v_ub = zip(*v) v_lb = self._preprocess(*v_lb) v_ub = self._preprocess(*v_ub) # After preprocess the lower and upper bounds, we make them Intervals again. v = [Interval.make_interval(bounds[0], bounds[1], bounds[2]) for bounds in zip(v_lb, v_ub, v)] if w is None and self is None: # Use C as the weight, no bias. w, lb, ub = C, torch.tensor(0., device=C.device), torch.tensor(0., device=C.device) else: if w is None: # No specified weight, use this layer's weight. if self.is_input_perturbed(1): # input index 1 is weight. # w is a perturbed tensor. Use IBP with weight perturbation. # C matrix merging not supported. assert C is None res = self.interval_propagate_with_weight(*v) l, u = res if has_bias: return l + v[2][0], u + v[2][1] else: return l, u else: # Use weight w = v[1][0] if has_bias: lb, ub = v[2] else: lb = ub = 0.0 if C is not None: w = C.matmul(w) lb = C.matmul(lb) if not isinstance(lb, float) else lb ub = C.matmul(ub) if not isinstance(ub, float) else ub # interval_propagate() of the Linear layer may encounter input with different norms. norm, eps = Interval.get_perturbation(v[0])[:2] if norm == torch.inf: interval = BoundLinear._propagate_Linf(v[0], w) center, deviation = interval elif norm > 0: # General Lp norm. norm, eps = Interval.get_perturbation(v[0]) mid = v[0][0] dual_norm = np.float64(1.0) / (1 - 1.0 / norm) if w.ndim == 3: # Extra batch dimension. 
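# Added note (standard Hoelder / dual-norm argument behind the code below): for a
# perturbation with ||delta||_p <= eps around the nominal input `mid`, the extreme
# values of a linear form are
#   max / min over ||delta||_p <= eps of  w . (mid + delta) = w . mid +/- eps * ||w||_q,
# with 1/p + 1/q = 1, which is why `deviation` is computed as the dual (q) norm of
# each weight row times eps and then added to / subtracted from `center`.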
# mid has dimension [batch, input], w has dimension [batch, output, input]. center = w.matmul(mid.unsqueeze(-1)).squeeze(-1) else: # mid has dimension [batch, input], w has dimension [output, input]. center = mid.matmul(w.t()) deviation = w.norm(dual_norm, dim=-1) * eps else: # here we calculate the L0 norm IBP bound of Linear layers, # using the bound proposed in [Certified Defenses for Adversarial Patches, ICLR 2020] norm, eps, ratio = Interval.get_perturbation(v[0]) mid = v[0][0] weight_abs = w.abs() if w.ndim == 3: # Extra batch dimension. # mid has dimension [batch, input], w has dimension [batch, output, input]. center = w.matmul(mid.unsqueeze(-1)).squeeze(-1) else: # mid has dimension [batch, input], w has dimension [output, input]. center = mid.matmul(w.t()) # L0 norm perturbation k = int(eps) deviation = torch.sum(torch.topk(weight_abs, k)[0], dim=1) * ratio lower, upper = center - deviation + lb, center + deviation + ub return (lower, upper) def interval_propagate_with_weight(self, *v): input_norm, input_eps = Interval.get_perturbation(v[0]) weight_norm, weight_eps = Interval.get_perturbation(v[1]) if input_norm == torch.inf and weight_norm == torch.inf: # A memory-efficient implementation without expanding all the elementary multiplications if self.opt_matmul == 'economic': x_l, x_u = v[0][0], v[0][1] y_l, y_u = v[1][0].transpose(-1, -2), v[1][1].transpose(-1, -2) dx, dy = F.relu(x_u - x_l), F.relu(y_u - y_l) base = x_l.matmul(y_l) mask_xp, mask_xn = (x_l > 0).to(x_l.dtype), (x_u < 0).to(x_u.dtype) mask_xpn = 1 - mask_xp - mask_xn mask_yp, mask_yn = (y_l > 0).to(y_l.dtype), (y_u < 0).to(y_u.dtype) mask_ypn = 1 - mask_yp - mask_yn lower, upper = base.clone(), base.clone() lower += dx.matmul(y_l.clamp(max=0)) - (dx * mask_xn).matmul(y_l * mask_ypn) upper += dx.matmul(y_l.clamp(min=0)) + (dx * mask_xp).matmul(y_l * mask_ypn) lower += x_l.clamp(max=0).matmul(dy) - (x_l * mask_xpn).matmul(dy * mask_yn) upper += x_l.clamp(min=0).matmul(dy) + (x_l * mask_xpn).matmul(dy * mask_yp) lower += (dx * mask_xn).matmul(dy * mask_yn) upper += (dx * (mask_xpn + mask_xp)).matmul(dy * (mask_ypn + mask_yp)) else: # Both input data and weight are Linf perturbed (with upper and lower bounds). # We need a x_l, x_u for each row of weight matrix. x_l, x_u = v[0][0].unsqueeze(-2), v[0][1].unsqueeze(-2) y_l, y_u = v[1][0].unsqueeze(-3), v[1][1].unsqueeze(-3) # Reuse the multiplication bounds and sum over results. lower, upper = BoundMul.interval_propagate_both_perturbed(*[(x_l, x_u), (y_l, y_u)]) lower, upper = torch.sum(lower, -1), torch.sum(upper, -1) return lower, upper elif input_norm == torch.inf and weight_norm == 2: # This eps is actually the epsilon per row, as only one row is involved for each output element. eps = weight_eps # Input data and weight are Linf perturbed (with upper and lower bounds). h_L, h_U = v[0] # First, handle non-perturbed weight with Linf perturbed data. center, deviation = BoundLinear._propagate_Linf(v[0], v[1][0]) # Compute the maximal L2 norm of data. Size is [batch, 1]. max_l2 = torch.max(h_L.abs(), h_U.abs()).norm(2, dim=-1).unsqueeze(-1) # Add the L2 eps to bounds. 
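# Why max_l2 * eps is sound (hedged derivation): for any weight perturbation Delta with
# row-wise ||Delta_i||_2 <= eps and any input x inside the box [h_L, h_U], Cauchy-Schwarz
# gives |Delta_i . x| <= eps * ||x||_2 <= eps * || max(|h_L|, |h_U|) ||_2, which is the
# `max_l2` term computed above. Adding/subtracting it on top of the Linf propagation
# through the nominal weight therefore covers both perturbations simultaneously.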
lb, ub = center - deviation - max_l2 * eps, center + deviation + max_l2 * eps return lb, ub else: raise NotImplementedError( "Unsupported perturbation combination: data={}, weight={}".format(input_norm, weight_norm)) @staticmethod @torch.jit.script def bound_forward_mul(x_lw: Tensor, x_lb: Tensor, x_uw: Tensor, x_ub: Tensor, w: Tensor, weight_has_batch: bool = False, swap_x_and_weight: bool = False): w_pos = w.clamp(min=0) w_neg = w.clamp(max=0) if swap_x_and_weight: lw = matmul_maybe_batched(w_pos, x_lw, weight_has_batch) + matmul_maybe_batched(w_neg, x_uw, weight_has_batch) uw = matmul_maybe_batched(w_pos, x_uw, weight_has_batch) + matmul_maybe_batched(w_neg, x_lw, weight_has_batch) lb = matmul_maybe_batched(w_pos, x_lb, weight_has_batch) + matmul_maybe_batched(w_neg, x_ub, weight_has_batch) ub = matmul_maybe_batched(w_pos, x_ub, weight_has_batch) + matmul_maybe_batched(w_neg, x_lb, weight_has_batch) else: lw = matmul_maybe_batched(x_lw, w_pos, weight_has_batch) + matmul_maybe_batched(x_uw, w_neg, weight_has_batch) uw = matmul_maybe_batched(x_uw, w_pos, weight_has_batch) + matmul_maybe_batched(x_lw, w_neg, weight_has_batch) lb = matmul_maybe_batched(x_lb, w_pos, weight_has_batch) + matmul_maybe_batched(x_ub, w_neg, weight_has_batch) ub = matmul_maybe_batched(x_ub, w_pos, weight_has_batch) + matmul_maybe_batched(x_lb, w_neg, weight_has_batch) return lw, lb, uw, ub # w: an optional argument which can be utilized by BoundMatMul def bound_dynamic_forward(self, x, w=None, b=None, C=None, max_dim=None, offset=0): assert not self.transA and self.alpha_linear == 1.0 and self.transB and self.beta_linear == 1.0 assert not self.is_input_perturbed(1) assert not self.is_input_perturbed(2) weight = w.lb bias = b.lb if b is not None else None if C is not None: weight = C.to(weight).matmul(weight).transpose(-1, -2) if bias is not None: bias = C.to(bias).matmul(bias) lb = x.lb.unsqueeze(1) else: weight = weight.transpose(-1, -2) lb = x.lb w_new = x.lw.matmul(weight) b_new = lb.matmul(weight) if C is not None: b_new = b_new.squeeze(1) if bias is not None: b_new += bias return LinearBound(w_new, b_new, w_new, b_new, x_L=x.x_L, x_U=x.x_U, tot_dim=x.tot_dim) # w: an optional argument which can be utilized by BoundMatMul def bound_forward(self, dim_in, x, w=None, b=None, C=None, weight_has_batch=False): has_bias = b is not None #FIXME _preprocess can only be applied to tensors so far but not linear bounds. x, w, b = self._preprocess(x, w, b) # Shape of x: (B, s_k, s_{k-1}, ..., s_1, m, n) # Shape of w: (s_l, s_{l-1}, ..., s_1, p, n) or (B, s_k, s_{k-1}, ..., s_1, p, n) if weight_has_batch # Forward pass: (B, s_k, s_{k-1}, ..., s_1, m, n) @ (s_l, s_{l-1}, ..., s_1, p, n)^T # Here, the transpose of w means transposing the last two dimensions of w. # Case #1: No weight/bias perturbation, only perturbation on input. if ((not self.is_input_perturbed(0) or not self.is_input_perturbed(1)) and (not has_bias or not self.is_input_perturbed(2))): if isinstance(w, LinearBound): w = w.lower if isinstance(b, LinearBound): b = b.lower if C is not None: w = C.to(w).matmul(w).transpose(-1, -2) if b is not None: b = C.to(b).matmul(b) x_lb, x_ub = x.lb.unsqueeze(1), x.ub.unsqueeze(1) else: w = w.transpose(-1, -2) x_lb, x_ub = x.lb, x.ub lw, lb, uw, ub = BoundLinear.bound_forward_mul( x.lw, x_lb, x.uw, x_ub, w, weight_has_batch, swap_x_and_weight=self.is_input_perturbed(1)) if C is not None: lb, ub = lb.squeeze(1), ub.squeeze(1) if b is not None: lb += b ub += b # Case #2: weight is perturbed. bias may or may not be perturbed. 
elif self.is_input_perturbed(1): if C is not None: raise NotImplementedError res = self.bound_forward_with_weight(dim_in, x, w) if has_bias: raise NotImplementedError lw, lb, uw, ub = res.lw, res.lb, res.uw, res.ub # Case 3: Only bias is perturbed, weight is not perturbed. elif not self.is_input_perturbed(1) and has_bias and self.is_input_perturbed(2): raise NotImplementedError return LinearBound(lw, lb, uw, ub) def bound_forward_with_weight(self, dim_in, x, y): # x has shape (B, s_k, s_{k-1}, ..., s_1, m, n) # y has shape (B, s_k, s_{k-1}, ..., s_1, p, n) # We need to reshape x and y to (B, s_k, s_{k-1}, ..., s_1, m, 1, n) # and (B, s_k, s_{k-1}, ..., s_1, 1, p, n) # respectively. # Then we can use the bound_forward_mul function to compute the bounds # for element-wise multiplication and sum over the last dimension. # The result will have shape (B, s_k, s_{k-1}, ..., s_1, m, p) x_unsqueeze = LinearBound( x.lw.unsqueeze(-2), x.lb.unsqueeze(-2), x.uw.unsqueeze(-2), x.ub.unsqueeze(-2), x.lower.unsqueeze(-2), x.upper.unsqueeze(-2), ) y_unsqueeze = LinearBound( y.lw.unsqueeze(-3), y.lb.unsqueeze(-3), y.uw.unsqueeze(-3), y.ub.unsqueeze(-3), y.lower.unsqueeze(-3), y.upper.unsqueeze(-3), ) res_mul = BoundMul.bound_forward_both_perturbed(self, dim_in, x_unsqueeze, y_unsqueeze) return LinearBound( res_mul.lw.sum(dim=-1) if res_mul.lw is not None else None, res_mul.lb.sum(dim=-1), res_mul.uw.sum(dim=-1) if res_mul.uw is not None else None, res_mul.ub.sum(dim=-1) ) def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): has_bias = self is not None and len(v) == 3 # Aggregate a batch of bounds by taking minimum/maximum over the batch dimension. out_lbs = self.lower.min(dim=0).values.detach().cpu().numpy() if self.lower is not None else None out_ubs = self.upper.max(dim=0).values.detach().cpu().numpy() if self.upper is not None else None # current layer weight (out_width, in_width) this_layer_weight = v[1] if self.transB == 0: this_layer_weight = this_layer_weight.transpose(1, 0) #### make sure if this is correct for per-label operations if C is not None: # merge specification C into last layer weights # only last layer has C not None this_layer_weight = C.squeeze(0).mm(this_layer_weight) this_layer_weight = this_layer_weight.detach().cpu().numpy() this_layer_shape = this_layer_weight.shape this_layer_bias = None if has_bias: # current layer bias (out_width,) this_layer_bias = v[2] if C is not None: this_layer_bias = C.squeeze(0).mm(this_layer_bias.unsqueeze(-1)).view(-1) this_layer_bias = this_layer_bias.detach().cpu().numpy() new_layer_gurobi_vars = [] for neuron_idx in range(this_layer_shape[0]): out_lb = out_lbs[neuron_idx] if out_lbs is not None else -float('inf') out_ub = out_ubs[neuron_idx] if out_ubs is not None else float('inf') if out_lbs is not None and out_ubs is not None: """ If the inferred lb and ub are too close, it could lead to floating point disagreement between solver's inferred lb and ub constraints and the computed ones from ab-crown. Such disagreement can lead to "infeasible" result from the solver for feasible problem. Also, prevent lb to be larger than ub due to the floating point issue. To avoid so, we relax the box constraints. This should not affect the solver's result correctness, since the tighter lb and ub can be inferred by the solver. 
""" if out_lb != float('-inf') and out_ub != float('inf'): diff = out_ub - out_lb avg = (out_ub + out_lb) / 2.0 condition = (diff < EPS) out_lb = np.where(condition, avg - EPS / 2.0, out_lb) out_ub = np.where(condition, avg + EPS / 2.0, out_ub) lin_expr = 0 if has_bias: lin_expr = this_layer_bias[neuron_idx].item() coeffs = this_layer_weight[neuron_idx, :] if solver_pkg == 'gurobi': lin_expr += grb.LinExpr(coeffs, v[0]) else: # FIXME (01/12/22): This is slow, must be fixed using addRow() or similar. for i in range(len(coeffs)): try: lin_expr += coeffs[i] * v[0][i] except TypeError: lin_expr += coeffs[i] * v[0][i].var var = model.addVar(lb=out_lb, ub=out_ub, obj=0, vtype=grb.GRB.CONTINUOUS, name=f'lay{self.name}_{neuron_idx}') model.addConstr(lin_expr == var, name=f'lay{self.name}_{neuron_idx}_eq') new_layer_gurobi_vars.append(var) self.solver_vars = new_layer_gurobi_vars model.update() def build_gradient_node(self, grad_upstream): if not self.is_input_perturbed(1): if isinstance(self.inputs[1], BoundParams): w = self.inputs[1].param elif isinstance(self.inputs[1], BoundBuffers): w = self.inputs[1].buffer else: w = self.inputs[1].value if not self.transB: w = w.t() node_grad = LinearGrad(w.detach()) return [(node_grad, (grad_upstream,), [])] else: raise NotImplementedError( "Gradient computation for weight perturbation is not supported yet.") def update_requires_input_bounds(self): self._check_weight_perturbation() class BoundMatMul(BoundLinear): # Reuse most functions from BoundLinear. def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.transA = 0 self.transB = 0 self.splittable = True def forward(self, x, y): self.x_shape = x.shape self.y_shape = y.shape return x.matmul(y) def interval_propagate(self, *v, C=None): lower, upper = super().interval_propagate(*v, C=C) return lower, upper def bound_backward(self, last_lA, last_uA, *x, start_node=None, **kwargs): assert len(x) == 2 # Determine if two inputs should be swapped self.swap_x_and_weight = not self.is_input_perturbed(0) and self.is_input_perturbed(1) idx_weight = 0 if self.swap_x_and_weight else 1 if start_node is not None: self._start = start_node.name results = list(super().bound_backward(last_lA, last_uA, *x, **kwargs)) # Transpose weight-related tensors def transpose_weight(A_weight): return A_weight.transpose(-1, -2) if A_weight is not None else None results[0][idx_weight] = (transpose_weight(results[0][idx_weight][0]), transpose_weight(results[0][idx_weight][1])) if isinstance(results[1], tuple): lbias = (results[1][0], results[1][1].transpose(-1, -2)) else: lbias = results[1] if isinstance(results[2], tuple): ubias = (results[2][0], results[2][1].transpose(-1, -2)) else: ubias = results[2] # Reduce the broadcast dimensions lA_x = self.broadcast_backward(results[0][0][0], x[0]) uA_x = self.broadcast_backward(results[0][0][1], x[0]) lA_y = self.broadcast_backward(results[0][1][0], x[1]) uA_y = self.broadcast_backward(results[0][1][1], x[1]) return [(lA_x, uA_x), (lA_y, uA_y), results[0][2]], lbias, ubias def bound_forward(self, dim_in, x, y): def _bound_forward(x, y, weight_index=1): # We assume that x is perturbed and y is not perturbed (weight). 
weight_has_batch = (self.inputs[weight_index].batch_dim != -1) return super(BoundMatMul, self).bound_forward(dim_in, x, LinearBound( y.lw.transpose(-1, -2) if y.lw is not None else None, y.lb.transpose(-1, -2) if y.lb is not None else None, y.uw.transpose(-1, -2) if y.uw is not None else None, y.ub.transpose(-1, -2) if y.ub is not None else None, y.lower.transpose(-1, -2) if y.lower is not None else None, y.upper.transpose(-1, -2) if y.upper is not None else None ), weight_has_batch=weight_has_batch) # Check if we need to swap x and y if not self.is_input_perturbed(0) and self.is_input_perturbed(1): return _bound_forward(y, x, weight_index=0) else: return _bound_forward(x, y, weight_index=1) def update_requires_input_bounds(self): # If any multiplier is a constant, we do not need input bounds. self.is_linear_op = not self.inputs[1].perturbed or not self.inputs[0].perturbed if self.is_linear_op: # One input is constant; no bounds required. self.requires_input_bounds = [] self.splittable = False else: # Both inputs are perturbed. Need relaxation. self.requires_input_bounds = [0, 1] if not self.force_not_splittable: self.splittable = True class BoundNeg(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.ibp_intermediate = True def forward(self, x): return -x def bound_backward(self, last_lA, last_uA, x, **kwargs): if type(last_lA) == Tensor or type(last_uA) == Tensor: return [(-last_lA if last_lA is not None else None, -last_uA if last_uA is not None else None)], 0, 0 elif type(last_lA) == Patches or type(last_uA) == Patches: if last_lA is not None: lA = Patches(-last_lA.patches, last_lA.stride, last_lA.padding, last_lA.shape, unstable_idx=last_lA.unstable_idx, output_shape=last_lA.output_shape) else: lA = None if last_uA is not None: uA = Patches(-last_uA.patches, last_uA.stride, last_uA.padding, last_uA.shape, unstable_idx=last_uA.unstable_idx, output_shape=last_uA.output_shape) else: uA = None return [(lA, uA)], 0, 0 else: raise NotImplementedError def bound_forward(self, dim_in, x): return LinearBound(-x.uw, -x.ub, -x.lw, -x.lb) def interval_propagate(self, *v): return -v[0][1], -v[0][0] def build_gradient_node(self, grad_upstream): return [(NegGrad(), (grad_upstream,), [])] class NegGrad(Module): def forward(self, grad_last): return -grad_last class BoundCumSum(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.use_default_ibp = True def forward(self, x, axis): self.axis = axis return torch.cumsum(x, axis) class BoundIdentity(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.use_default_ibp = True def forward(self, x): return x def bound_backward(self, last_lA, last_uA, x, **kwargs): return [(last_lA, last_uA)], 0, 0 def bound_forward(self, dim_in, x): return x class LinearGrad(Module): def __init__(self, weight): super().__init__() self.weight = weight def forward(self, grad_last): weight = self.weight.to(grad_last).t() return F.linear(grad_last, weight) class MatMulGrad(Module): def forward(self, grad_last, x): return grad_last.matmul(x.transpose(-1, -2)) ================================================ FILE: auto_LiRPA/operators/logical.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part 
of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """ Logical operators""" from .base import * class BoundWhere(Bound): def forward(self, condition, x, y): return torch.where(condition.to(torch.bool), x, y) def interval_propagate(self, *v): assert not self.is_input_perturbed(0) condition = v[0][0] return tuple([torch.where(condition, v[1][j], v[2][j]) for j in range(2)]) def bound_backward(self, last_lA, last_uA, condition, x, y, **kwargs): assert torch.allclose(condition.lower.float(), condition.upper.float()) assert self.from_input mask = condition.lower.float() def _bound_oneside(last_A): if last_A is None: return None, None assert last_A.ndim > 1 A_x = self.broadcast_backward(mask.unsqueeze(0) * last_A, x) A_y = self.broadcast_backward((1 - mask).unsqueeze(0) * last_A, y) return A_x, A_y lA_x, lA_y = _bound_oneside(last_lA) uA_x, uA_y = _bound_oneside(last_uA) return [(None, None), (lA_x, uA_x), (lA_y, uA_y)], 0, 0 class BoundNot(Bound): def forward(self, x): return x.logical_not() class BoundEqual(Bound): def forward(self, x, y): return x == y ================================================ FILE: auto_LiRPA/operators/minmax.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### import torch from .base import * from .clampmult import multiply_by_A_signs from .activation_base import BoundOptimizableActivation class BoundMinMax(BoundOptimizableActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.options = options self.requires_input_bounds = [0, 1] self.op = None def _init_opt_parameters_impl(self, size_spec, name_start): """Implementation of init_opt_parameters for each start_node.""" l = self.inputs[0].lower # Alpha dimension is (2, output_shape, batch, *shape). shape = [2, size_spec] + list(l.shape) return torch.ones(shape, device=l.device) def clip_alpha(self): # See https://www.overleaf.com/read/jzgrcmqtqpcx#9dbf97 for the math behind this code. 
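# Sketch of the constraint enforced here (hedged, following the cases below): the
# optimizable slopes alpha parameterize valid tangent planes of the max/min relaxation
# and must stay inside per-case feasible intervals. E.g. for op == 'max' with
# overlapping bounds (case 2: l_x < u_y and u_x > u_y), the upper slope must satisfy
#   (u_x - u_y) / (u_x - max(l_x, l_y)) <= alpha_u <= 1,
# and clamping element-wise to [lb, ub] after each optimizer step keeps alpha feasible.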
lb_x = self._cached_lb_x ub_x = self._cached_ub_x lb_y = self._cached_lb_y ub_y = self._cached_ub_y for v in self.alpha.values(): eps = torch.tensor(1e-6).to(lb_x.dtype) if self.op == 'max': # Case 1: l_x >= u_y case1 = (lb_x >= ub_y).requires_grad_(False).to(lb_x.dtype) alpha_u_lb = torch.zeros_like(case1) alpha_u_ub = torch.zeros_like(case1) alpha_l_lb = torch.zeros_like(case1) alpha_l_ub = torch.zeros_like(case1) # Case 2: l_x < u_y && u_x > u_y case2 = ((lb_x < ub_y) * (ub_x > ub_y)).requires_grad_(False).to(lb_x.dtype) alpha_u_lb += case2 * (ub_x - ub_y) / (ub_x - torch.maximum(lb_x, lb_y)) alpha_u_ub += case2 alpha_l_ub += case2 # Case 3: l_x < u_y && u_x == u_y case3 = ((lb_x < ub_y) * (ub_x == ub_y)).requires_grad_(False).to(lb_x.dtype) alpha_u_ub += case3 alpha_l_ub += case3 alpha_u_lb = torch.clamp(alpha_u_lb, min=eps) alpha_u_ub = torch.clamp(alpha_u_ub, min=eps) elif self.op == 'min': # Case 1: l_y >= u_x case1 = (lb_y >= ub_x).requires_grad_(False).to(lb_x.dtype) alpha_u_lb = torch.zeros_like(case1) alpha_u_ub = torch.zeros_like(case1) alpha_l_lb = torch.zeros_like(case1) alpha_l_ub = torch.zeros_like(case1) # Case 2: l_y < u_x && l_y > l_x case2 = ((lb_y < ub_x) * (lb_y > lb_x)).requires_grad_(False).to(lb_x.dtype) alpha_u_ub += case2 alpha_l_lb += case2 * (lb_y - lb_x) / (torch.minimum(ub_x, ub_y) - lb_x) alpha_l_ub += case2 # Case 3: l_y < u_x && l_y == l_x case3 = ((lb_y < ub_x) * (lb_y == lb_x)).requires_grad_(False).to(lb_x.dtype) alpha_u_ub += case3 alpha_l_ub += case3 alpha_l_lb = torch.clamp(alpha_l_lb, min=eps) alpha_l_ub = torch.clamp(alpha_l_ub, min=eps) v.data[0] = torch.clamp(v.data[0], alpha_u_lb, alpha_u_ub) v.data[1] = torch.clamp(v.data[1], alpha_l_lb, alpha_l_ub) def forward(self, x, y): if self.op == 'max': return torch.max(x, y) elif self.op == 'min': return torch.min(x, y) else: raise NotImplementedError def _backward_relaxation(self, x, y, start_node=None): # See https://www.overleaf.com/read/jzgrcmqtqpcx#9dbf97 for the math behind this code. lb_x = x.lower ub_x = x.upper lb_y = y.lower ub_y = y.upper if self.opt_stage in ['opt', 'reuse']: selected_alpha = self.alpha[start_node.name] alpha_u = selected_alpha[0] alpha_l = selected_alpha[1] else: alpha_u = alpha_l = 1 ub_x = ub_x.unsqueeze(0) ub_y = ub_y.unsqueeze(0) lb_x = lb_x.unsqueeze(0) lb_y = lb_y.unsqueeze(0) if self.op == 'max': swapped_inputs = ub_x < ub_y elif self.op == 'min': swapped_inputs = lb_y < lb_x else: raise NotImplementedError lb_x, lb_y = torch.where(swapped_inputs, lb_y, lb_x), torch.where(swapped_inputs, lb_x, lb_y) ub_x, ub_y = torch.where(swapped_inputs, ub_y, ub_x), torch.where(swapped_inputs, ub_x, ub_y) self._cached_lb_x = lb_x.detach() self._cached_ub_x = ub_x.detach() self._cached_lb_y = lb_y.detach() self._cached_ub_y = ub_y.detach() epsilon = 1e-6 ub_x = torch.max(ub_x, lb_x + epsilon) ub_y = torch.max(ub_y, lb_y + epsilon) # Ideally, if x or y are constant, this layer should be replaced by a ReLU # max{x, c} = max{x − c, 0} + c # min{x, c} = −max{−x, −c} = −(max{−x + c, 0} − c) = −max{−x + c, 0} + c if torch.any(lb_x + 1e-4 >= ub_x) or torch.any(lb_y + 1e-4 >= ub_y): print("Warning: MinMax layer (often used for clamping) received at " "least one input with lower bound almost equal to the upper " "bound. This can happen e.g. if x or y are constants. Consider " "replacing this layer with a ReLU for higher efficieny.") assert torch.all(ub_x != lb_x) and torch.all(ub_y != lb_y), ( 'Lower/upper bounds are too close and epsilon was rounded away. 
' 'To fix this, increase epsilon.' ) if isinstance(alpha_u, torch.Tensor): assert alpha_u.shape[1:] == ub_x.shape[1:] shape = alpha_u.shape else: shape = ub_x.shape upper_dx = torch.zeros(shape, device=ub_x.device) upper_dy = torch.zeros(shape, device=ub_x.device) lower_dx = torch.zeros(shape, device=ub_x.device) lower_dy = torch.zeros(shape, device=ub_x.device) upper_b = torch.zeros(shape, device=ub_x.device) lower_b = torch.zeros(shape, device=ub_x.device) if self.op == 'max': # Case 1: l_x >= u_y case1 = (lb_x >= ub_y).requires_grad_(False).to(lb_x.dtype) upper_dx += case1 lower_dx += case1 # Case 2: l_x < u_y && u_x > u_y case2 = ((lb_x < ub_y) * (ub_x > ub_y)).requires_grad_(False).to(lb_x.dtype) upper_dx = upper_dx + case2 * (ub_y - ub_x) / (alpha_u * (lb_x - ub_x)) upper_dy = upper_dy + case2 * (alpha_u - 1) * (ub_y - ub_x) / (alpha_u * (ub_y - lb_y)) upper_b = upper_b + case2 * (ub_x - (ub_x * (ub_y - ub_x)) / (alpha_u * (lb_x - ub_x)) - ((alpha_u - 1) * (ub_y - ub_x) * lb_y) / (alpha_u * (ub_y - lb_y))) lower_dx = lower_dx + case2 * (1 - alpha_l) lower_dy = lower_dy + case2 * alpha_l # Case 3: l_x < u_y && u_x == u_y case3 = ((lb_x < ub_y) * (ub_x == ub_y)).requires_grad_(False).to(lb_x.dtype) upper_dx = upper_dx + case3 * alpha_u * (ub_x - torch.maximum(lb_x, lb_y)) / (ub_x - lb_x) upper_dy = upper_dy + case3 * alpha_u * (ub_x - torch.maximum(lb_x, lb_y)) / (ub_y - lb_y) upper_b = upper_b + case3 * (ub_x - (alpha_u * (ub_x - torch.maximum(lb_x, lb_y)) * lb_x) / (ub_x - lb_x) - (alpha_u * (ub_x - torch.maximum(lb_x, lb_y)) * ub_y) / (ub_y - lb_y)) lower_dx = lower_dx + case3 * (1 - alpha_l) lower_dy = lower_dy + case3 * alpha_l elif self.op == 'min': # Case 1: l_y >= u_x case1 = (lb_y >= ub_x).requires_grad_(False).to(lb_x.dtype) upper_dx = case1.clone() lower_dx = case1.clone() upper_dy = torch.zeros_like(case1) lower_dy = torch.zeros_like(case1) upper_b = torch.zeros_like(case1) lower_b = torch.zeros_like(case1) # Case 2: l_y < u_x && l_y > l_x case2 = ((lb_y < ub_x) * (lb_y > lb_x)).requires_grad_(False).to(lb_x.dtype) upper_dx = upper_dx + case2 * (1 - alpha_u) upper_dy = upper_dy + case2 * alpha_u lower_dx = lower_dx + case2 * (lb_x - lb_y) / (alpha_l * (lb_x - ub_x)) lower_dy = lower_dy + case2 * (alpha_l - 1) * (lb_x - lb_y) / (alpha_l * (ub_y - lb_y)) lower_b = lower_b + case2 * (lb_y - (ub_x * (lb_x - lb_y)) / (alpha_l * (lb_x - ub_x)) - ((alpha_l - 1) * (lb_x - lb_y) * lb_y) / (alpha_l * (ub_y - lb_y))) # Case 3: l_y < u_x && l_y == l_x case3 = ((lb_y < ub_x) * (lb_y == lb_x)).requires_grad_(False).to(lb_x.dtype) upper_dx = upper_dx + case3 * (1 - alpha_u) upper_dy = upper_dy + case3 * alpha_u lower_dx = lower_dx + case3 * alpha_l * (torch.minimum(ub_x, ub_y) - lb_x) / (ub_x - lb_x) lower_dy = lower_dy + case3 * alpha_l * (torch.minimum(ub_x, ub_y) - lb_x) / (ub_y - lb_y) lower_b = lower_b + case3 * (lb_x - (alpha_l * (torch.minimum(ub_x, ub_y) - lb_x) * lb_x) / (ub_x - lb_x) - (alpha_l * (torch.minimum(ub_x, ub_y) - lb_x) * ub_y) / (ub_y - lb_y)) else: raise NotImplementedError lower_dx, lower_dy = torch.where(swapped_inputs, lower_dy, lower_dx), torch.where(swapped_inputs, lower_dx, lower_dy) upper_dx, upper_dy = torch.where(swapped_inputs, upper_dy, upper_dx), torch.where(swapped_inputs, upper_dx, upper_dy) return upper_dx, upper_dy, upper_b, lower_dx, lower_dy, lower_b def bound_backward(self, last_lA, last_uA, x=None, y=None, start_shape=None, start_node=None, **kwargs): # Get element-wise CROWN linear relaxations. 
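# Shape of the relaxation (hedged summary): each output element z = op(x, y) gets two
# planes, an upper one z <= u_dx * x + u_dy * y + u_b and a lower one
# z >= l_dx * x + l_dy * y + l_b, valid on the box [l_x, u_x] x [l_y, u_y].
# multiply_by_A_signs() then picks, per entry of last_A, the upper plane where the
# coefficient is positive and the lower plane where it is negative (and vice versa for
# the lower bound), which is the standard CROWN backward substitution rule.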
upper_dx, upper_dy, upper_b, lower_dx, lower_dy, lower_b = \ self._backward_relaxation(x, y, start_node) # Choose upper or lower bounds based on the sign of last_A def _bound_oneside(last_A, d_pos, d_neg, b_pos, b_neg): if last_A is None: return None, 0 # Obtain the new linear relaxation coefficients based on the signs in last_A. _A, _bias = multiply_by_A_signs(last_A, d_pos, d_neg, b_pos, b_neg) if isinstance(last_A, Patches): # Save the patch size, which will be used in init_slope() to determine the number of optimizable parameters. A_prod = _A.patches if start_node is not None: # Regular patches. self.patch_size[start_node.name] = A_prod.size() return _A, _bias # In patches mode we might need an unfold. # lower_dx, lower_dy, upper_dx, upper_dy, lower_b, upper_b: 1, batch, current_c, current_w, current_h or None # In _backward_relaxation, the lb_x etc. potentially got swapped. This may cause the memory to become # non-contiguous. This is not a problem if the spec_size is 1, e.g. if alphas are shared. upper_dx = upper_dx.contiguous() upper_dy = upper_dy.contiguous() lower_dx = lower_dx.contiguous() lower_dy = lower_dy.contiguous() upper_b = upper_b.contiguous() lower_b = lower_b.contiguous() upper_dx = maybe_unfold_patches(upper_dx, last_lA if last_lA is not None else last_uA) upper_dy = maybe_unfold_patches(upper_dy, last_lA if last_lA is not None else last_uA) lower_dx = maybe_unfold_patches(lower_dx, last_lA if last_lA is not None else last_uA) lower_dy = maybe_unfold_patches(lower_dy, last_lA if last_lA is not None else last_uA) upper_b = maybe_unfold_patches(upper_b, last_lA if last_lA is not None else last_uA) lower_b = maybe_unfold_patches(lower_b, last_lA if last_lA is not None else last_uA) uAx, ubias = _bound_oneside(last_uA, upper_dx, lower_dx, upper_b, lower_b) uAy, ubias2 = _bound_oneside(last_uA, upper_dy, lower_dy, upper_b, lower_b) if isinstance(ubias, torch.Tensor): assert isinstance(ubias2, torch.Tensor) assert torch.all(ubias == ubias2) else: assert ubias == ubias2 == 0 lAx, lbias = _bound_oneside(last_lA, lower_dx, upper_dx, lower_b, upper_b) lAy, lbias2 = _bound_oneside(last_lA, lower_dy, upper_dy, lower_b, upper_b) if isinstance(lbias, torch.Tensor): assert isinstance(lbias2, torch.Tensor) assert torch.all(lbias == lbias2) else: assert lbias == lbias2 == 0 return [(lAx, uAx), (lAy, uAy)], lbias, ubias def interval_propagate(self, *v): h_Lx, h_Ux = v[0][0], v[0][1] h_Ly, h_Uy = v[1][0], v[1][1] return self.forward(h_Lx, h_Ly), self.forward(h_Ux, h_Uy) class BoundMax(BoundMinMax): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.op = 'max' class BoundMin(BoundMinMax): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.op = 'min' ================================================ FILE: auto_LiRPA/operators/normalization.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### """ Normalization operators""" import copy import torch import torch.nn as nn from .base import * from .constant import BoundConstant from .leaf import BoundParams from .solver_utils import grb class BoundBatchNormalization(Bound): def __init__(self, attr, inputs, output_index, options, training): super().__init__(attr, inputs, output_index, options) self.eps = attr['epsilon'] self.momentum = round(1 - attr['momentum'], 5) # take care! self.options = options.get("bn", {}) # modes: # - forward: use mean and variance estimated from clean forward pass # - ibp: use mean and variance estimated from ibp self.bn_mode = self.options.get("mode", "forward") self.use_mean = self.options.get("mean", True) self.use_var = self.options.get("var", True) self.use_affine = self.options.get("affine", True) self.training = training self.patches_start = True self.mode = options.get("conv_mode", "matrix") if not self.use_mean or not self.use_var: logger.info(f'Batch normalization node {self.name}: use_mean {self.use_mean}, use_var {self.use_var}') def _check_unused_mean_or_var(self): # Check if either mean or var is opted out if not self.use_mean: self.current_mean = torch.zeros_like(self.current_mean) if not self.use_var: self.current_var = torch.ones_like(self.current_var) def forward(self, x, w, b, m, v): if len(x.shape) == 2: self.patches_start = False if self.training: dim = [0] + list(range(2, x.ndim)) self.current_mean = x.mean(dim) self.current_var = x.var(dim, unbiased=False) else: self.current_mean = m.data self.current_var = v.data self._check_unused_mean_or_var() if not self.use_affine: w = torch.ones_like(w) b = torch.zeros_like(b) result = F.batch_norm(x, m, v, w, b, self.training, self.momentum, self.eps) if not self.use_mean or not self.use_var: # If mean or variance is disabled, recompute the output from self.current_mean # and self.current_var instead of using standard F.batch_norm. w = w / torch.sqrt(self.current_var + self.eps) b = b - self.current_mean * w shape = (1, -1) + (1,) * (x.ndim - 2) result = w.view(*shape) * x + b.view(*shape) return result def bound_forward(self, dim_in, *x): inp = x[0] assert (x[1].lower == x[1].upper).all(), "unsupported forward bound with perturbed mean" assert (x[2].lower == x[2].upper).all(), "unsupported forward bound with perturbed var" weight, bias = x[1].lower, x[2].lower if not self.training: assert (x[3].lower == x[3].upper).all(), "unsupported forward bound with perturbed mean" assert (x[4].lower == x[4].upper).all(), "unsupported forward bound with perturbed var" self.current_mean = x[3].lower self.current_var = x[4].lower self._check_unused_mean_or_var() if not self.use_affine: weight = torch.ones_like(weight) bias = torch.zeros_like(bias) tmp_bias = bias - self.current_mean / torch.sqrt(self.current_var + self.eps) * weight tmp_weight = weight / torch.sqrt(self.current_var + self.eps) tmp_weight = tmp_weight.view(*((1, 1, -1) + (1,) * (inp.lw.ndim - 3))) new_lw = torch.clamp(tmp_weight, min=0.) * inp.lw + torch.clamp(tmp_weight, max=0.) * inp.uw new_uw = torch.clamp(tmp_weight, min=0.) * inp.uw + torch.clamp(tmp_weight, max=0.) * inp.lw tmp_weight = tmp_weight.view(*((1, -1) + (1,) * (inp.lb.ndim - 2))) tmp_bias = tmp_bias.view(*((1, -1) + (1,) * (inp.lb.ndim - 2))) new_lb = torch.clamp(tmp_weight, min=0.) * inp.lb + torch.clamp(tmp_weight, max=0.) * inp.ub + tmp_bias new_ub = torch.clamp(tmp_weight, min=0.) * inp.ub + torch.clamp(tmp_weight, max=0.) 
* inp.lb + tmp_bias return LinearBound( lw = new_lw, lb = new_lb, uw = new_uw, ub = new_ub) def bound_backward(self, last_lA, last_uA, *x, **kwargs): assert not self.is_input_perturbed(1) and not self.is_input_perturbed(2), \ 'Weight perturbation is not supported for BoundBatchNormalization' def get_param(p): if isinstance(p, BoundConstant): # When affine is disabled in BN return p.value elif isinstance(p, BoundParams): return p.param else: raise TypeError(p) # x[0]: input, x[1]: weight, x[2]: bias, x[3]: running_mean, x[4]: running_var weight = get_param(x[1]) bias = get_param(x[2]) if not self.training: self.current_mean = x[3].value self.current_var = x[4].value self._check_unused_mean_or_var() if not self.use_affine: weight = torch.ones_like(weight) bias = torch.zeros_like(bias) tmp_bias = bias - self.current_mean / torch.sqrt(self.current_var + self.eps) * weight tmp_weight = weight / torch.sqrt(self.current_var + self.eps) def _bound_oneside(last_A): if last_A is None: return None, 0 if type(last_A) == Tensor: next_A = last_A * tmp_weight.view(*((1, 1, -1) + (1,) * (last_A.ndim - 3))) if last_A.ndim > 3: sum_bias = (last_A.sum(tuple(range(3, last_A.ndim))) * tmp_bias).sum(2) else: sum_bias = (last_A * tmp_bias).sum(2) elif type(last_A) == Patches: # TODO Only 4-dim BN supported in the Patches mode if last_A.identity == 0: # FIXME (09/17): Need to check if it has already been padding. # Patch has dimension (out_c, batch, out_h, out_w, c, h, w) or (unstable_size, batch, c, h, w) patches = last_A.patches # tmp_weight has shape (c,), it will be applied on the (c,) dimension. patches = patches * tmp_weight.view(*([1] * (patches.ndim - 3)), -1, 1, 1) # Match with sparse or non-sparse patches. next_A = last_A.create_similar(patches) # bias to size (c,), need expansion before unfold. bias = tmp_bias.view(-1,1,1).expand(self.input_shape[1:]).unsqueeze(0) # Unfolded bias has shape (1, out_h, out_w, in_c, H, W). bias_unfolded = inplace_unfold(bias, kernel_size=last_A.patches.shape[-2:], padding=last_A.padding, stride=last_A.stride, inserted_zeros=last_A.inserted_zeros, output_padding=last_A.output_padding) if last_A.unstable_idx is not None: # Sparse bias has shape (unstable_size, batch, in_c, H, W). bias_unfolded = bias_unfolded[:, last_A.unstable_idx[1], last_A.unstable_idx[2]] sum_bias = torch.einsum('bschw,sbchw->sb', bias_unfolded, last_A.patches) # Output sum_bias has shape (unstable_size, batch). else: # Patch has dimension (out_c, batch, out_h, out_w, c, h, w). sum_bias = torch.einsum('bijchw,sbijchw->sbij', bias_unfolded, last_A.patches) # Output sum_bias has shape (out_c, batch, out_h, out_w). else: # we should create a real identity Patch num_channel = tmp_weight.numel() # desired Shape is (c, batch, out_w, out_h, c, 1, 1) or (unstable_size, batch, c, 1, 1). patches = (torch.eye(num_channel, device=tmp_weight.device) * tmp_weight.view(-1)).view(num_channel, 1, 1, 1, num_channel, 1, 1) # Expand out_h, out_w dimensions but not for batch dimension. patches = patches.expand(-1, -1, last_A.output_shape[2], last_A.output_shape[3], -1, 1, 1) if last_A.unstable_idx is not None: # Select based on unstable indices. patches = patches[last_A.unstable_idx[0], :, last_A.unstable_idx[1], last_A.unstable_idx[2]] # Expand the batch dimension. patches = patches.expand(-1, last_A.shape[1], *([-1] * (patches.ndim - 2))) next_A = last_A.create_similar(patches, stride=1, padding=0, identity=0) if last_A.unstable_idx is not None: # Need to expand the bias and choose the selected ones. 
bias = tmp_bias.view(-1,1,1,1).expand(-1, 1, last_A.output_shape[2], last_A.output_shape[3]) bias = bias[last_A.unstable_idx[0], :, last_A.unstable_idx[1], last_A.unstable_idx[2]] # Expand the batch dimension, and final output shape is (unstable_size, batch). sum_bias = bias.expand(-1, last_A.shape[1]) else: # Output sum_bias has shape (out_c, batch, out_h, out_w). sum_bias = tmp_bias.view(-1, 1, 1, 1).expand(-1, *last_A.shape[1:4]) else: raise NotImplementedError() return next_A, sum_bias lA, lbias = _bound_oneside(last_lA) uA, ubias = _bound_oneside(last_uA) return [(lA, uA), (None, None), (None, None), (None, None), (None, None)], lbias, ubias def interval_propagate(self, *v): assert not self.is_input_perturbed(1) and not self.is_input_perturbed(2), \ 'Weight perturbation is not supported for BoundBatchNormalization' h_L, h_U = v[0] weight, bias = v[1][0], v[2][0] mid = (h_U + h_L) / 2.0 diff = (h_U - h_L) / 2.0 # Use `mid` in IBP to compute mean and variance for BN. # In this case, `forward` should not have been called. if self.bn_mode == 'ibp' and not hasattr(self, 'forward_value'): m, v, w, b = tuple(self.inputs[i].forward() for i in range(1, 5)) self.forward(mid, m, v, w, b) if not self.training: assert not (self.is_input_perturbed(3) or self.is_input_perturbed(4)) self.current_mean = v[3][0] self.current_var = v[4][0] self._check_unused_mean_or_var() if not self.use_affine: weight = torch.ones_like(weight) bias = torch.zeros_like(bias) tmp_weight = weight / torch.sqrt(self.current_var + self.eps) tmp_weight_abs = tmp_weight.abs() tmp_bias = bias - self.current_mean * tmp_weight shape = (1, -1) + (1,) * (mid.ndim - 2) # interval_propagate() of the Linear layer may encounter input with different norms. norm, eps = Interval.get_perturbation(v[0])[:2] if norm == torch.inf: center = tmp_weight.view(*shape) * mid + tmp_bias.view(*shape) deviation = tmp_weight_abs.view(*shape) * diff elif norm > 0: mid = v[0][0] center = tmp_weight.view(*shape) * mid + tmp_bias.view(*shape) if norm == 2: ptb = copy.deepcopy(v[0].ptb) ptb.eps = eps * tmp_weight_abs.max() return Interval(center, center, ptb=ptb) else: # General Lp norm. 
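# Soundness of the Linf replacement (hedged): for any p >= 1 and ||delta||_p <= eps,
# every coordinate satisfies |delta_i| <= eps, so an Linf ball of the same radius
# contains the Lp ball. Since batch norm in eval mode is an element-wise affine map
# y_i = w~_i * x_i + b~_i, propagating the looser Linf ball coordinate-wise with
# deviation |w~_i| * eps is a valid (if conservative) over-approximation.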
center = tmp_weight.view(*shape) * mid deviation = tmp_weight_abs.view(*shape) * eps # use a Linf ball to replace Lp norm else: raise NotImplementedError lower, upper = center - deviation, center + deviation return lower, upper def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): # e.g., last layer input gurobi vars (3,32,32) gvars_array = np.array(v[0]) # pre_layer_shape (1,3,32,32) pre_layer_shape = np.expand_dims(gvars_array, axis=0).shape # this layer shape (1,8,16,16) this_layer_shape = self.output_shape weight, bias = v[1], v[2] self.current_mean = v[3] self.current_var = v[4] self._check_unused_mean_or_var() if not self.use_affine: weight = torch.ones_like(weight) bias = torch.zeros_like(bias) tmp_bias = bias - self.current_mean / torch.sqrt(self.current_var + self.eps) * weight tmp_weight = weight / torch.sqrt(self.current_var + self.eps) new_layer_gurobi_vars = [] neuron_idx = 0 for out_chan_idx in range(this_layer_shape[1]): out_chan_vars = [] for out_row_idx in range(this_layer_shape[2]): out_row_vars = [] for out_col_idx in range(this_layer_shape[3]): # print(this_layer_bias.shape, out_chan_idx, out_lbs.size(1)) lin_expr = tmp_bias[out_chan_idx].item() + tmp_weight[out_chan_idx].item() * gvars_array[out_chan_idx, out_row_idx, out_col_idx] var = model.addVar(lb=-float('inf'), ub=float('inf'), obj=0, vtype=grb.GRB.CONTINUOUS, name=f'lay{self.name}_{neuron_idx}') model.addConstr(lin_expr == var, name=f'lay{self.name}_{neuron_idx}_eq') neuron_idx += 1 out_row_vars.append(var) out_chan_vars.append(out_row_vars) new_layer_gurobi_vars.append(out_chan_vars) self.solver_vars = new_layer_gurobi_vars model.update() def update_requires_input_bounds(self): self._check_weight_perturbation() class LayerNormImpl(nn.Module): def __init__(self, axis, epsilon): super().__init__() self.axis = axis self.epsilon = epsilon def forward(self, x, scale, bias): mean = x.mean(self.axis, keepdim=True) d = x - mean dd = d**2 var = dd.mean(self.axis, keepdim=True) var_eps = var + self.epsilon std_dev = torch.sqrt(var_eps) inv_std_dev = torch.reciprocal(std_dev) normalized = d * inv_std_dev normalized_scaled = normalized * scale + bias return normalized_scaled class BoundLayerNormalization(Bound): def __init__(self, attr, inputs, output_index, options): super().__init__(attr, inputs, output_index, options) self.complex = True self.model = LayerNormImpl(self.attr['axis'], self.attr['epsilon']) def forward(self, x, scale, bias): self.input = (x, scale, bias) return self.model(x, scale, bias) ================================================ FILE: auto_LiRPA/operators/pooling.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### """Pooling operators.""" from collections import OrderedDict from .base import * from .activation_base import BoundOptimizableActivation import numpy as np from .solver_utils import grb class BoundMaxPool(BoundOptimizableActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) assert ('pads' not in attr) or (attr['pads'][0] == attr['pads'][2]) assert ('pads' not in attr) or (attr['pads'][1] == attr['pads'][3]) self.requires_input_bounds = [0] self.kernel_size = attr['kernel_shape'] self.stride = attr['strides'] self.padding = [attr['pads'][0], attr['pads'][1]] self.ceil_mode = False self.use_default_ibp = True self.alpha = {} self.init = {} def forward(self, x): output, _ = F.max_pool2d(x, self.kernel_size, self.stride, self.padding, return_indices=True, ceil_mode=self.ceil_mode) return output def project_simplex(self, patches): sorted = torch.flatten(patches, -2) sorted, _ = torch.sort(sorted, -1, descending=True) rho_sum = torch.cumsum(sorted, -1) rho_value = 1 - rho_sum rho_value = (sorted + rho_value/torch.tensor( range(1, sorted.size(-1)+1), dtype=torch.float, device=sorted.device)) > 0 _, rho_index = torch.max(torch.cumsum(rho_value, -1), -1) rho_sum = torch.gather(rho_sum, -1, rho_index.unsqueeze(-1)).squeeze(-1) lbd = 1/(rho_index+1)* (1-rho_sum) return torch.clamp(patches + lbd.unsqueeze(-1).unsqueeze(-1), min=0) def _init_opt_parameters_impl(self, size_spec, name_start): if name_start == '_forward': warnings.warn("MaxPool's optimization is not supported for forward mode") return None ref = self.inputs[0].lower # a reference variable for getting the shape alpha = torch.empty( [1, size_spec, self.input_shape[0], self.input_shape[1], self.output_shape[-2], self.output_shape[-1], self.kernel_size[0], self.kernel_size[1]], dtype=torch.float, device=ref.device, requires_grad=True) self.init[name_start] = False return alpha @staticmethod @torch.jit.script def jit_mutiply(Apos, Aneg, pos, neg): return pos.contiguous() * Apos + neg.contiguous() * Aneg def bound_backward(self, last_lA, last_uA, x, start_node=None, unstable_idx=None, **kwargs): # self.padding is a tuple of two elements: (height dimension padding, width dimension padding). paddings = tuple((self.padding[0], self.padding[0], self.padding[1], self.padding[1])) if self.stride[0] != self.kernel_size[0]: raise ValueError("self.stride ({}) != self.kernel_size ({})".format(self.stride, self.kernel_size)) shape = self.input_shape batch_size = x.lower.shape[0] shape = list(shape[:-2]) + [a + 2*b for a, b in zip(self.input_shape[-2:], self.padding)] shape[0] = batch_size # Lower and upper D matrices. They have size (batch_size, input_c, x, y) which will be multiplied on enlarges the A matrices via F.interpolate. upper_d = torch.zeros(shape, device=x.device) lower_d = None # Size of upper_b and lower_b: (batch_size, output_c, h, w). upper_b = torch.zeros(batch_size, *self.output_shape[1:], device=x.device) lower_b = torch.zeros(batch_size, *self.output_shape[1:], device=x.device) # Find the maxpool neuron whose input bounds satisfy l_i > max_j u_j for all j != i. In this case, the maxpool neuron is linear, and we can set upper_d = lower_d = 1. # We first find which indices has the largest lower bound. 
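# Tiny worked example (illustrative numbers): for a 2-element window with bounds
# l = [2, 0], u = [3, 1], we have l_1 = 2 >= max_{j != 1} u_j = 1, so the pooled output
# always equals x_1 and both relaxations can use slope 1 on x_1 with zero bias.
# If instead l = [0, 0], u = [3, 1], no input dominates; the upper bound then falls back
# to the constant concrete upper bound (3 here), while the lower bound uses the single
# input with the largest lower bound.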
max_lower, max_lower_index = F.max_pool2d( x.lower, self.kernel_size, self.stride, self.padding, return_indices=True, ceil_mode=self.ceil_mode) # Set the upper bound of the i-th input to -inf so it will not be selected as the max. if paddings == (0,0,0,0): delete_upper = torch.scatter( torch.flatten(x.upper, -2), -1, torch.flatten(max_lower_index, -2), -torch.inf).view(upper_d.shape) else: delete_upper = torch.scatter( torch.flatten(F.pad(x.upper, paddings), -2), -1, torch.flatten(max_lower_index, -2), -torch.inf).view(upper_d.shape) # Find the the max upper bound over the remaining ones. max_upper, _ = F.max_pool2d( delete_upper, self.kernel_size, self.stride, 0, return_indices=True, ceil_mode=self.ceil_mode) # The upper bound slope for maxpool is either 1 on input satisfies l_i > max_j u_j (linear), or 0 everywhere. Upper bound is not optimized. values = torch.zeros_like(max_lower) values[max_lower >= max_upper] = 1.0 upper_d = torch.scatter( torch.flatten(upper_d, -2), -1, torch.flatten(max_lower_index, -2), torch.flatten(values, -2)).view(upper_d.shape) if self.opt_stage == 'opt': if unstable_idx is not None and self.alpha[start_node.name].size(1) != 1: if isinstance(unstable_idx, tuple): raise NotImplementedError('Please use --conv_mode matrix') elif unstable_idx.ndim == 1: # Only unstable neurons of the start_node neurons are used. alpha = self.non_deter_index_select( self.alpha[start_node.name], index=unstable_idx, dim=1) elif unstable_idx.ndim == 2: # Each element in the batch selects different neurons. alpha = batched_index_select( self.alpha[start_node.name], index=unstable_idx, dim=1) else: raise ValueError else: alpha = self.alpha[start_node.name] if not self.init[start_node.name]: lower_d = torch.zeros((shape), device=x.device) # [batch, C, H, W] lower_d = torch.scatter( torch.flatten(lower_d, -2), -1, torch.flatten(max_lower_index, -2), 1.0).view(upper_d.shape) # shape [batch, C*k*k, L] lower_d_unfold = F.unfold( lower_d, self.kernel_size, 1, stride=self.stride) # [batch, C, k, k, out_H, out_W] alpha_data = lower_d_unfold.view( lower_d.shape[0], lower_d.shape[1], self.kernel_size[0], self.kernel_size[1], self.output_shape[-2], self.output_shape[-1]) # [batch, C, out_H, out_W, k, k] alpha.data.copy_(alpha_data.permute((0,1,4,5,2,3)).clone().detach()) self.init[start_node.name] = True # In optimization mode, we use the same lower_d once builded. if self.padding[0] > 0 or self.padding[1] > 0: lower_d = lower_d[...,self.padding[0]:-self.padding[0], self.padding[1]:-self.padding[1]] # The lower bound coefficients must be positive and projected to an unit simplex. alpha.data = self.project_simplex(alpha.data).clone().detach() # TODO: don't do this, never re-assign the .data property. Use copy_ instead. # permute the last 6 dimensions of alpha to [batch, C, k, k, out_H, out_W], which prepares for the unfold operation. alpha = alpha.permute((0,1,2,3,6,7,4,5)) alpha_shape = alpha.shape alpha = alpha.reshape((alpha_shape[0]*alpha_shape[1]*alpha_shape[2], -1, alpha_shape[-2]*alpha_shape[-1])) lower_d = F.fold(alpha, self.input_shape[-2:], self.kernel_size, 1, self.padding, self.stride) lower_d = lower_d.view(alpha_shape[0], alpha_shape[1], alpha_shape[2], *lower_d.shape[1:]) lower_d = lower_d.squeeze(0) else: lower_d = torch.zeros((shape), device=x.device) # Not optimizable bounds. We simply set \hat{z} >= z_i where i is the input element with largest lower bound. 
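# This choice is always sound because max_j z_j >= z_i for every i; picking
# i = argmax_j l_j simply makes this single-variable lower plane as tight as possible
# among such choices (e.g. with window lower bounds l = [0, -1], the pooled output is
# lower-bounded by z_1).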
lower_d = torch.scatter(torch.flatten(lower_d, -2), -1, torch.flatten(max_lower_index, -2), 1.0).view(upper_d.shape) if self.padding[0] > 0 or self.padding[1] > 0: lower_d = lower_d[...,self.padding[0]:-self.padding[0], self.padding[1]:-self.padding[1]] # For the upper bound, we set the bias term to concrete upper bounds for maxpool neurons that are not linear. max_upper_, _ = F.max_pool2d(x.upper, self.kernel_size, self.stride, self.padding, return_indices=True, ceil_mode=self.ceil_mode) upper_b[max_upper > max_lower] = max_upper_[max_upper > max_lower] def _bound_oneside(last_A, d_pos, d_neg, b_pos, b_neg): if last_A is None: return None, 0 bias = 0 if isinstance(last_A, torch.Tensor): pos_A = last_A.clamp(min=0) neg_A = last_A.clamp(max=0) if b_pos is not None: # This is matrix mode, and padding is considered in the previous layers bias = bias + self.get_bias(pos_A, b_pos) if b_neg is not None: bias = bias + self.get_bias(neg_A, b_neg) # Here we should comfirm that the maxpool patches are not overlapped. shape = last_A.size() padding = [self.padding[0], self.padding[0], self.padding[1], self.padding[1]] d_pos = F.pad(d_pos, padding) d_neg = F.pad(d_neg, padding) pos_A = F.interpolate( pos_A.view(shape[0] * shape[1], *shape[2:]), scale_factor=self.kernel_size) if d_pos.shape[-2] > pos_A.shape[-2] or d_pos.shape[-1] > pos_A.shape[-1]: if not (d_pos.shape[-2] > pos_A.shape[-2] and d_pos.shape[-1] > pos_A.shape[-1]): raise NotImplementedError( "Asymmetric padding of maxpool not implemented.") pos_A = F.pad(pos_A, (0, d_pos.shape[-2] - pos_A.shape[-2], 0, d_pos.shape[-1] - pos_A.shape[-1])) else: d_pos = F.pad(d_pos, (0, pos_A.shape[-2] - d_pos.shape[-2], 0, pos_A.shape[-1] - d_pos.shape[-1])) pos_A = pos_A.view(shape[0], shape[1], *pos_A.shape[1:]) neg_A = F.interpolate(neg_A.view(shape[0] * shape[1], *shape[2:]), scale_factor=self.kernel_size) if d_neg.shape[-2] > neg_A.shape[-2] or d_neg.shape[-1] > neg_A.shape[-1]: if not (d_neg.shape[-2] > neg_A.shape[-2] and d_neg.shape[-1] > neg_A.shape[-1]): raise NotImplementedError("Asymmetric padding of maxpool not implemented.") neg_A = F.pad(neg_A, (0, d_neg.shape[-2] - neg_A.shape[-2], 0, d_neg.shape[-1] - neg_A.shape[-1])) else: d_neg = F.pad(d_neg, (0, neg_A.shape[-2] - d_neg.shape[-2], 0, neg_A.shape[-1] - d_neg.shape[-1])) neg_A = neg_A.view(shape[0], shape[1], *neg_A.shape[1:]) next_A = self.jit_mutiply(pos_A, neg_A, d_pos, d_neg) if self.padding[0] > 0 or self.padding[1] > 0: next_A = next_A[...,self.padding[0]:-self.padding[0], self.padding[1]:-self.padding[1]] elif isinstance(last_A, Patches): # The last_A.patches was not padded, so we need to pad them here. # If this Conv layer is followed by a ReLU layer, then the padding was already handled there and there is no need to pad again. one_d = torch.ones(tuple(1 for i in self.output_shape[1:]), device=last_A.patches.device, dtype=last_A.patches.dtype).expand(self.output_shape[1:]) # Add batch dimension. one_d = one_d.unsqueeze(0) # After unfolding, the shape is (1, out_h, out_w, in_c, h, w) one_d_unfolded = inplace_unfold( one_d, kernel_size=last_A.patches.shape[-2:], stride=last_A.stride, padding=last_A.padding, inserted_zeros=last_A.inserted_zeros, output_padding=last_A.output_padding) if last_A.unstable_idx is not None: # Move out_h, out_w dimension to the front for easier selection. one_d_unfolded_r = one_d_unfolded.permute(1, 2, 0, 3, 4, 5) # for sparse patches the shape is (unstable_size, batch, in_c, h, w). Batch size is 1 so no need to select here. 
one_d_unfolded_r = one_d_unfolded_r[ last_A.unstable_idx[1], last_A.unstable_idx[2]] else: # Append the spec dimension. one_d_unfolded_r = one_d_unfolded.unsqueeze(0) patches = last_A.patches * one_d_unfolded_r if b_pos is not None: patch_pos = Patches( patches.clamp(min=0), last_A.stride, last_A.padding, last_A.shape, unstable_idx=last_A.unstable_idx, output_shape=last_A.output_shape) bias = bias + self.get_bias(patch_pos, b_pos) if b_neg is not None: patch_neg = Patches( patches.clamp(max=0), last_A.stride, last_A.padding, last_A.shape, unstable_idx=last_A.unstable_idx, output_shape=last_A.output_shape) bias = bias + self.get_bias(patch_neg, b_neg) # bias = bias.transpose(0,1) shape = last_A.shape pos_A = last_A.patches.clamp(min=0) neg_A = last_A.patches.clamp(max=0) def upsample(last_patches, last_A): if last_A.unstable_idx is None: patches = F.interpolate( last_patches.view(shape[0] * shape[1] * shape[2], *shape[3:]), scale_factor=[1,]+self.kernel_size) patches = patches.view(shape[0], shape[1], shape[2], *patches.shape[1:]) else: patches = F.interpolate( last_patches, scale_factor=[1,] + self.kernel_size) return Patches( patches, stride=last_A.stride, padding=last_A.padding, shape=patches.shape, unstable_idx=last_A.unstable_idx, output_shape=last_A.output_shape) pos_A = upsample(pos_A, last_A) neg_A = upsample(neg_A, last_A) padding, stride, output_padding = compute_patches_stride_padding( self.input_shape, last_A.padding, last_A.stride, self.padding, self.stride, last_A.inserted_zeros, last_A.output_padding) pos_A.padding, pos_A.stride, pos_A.output_padding = padding, stride, output_padding neg_A.padding, neg_A.stride, neg_A.output_padding = padding, stride, output_padding # unsqueeze for the spec dimension d_pos = maybe_unfold_patches(d_pos.unsqueeze(0), pos_A) d_neg = maybe_unfold_patches(d_neg.unsqueeze(0), neg_A) next_A_patches = self.jit_mutiply( pos_A.patches, neg_A.patches, d_pos, d_neg) if start_node is not None: self.patch_size[start_node.name] = next_A_patches.size() next_A = Patches( next_A_patches, stride, padding, next_A_patches.shape, unstable_idx=last_A.unstable_idx, output_shape=last_A.output_shape, inserted_zeros=last_A.inserted_zeros, output_padding=output_padding) return next_A, bias if self.padding[0] > 0: upper_d = upper_d[...,self.padding[0]:-self.padding[0], self.padding[0]:-self.padding[0]] uA, ubias = _bound_oneside(last_uA, upper_d, lower_d, upper_b, lower_b) lA, lbias = _bound_oneside(last_lA, lower_d, upper_d, lower_b, upper_b) return [(lA, uA)], lbias, ubias def bound_forward(self, dim_in, x): lower_d, lower_b, upper_d, upper_b = self.bound_relax(x, init=False) def _bound_oneside(w_pos, b_pos, w_neg, b_neg, d, b): d_pos, d_neg = d.clamp(min=0), d.clamp(max=0) w_new = d_pos.unsqueeze(1) * w_pos + d_neg.unsqueeze(1) * w_neg b_new = d_pos * b_pos + d_neg * b_neg if isinstance(self.kernel_size, list) and len(self.kernel_size) == 2: tot_kernel_size = prod(self.kernel_size) elif isinstance(self.kernel_size, int): tot_kernel_size = self.kernel_size ** 2 else: raise ValueError(f'Unsupported kernel size {self.kernel_size}') w_pooled = (F.avg_pool2d(w_new.view(-1, *w_new.shape[2:]), self.kernel_size, self.stride, self.padding, ceil_mode=self.ceil_mode) * tot_kernel_size) w_pooled = w_pooled.reshape(w_new.shape[0], -1, *w_pooled.shape[1:]) b_pooled = F.avg_pool2d(b_new, self.kernel_size, self.stride, self.padding, ceil_mode=self.ceil_mode) * tot_kernel_size + b return w_pooled, b_pooled lw, lb = _bound_oneside(x.lw, x.lb, x.uw, x.ub, lower_d, lower_b) uw, ub = 
_bound_oneside(x.uw, x.ub, x.lw, x.lb, upper_d, upper_b) return LinearBound(lw, lb, uw, ub) def bound_relax(self, x, init=False, dim_opt=None): if init: self.init_linear_relaxation(x, dim_opt) # Only used by forward mode paddings = tuple(self.padding + self.padding) self.upper, self.lower = x.upper, x.lower # A_shape = last_lA.shape if last_lA is not None else last_uA.shape # batch_size, input_c, x, y upper_d = torch.zeros_like(x.lower) lower_d = torch.zeros_like(x.lower) upper_d = F.pad(upper_d, paddings) lower_d = F.pad(lower_d, paddings) # batch_size, output_c, x, y upper_b = torch.zeros((list(self.output_shape))).to(x.lower) lower_b = torch.zeros((list(self.output_shape))).to(x.lower) # 1. find the index i where li > uj for all j, then set upper_d = lower_d = 1 max_lower, max_lower_index = F.max_pool2d(x.lower, self.kernel_size, self.stride, self.padding, return_indices=True, ceil_mode=self.ceil_mode) delete_upper = torch.scatter(torch.flatten(F.pad(x.upper, paddings), -2), -1, torch.flatten(max_lower_index, -2), -torch.inf).view(upper_d.shape) max_upper, _ = F.max_pool2d(delete_upper, self.kernel_size, self.stride, 0, return_indices=True, ceil_mode=self.ceil_mode) values = torch.zeros_like(max_lower) values[max_lower >= max_upper] = 1.0 upper_d = torch.scatter(torch.flatten(upper_d, -2), -1, torch.flatten(max_lower_index, -2), torch.flatten(values, -2)).view(upper_d.shape) if self.opt_stage == 'opt': raise NotImplementedError else: lower_d = torch.scatter(torch.flatten(lower_d, -2), -1, torch.flatten(max_lower_index, -2), 1.0).view(upper_d.shape) if self.padding[0] > 0: lower_d = lower_d[...,self.padding[0]:-self.padding[0], self.padding[0]:-self.padding[0]] values[:] = 0.0 max_upper_, _ = F.max_pool2d(x.upper, self.kernel_size, self.stride, self.padding, return_indices=True, ceil_mode=self.ceil_mode) values[max_upper > max_lower] = max_upper_[max_upper > max_lower] upper_b = values if self.padding[0] > 0: upper_d = upper_d[...,self.padding[0]:-self.padding[0], self.padding[0]:-self.padding[0]] return lower_d, lower_b, upper_d, upper_b def dump_alpha(self, device=None, dtype=None, non_blocking=False): ret = {'alpha': self._transfer_alpha(self.alpha, device=device, dtype=dtype, non_blocking=non_blocking, require_grad=False)} ret['init'] = self.init return ret def restore_alpha(self, alpha, device=None, dtype=None, non_blocking=False): self.alpha = self._transfer_alpha(alpha['alpha'], device=device, dtype=dtype, non_blocking=non_blocking, require_grad=True) self.init = alpha['init'] def drop_unused_alpha(self, keep_nodes): for spec_name in list(self.alpha.keys()): if spec_name not in keep_nodes: del self.alpha[spec_name] del self.init[spec_name] def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): # e.g., last layer input gurobi vars (3,32,32) gvars_array = np.array(v[0]) # pre_layer_shape (1,32,27,27) pre_layer_shape = np.expand_dims(gvars_array, axis=0).shape # this layer shape (1,32,6,6) this_layer_shape = self.output_shape assert this_layer_shape[2] == ((2 * self.padding[0] + pre_layer_shape[2] - (self.stride[0] - 1))//self.stride[0]) new_layer_gurobi_vars = [] neuron_idx = 0 pre_ubs = self.forward(self.inputs[0].upper).detach().cpu().numpy() for out_chan_idx in range(this_layer_shape[1]): out_chan_vars = [] for out_row_idx in range(this_layer_shape[2]): out_row_vars = [] for out_col_idx in range(this_layer_shape[3]): a_sum = 0.0 v = model.addVar(lb=-float('inf'), ub=float('inf'), obj=0, vtype=grb.GRB.CONTINUOUS, name=f'lay{self.name}_{neuron_idx}') for 
ker_row_idx in range(self.kernel_size[0]): in_row_idx = -self.padding[0] + self.stride[0] * out_row_idx + ker_row_idx if (in_row_idx < 0) or (in_row_idx == len(gvars_array[out_chan_idx][ker_row_idx])): # This is padding -> value of 0 continue for ker_col_idx in range(self.kernel_size[1]): in_col_idx = -self.padding[1] + self.stride[1] * out_col_idx + ker_col_idx if (in_col_idx < 0) or (in_col_idx == pre_layer_shape[3]): # This is padding -> value of 0 continue var = gvars_array[out_chan_idx][in_row_idx][in_col_idx] a = model.addVar(vtype=grb.GRB.BINARY) a_sum += a model.addConstr(v >= var) model.addConstr(v <= var + (1 - a) * pre_ubs[ 0, out_chan_idx, out_row_idx, out_col_idx]) model.addConstr(a_sum == 1, name=f'lay{self.name}_{neuron_idx}_eq') out_row_vars.append(v) out_chan_vars.append(out_row_vars) new_layer_gurobi_vars.append(out_chan_vars) self.solver_vars = new_layer_gurobi_vars model.update() class BoundGlobalAveragePool(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) def forward(self, x): output = nn.AdaptiveAvgPool2d((1, 1)).forward(x) # adaptiveAveragePool with output size (1, 1) return output def bound_backward(self, last_lA, last_uA, x, **kwargs): H, W = self.input_shape[-2], self.input_shape[-1] lA = (last_lA.expand(list(last_lA.shape[:-2]) + [H, W]) / (H * W)) if last_lA is not None else None uA = (last_uA.expand(list(last_uA.shape[:-2]) + [H, W]) / (H * W)) if last_uA is not None else None return [(lA, uA)], 0, 0 def interval_propagate(self, *v): h_L, h_U = v[0] h_L = F.adaptive_avg_pool2d(h_L, (1, 1)) h_U = F.adaptive_avg_pool2d(h_U, (1, 1)) return h_L, h_U class BoundAveragePool(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): # assumptions: ceil_mode=False, count_include_pad=True super().__init__(attr, inputs, output_index, options) assert ('pads' not in attr) or (attr['pads'][0] == attr['pads'][2]) assert ('pads' not in attr) or (attr['pads'][1] == attr['pads'][3]) self.kernel_size = attr['kernel_shape'] assert len(self.kernel_size) == 2 self.stride = attr['strides'] assert len(self.stride) == 2 # FIXME (22/07/02): padding is inconsistently handled. Should use 4-tuple. 
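# Illustrative sketch (not part of this file): the backward rule in BoundGlobalAveragePool
# above treats global average pooling as the linear map x -> mean(x) over (H, W), so an incoming
# last_A is simply expanded over the spatial dimensions and divided by H * W. A quick check of
# that identity against autograd; the shapes below are made up for illustration only.
import torch
import torch.nn.functional as F
x = torch.randn(2, 3, 5, 7, requires_grad=True)
A = torch.randn(2, 3, 1, 1)                      # stand-in for last_A
y = F.adaptive_avg_pool2d(x, (1, 1))
(vjp,) = torch.autograd.grad(y, x, grad_outputs=A)
H, W = x.shape[-2:]
expanded = A.expand(*A.shape[:-2], H, W) / (H * W)
assert torch.allclose(vjp, expanded, atol=1e-6)
# End of sketch.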
if 'pads' not in attr: self.padding = [0, 0] else: self.padding = [attr['pads'][0], attr['pads'][1]] self.ceil_mode = False self.count_include_pad = True self.use_default_ibp = True self.relu_followed = False def forward(self, x): return F.avg_pool2d(x, self.kernel_size, self.stride, self.padding, self.ceil_mode, self.count_include_pad) def bound_backward(self, last_lA, last_uA, x, **kwargs): def _bound_oneside(last_A): if last_A is None: return None, 0 equal_kernel_stride = (self.kernel_size[0] == self.stride[0] and self.kernel_size[1] == self.stride[1]) if isinstance(last_A, torch.Tensor): shape = last_A.size() if equal_kernel_stride: # propagate A to the next layer, with batch concatenated together next_A = F.interpolate( last_A.reshape(shape[0] * shape[1], *shape[2:]), scale_factor=self.kernel_size ) / (prod(self.kernel_size)) next_A = F.pad( next_A, (0, self.input_shape[-2] - next_A.shape[-2], 0, self.input_shape[-1] - next_A.shape[-1])) next_A = next_A.view(shape[0], shape[1], *next_A.shape[1:]) else: # Treat pooling as a general convolution weight = torch.zeros( self.input_shape[1], self.output_shape[1], *self.kernel_size, dtype=last_A.dtype, device=last_A.device) assert self.input_shape[1] == self.output_shape[1] weight = torch.eye(self.input_shape[1], dtype=last_A.dtype, device=last_A.device) weight = weight / prod(self.kernel_size) weight = weight.view(self.output_shape[1], self.input_shape[1], 1, 1) weight = weight.expand(self.output_shape[1], self.input_shape[1], *self.kernel_size) output_padding0 = ( int(self.input_shape[2]) - (int(self.output_shape[2]) - 1) * self.stride[0] + 2 * self.padding[0] - 1 - (int(weight.size()[2] - 1))) output_padding1 = ( int(self.input_shape[3]) - (int(self.output_shape[3]) - 1) * self.stride[1] + 2 * self.padding[1] - 1 - (int(weight.size()[3] - 1))) next_A = F.conv_transpose2d( last_A.reshape(shape[0] * shape[1], *shape[2:]), weight, None, stride=self.stride, padding=self.padding, output_padding=(output_padding0, output_padding1)) next_A = next_A.view(shape[0], shape[1], *next_A.shape[1:]) elif isinstance(last_A, Patches): patches = last_A.patches shape = patches.size() # When the number of inserted zeros can cancel out the stride, we use a shortcut that can reduce computation. simplify_patch = (equal_kernel_stride and last_A.inserted_zeros + 1 == self.kernel_size[0] and self.kernel_size[0] == self.kernel_size[1]) padding, stride, output_padding = compute_patches_stride_padding( self.input_shape, last_A.padding, last_A.stride, self.padding, self.stride, inserted_zeros=last_A.inserted_zeros, output_padding=last_A.output_padding, simplify=not simplify_patch) inserted_zeros = last_A.inserted_zeros if equal_kernel_stride and last_A.inserted_zeros == 0: # No inserted zeros, can be handled using interpolate. if last_A.unstable_idx is None: # shape is: [out_C, batch, out_H, out_W, in_c, patch_H, patch_W] up_sampled_patches = F.interpolate( patches.reshape(shape[0] * shape[1], shape[2] * shape[3], *shape[4:]), scale_factor=[1,] + self.kernel_size) # The dimension of patch-H and patch_W has changed. up_sampled_patches = up_sampled_patches.reshape( *shape[:-2], up_sampled_patches.size(-2), up_sampled_patches.size(-1)) else: # shape is: [spec, batch, in_c, patch_H, patch_W] up_sampled_patches = F.interpolate( patches, scale_factor=[1,] + self.kernel_size) # Divided by the averaging factor. 
up_sampled_patches = up_sampled_patches / prod(self.kernel_size) elif simplify_patch: padding = tuple(p // s - o for p, s, o in zip(padding, stride, output_padding)) output_padding = (0, 0, 0, 0) stride = 1 # Stride and inserted zero canceled out. No need to insert zeros and add output_padding. inserted_zeros = 0 value = 1. / prod(self.kernel_size) # In the case where the stride and adding_zeros cancel out, we do not need to insert zeros. weight = torch.full( size=(self.input_shape[1], 1, *self.kernel_size), fill_value=value, dtype=patches.dtype, device=patches.device) if last_A.unstable_idx is None: # shape is: [out_C, batch, out_H, out_W, in_c, patch_H, patch_W] up_sampled_patches = F.conv_transpose2d( patches.reshape( shape[0] * shape[1] * shape[2] * shape[3], *shape[4:] ), weight, stride=1, groups=self.input_shape[1]) else: # shape is: [spec, batch, in_c, patch_H, patch_W] up_sampled_patches = F.conv_transpose2d( patches.reshape(shape[0] * shape[1], *shape[2:]), weight, stride=1, groups=self.input_shape[1]) up_sampled_patches = up_sampled_patches.view( *shape[:-2], up_sampled_patches.size(-2), up_sampled_patches.size(-1)) else: # With inserted zeros, must be handled by treating pooling as general convolution. value = 1. / prod(self.kernel_size) weight = torch.full(size=(self.input_shape[1], 1, *self.kernel_size), fill_value=value, dtype=patches.dtype, device=patches.device) if not self.relu_followed: patches = last_A.create_padding(self.output_shape) weight = insert_zeros(weight, last_A.inserted_zeros) if last_A.unstable_idx is None: # shape is: [out_C, batch, out_H, out_W, in_c, patch_H, patch_W] up_sampled_patches = F.conv_transpose2d( patches.reshape(shape[0] * shape[1] * shape[2] * shape[3], *shape[4:]), weight, stride=self.stride, groups=self.input_shape[1]) else: # shape is: [spec, batch, in_c, patch_H, patch_W] up_sampled_patches = F.conv_transpose2d( patches.reshape(shape[0] * shape[1], *shape[2:]), weight, stride=self.stride, groups=self.input_shape[1]) up_sampled_patches = up_sampled_patches.view( *shape[:-2], up_sampled_patches.size(-2), up_sampled_patches.size(-1)) next_A = last_A.create_similar( up_sampled_patches, stride=stride, padding=padding, output_padding=output_padding, inserted_zeros=inserted_zeros) else: raise ValueError(f'last_A has unexpected type {type(last_A)}') return next_A, 0. 
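# Illustrative sketch (not part of this file): the equal_kernel_stride shortcut above relies on
# the fact that, for non-overlapping average pooling (kernel_size == stride, and no padding in
# this toy case), the adjoint of the pooling is nearest-neighbor upsampling of A divided by the
# kernel area. All shapes and values below are made up for illustration only.
import torch
import torch.nn.functional as F
x = torch.randn(1, 3, 8, 8, requires_grad=True)
A = torch.randn(1, 3, 4, 4)                              # stand-in for last_A
y = F.avg_pool2d(x, kernel_size=2, stride=2)
(vjp,) = torch.autograd.grad(y, x, grad_outputs=A)
shortcut = F.interpolate(A, scale_factor=2) / 4.0        # nearest-neighbor upsampling / kernel area
assert torch.allclose(vjp, shortcut, atol=1e-6)
# End of sketch.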
lA, lbias = _bound_oneside(last_lA) uA, ubias = _bound_oneside(last_uA) return [(lA, uA)], lbias, ubias def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): # e.g., last layer input gurobi vars (3,32,32) gvars_array = np.array(v[0]) # pre_layer_shape (1,32,27,27) pre_layer_shape = np.expand_dims(gvars_array, axis=0).shape # this layer shape (1,32,6,6) this_layer_shape = self.output_shape assert this_layer_shape[2] == ( (2 * self.padding[0] + pre_layer_shape[2] - (self.stride[0] - 1) ) // self.stride[0]) value = 1.0/(self.kernel_size[0] * self.kernel_size[1]) new_layer_gurobi_vars = [] neuron_idx = 0 for out_chan_idx in range(this_layer_shape[1]): out_chan_vars = [] for out_row_idx in range(this_layer_shape[2]): out_row_vars = [] for out_col_idx in range(this_layer_shape[3]): # print(self.bias.shape, out_chan_idx, out_lbs.size(1)) lin_expr = 0.0 for ker_row_idx in range(self.kernel_size[0]): in_row_idx = -self.padding[0] + self.stride[0] * out_row_idx + ker_row_idx if (in_row_idx < 0) or (in_row_idx == len(gvars_array[out_chan_idx][ker_row_idx])): # This is padding -> value of 0 continue for ker_col_idx in range(self.kernel_size[1]): in_col_idx = -self.padding[1] + self.stride[1] * out_col_idx + ker_col_idx if (in_col_idx < 0) or (in_col_idx == pre_layer_shape[3]): # This is padding -> value of 0 continue coeff = value lin_expr += coeff * gvars_array[out_chan_idx][in_row_idx][in_col_idx] v = model.addVar(lb=-float('inf'), ub=float('inf'), obj=0, vtype=grb.GRB.CONTINUOUS, name=f'lay{self.name}_{neuron_idx}') model.addConstr(lin_expr == v, name=f'lay{self.name}_{neuron_idx}_eq') neuron_idx += 1 out_row_vars.append(v) out_chan_vars.append(out_row_vars) new_layer_gurobi_vars.append(out_chan_vars) self.solver_vars = new_layer_gurobi_vars model.update() ================================================ FILE: auto_LiRPA/operators/reduce.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """ Reduce operators""" from .base import * from torch.nn import Module class BoundReduce(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.axis = attr.get('axes', None) self.keepdim = bool(attr['keepdims']) if 'keepdims' in attr else True self.use_default_ibp = True def _parse_input_and_axis(self, *x): if len(x) > 1: assert not self.is_input_perturbed(1) self.axis = tuple(item.item() for item in tuple(x[1])) self.axis = self.make_axis_non_negative(self.axis) return x[0] def _return_bound_backward(self, lA, uA): return [(lA, uA)] + [(None, None)] * (len(self.inputs) - 1), 0, 0 class BoundReduceMax(BoundReduce): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) """Assume that the indexes with the maximum values are not perturbed. 
This generally doesn't hold true, but can still be used for the input shift in Softmax of Transformers.""" self.fixed_max_index = options.get('fixed_reducemax_index', False) def _parse_input_and_axis(self, *x): x = super()._parse_input_and_axis(*x) # for torch.max, `dim` must be an int if isinstance(self.axis, tuple): assert len(self.axis) == 1 self.axis = self.axis[0] return x def forward(self, *x): x = self._parse_input_and_axis(*x) res = torch.max(x, dim=self.axis, keepdim=self.keepdim) self.indices = res.indices return res.values def bound_backward(self, last_lA, last_uA, *args, **kwargs): if self.fixed_max_index: def _bound_oneside(last_A): if last_A is None: return None indices = self.indices.unsqueeze(0) if not self.keepdim: assert (self.from_input) last_A = last_A.unsqueeze(self.axis + 1) indices = indices.unsqueeze(self.axis + 1) shape = list(last_A.shape) shape[self.axis + 1] *= self.input_shape[self.axis] A = torch.zeros(shape, device=last_A.device) indices = indices.expand(*last_A.shape) A.scatter_(dim=self.axis + 1, index=indices, src=last_A) return A return self._return_bound_backward(_bound_oneside(last_lA), _bound_oneside(last_uA)) else: raise NotImplementedError( '`bound_backward` for BoundReduceMax with perturbed maximum ' 'indexes is not implemented.') def build_gradient_node(self, grad_upstream): if self.fixed_max_index: node_grad = ReduceMaxGrad(self.axis, self.keepdim, self.input_shape, self.indices) return [(node_grad, (grad_upstream,), [])] else: raise NotImplementedError( '`build_gradient_node` for BoundReduceMax with perturbed maximum ' 'indexes is not implemented.') class ReduceMaxGrad(Module): def __init__(self, axis, keepdim, input_shape, indices): super().__init__() self.axis = axis self.keepdim = keepdim self.input_shape = input_shape self.indices = indices.unsqueeze(0) def forward(self, grad_last): # Only keep the gradient at the maximum index # The gradient at other indices is 0 # If keepdim is False, add a singleton dimension at the specified axis if not self.keepdim: grad_last = grad_last.unsqueeze(self.axis + 1) indices = self.indices.unsqueeze(self.axis + 1) else: indices = self.indices assert grad_last.shape[self.axis + 1] == 1 # Calculate the target dimension size at axis + 1 new_dim = self.input_shape[self.axis] # Create the output tensor shape new_shape = list(grad_last.shape) new_shape[self.axis + 1] = new_dim ######################################################################## # TODO: The following lines are equivalent to: # # grad = torch.zeros(new_shape, device=grad_last.device) # indices = indices.expand(*grad_last.shape) # grad.scatter_(dim=self.axis + 1, index=indices, src=grad_last) # # But auto_LiRPA does not support scatter_ yet. # So we use a workaround to avoid using scatter_.
######################################################################## # Expand indices to match the target shape, # filling axis + 1 with new_dim indices_expanded = indices.expand( *grad_last.shape[:self.axis + 1], new_dim, *grad_last.shape[self.axis + 2:] ).to(grad_last.device) # Create a coordinate tensor for comparison along axis + 1 coord_shape = [1] * grad_last.dim() coord_shape[self.axis + 1] = new_dim coord = torch.arange(new_dim, device=grad_last.device).view(*coord_shape) # Create a binary mask where 1 indicates the desired position for each gradient mask = (coord == indices_expanded).type_as(grad_last) # Expand grad_last to match the target shape for element-wise multiplication grad_last_expanded = grad_last.expand( *grad_last.shape[:self.axis + 1], new_dim, *grad_last.shape[self.axis + 2:]) # Use the mask to retain values only at the correct positions grad = mask * grad_last_expanded return grad class BoundReduceMin(BoundReduceMax): def forward(self, *x): x = self._parse_input_and_axis(*x) res = torch.min(x, dim=self.axis, keepdim=self.keepdim) self.indices = res.indices return res.values class BoundReduceMean(BoundReduce): def forward(self, *x): x = self._parse_input_and_axis(*x) return torch.mean(x, dim=self.axis, keepdim=self.keepdim) def bound_backward(self, last_lA, last_uA, *args, **kwargs): def _bound_oneside(last_A): if last_A is None: return None if not self.keepdim: assert (self.from_input) for axis in self.axis: if axis > 0: last_A = last_A.unsqueeze(axis + 1) shape = list(last_A.shape) shape[2:] = self.input_shape[1:] # We perform expansion as in BoundReduceSum. # and divide the product of the sizes of the reduced dimensions. last_A = last_A.expand(*shape) / np.prod(np.take(self.input_shape, self.axis)) return last_A return self._return_bound_backward(_bound_oneside(last_lA), _bound_oneside(last_uA)) def bound_forward(self, dim_in, x, *args): assert self.keepdim assert len(self.axis) == 1 axis = self.make_axis_non_negative(self.axis[0]) assert (axis > 0) size = self.input_shape[axis] lw = x.lw.sum(dim=axis + 1, keepdim=True) / size lb = x.lb.sum(dim=axis, keepdim=True) / size uw = x.uw.sum(dim=axis + 1, keepdim=True) / size ub = x.ub.sum(dim=axis, keepdim=True) / size return LinearBound(lw, lb, uw, ub) class BoundReduceSum(BoundReduce): def forward(self, *x): x = self._parse_input_and_axis(*x) if self.axis is not None: return torch.sum(x, dim=self.axis, keepdim=self.keepdim) else: return torch.sum(x) def bound_backward(self, last_lA, last_uA, x, *args, **kwargs): def _bound_oneside(last_A): if last_A is None: return None if not self.keepdim: assert (self.from_input) for axis in self.axis: if axis > 0: last_A = last_A.unsqueeze(axis + 1) # last_A.shape = [num_spec, batch_size, ..., dim_size_1 (1), ...] shape = list(last_A.shape) # self.input_shape = [batch_size_original, ..., dim_size_1_before_reduction, ...] # we expand last_A with keeping its batch_size instead of that from self.input_shape. shape[2:] = self.input_shape[1:] # For reduced dims, their dim_size will be expanded from 1 to the original size. # For non-reduced dims, their dim_size will be unchanged. 
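# Illustrative sketch (not part of this file): the scatter_-free workaround in ReduceMaxGrad above
# builds a 0/1 mask by comparing a coordinate ramp against the max indices and multiplying it with
# the (broadcast) upstream gradient. A toy check that the mask trick reproduces scatter_; every
# shape and index value below is made up for illustration only.
import torch
g = torch.randn(4, 3, 1, 6)                    # upstream gradient, singleton at the reduced dim
idx = torch.randint(0, 5, (1, 3, 1, 6))        # max indices into a restored dim of size 5
ref = torch.zeros(4, 3, 5, 6)                  # reference: scatter_ into the restored dimension
ref.scatter_(dim=2, index=idx.expand_as(g), src=g)
coord = torch.arange(5).view(1, 1, 5, 1)       # coordinate ramp along the restored dimension
mask = (coord == idx).to(g.dtype)              # 1.0 exactly at the max positions
out = mask * g                                 # broadcasting expands g over the restored dim
assert torch.allclose(ref, out)
# End of sketch.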
last_A = last_A.expand(*shape) return last_A return self._return_bound_backward(_bound_oneside(last_lA), _bound_oneside(last_uA)) def bound_forward(self, dim_in, x, *args): # Handle possibly multiple axes axes = [self.make_axis_non_negative(ax) for ax in self.axis] # Ensure all axes are greater than 0 (not batch dimension) assert all(ax > 0 for ax in axes) # For lw/uw, need to shift by 1 due to an extra leading dimension (num_spec) lw = x.lw.sum(dim=[ax + 1 for ax in axes], keepdim=self.keepdim) lb = x.lb.sum(dim=axes, keepdim=self.keepdim) uw = x.uw.sum(dim=[ax + 1 for ax in axes], keepdim=self.keepdim) ub = x.ub.sum(dim=axes, keepdim=self.keepdim) return LinearBound(lw, lb, uw, ub) def build_gradient_node(self, grad_upstream): node_grad = ReduceSumGrad(self.axis, self.keepdim, self.input_shape) return [(node_grad, (grad_upstream,), [])] class ReduceSumGrad(Module): def __init__(self, axis, keepdim, input_shape): super().__init__() self.axis = axis self.keepdim = keepdim self.input_shape = input_shape def forward(self, grad_last): grad_new = grad_last.clone() if not self.keepdim: for axis in self.axis: if axis > 0: grad_new = grad_new.unsqueeze(axis + 1) # For ReduceSum, ∂y/∂x = 1, so we just need to expand the gradient # along each axis that is reduced. shape = list(grad_new.shape) shape[2:] = self.input_shape[1:] grad_new = grad_new.expand(*shape) return grad_new ================================================ FILE: auto_LiRPA/operators/relu.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### """BoundRelu.""" from typing import Optional, Tuple import torch from torch import Tensor from torch.nn import Module from torch.autograd import Function from collections import OrderedDict from .base import * from .clampmult import multiply_by_A_signs from .activation_base import BoundActivation, BoundOptimizableActivation from .solver_utils import grb from ..utils import unravel_index, prod class BoundTwoPieceLinear(BoundOptimizableActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) if options is None: options = {} self.options = options self.ibp_intermediate = True self.splittable = True self.relu_options = options.get('activation_bound_option', 'adaptive') self.use_sparse_spec_alpha = options.get('sparse_spec_alpha', False) self.use_sparse_features_alpha = options.get('sparse_features_alpha', False) self.alpha_lookup_idx = self.alpha_indices = None self.beta = self.masked_beta = self.sparse_betas = None self.split_beta_used = False self.history_beta_used = False self.flattened_nodes = None self.patch_size = {} self.cut_used = False self.cut_module = None self.gcp_unstable_relu_indicators = None def init_opt_parameters(self, start_nodes): ref = self.inputs[0].lower # a reference variable for getting the shape batch_size = ref.size(0) self.alpha = OrderedDict() self.alpha_lookup_idx = OrderedDict() # For alpha with sparse spec dimention. self.alpha_indices = None # indices of non-zero alphas. verbosity = self.options.get('verbosity', 0) # Alpha can be sparse in both spec dimension, and the C*H*W dimension. # We first deal with the sparse-feature alpha, which is sparse in the # C*H*W dimesnion of this layer. minimum_sparsity = self.options.get('minimum_sparsity', 0.9) if (self.use_sparse_features_alpha and self.inputs[0].is_lower_bound_current() and self.inputs[0].is_upper_bound_current()): # Pre-activation bounds available, we will store the alpha for unstable neurons only. # Since each element in a batch can have different unstable neurons, # for simplicity we find a super-set using any(dim=0). # This can be non-ideal if the x in a batch are very different. self.get_unstable_idx() total_neuron_size = self.inputs[0].lower.numel() // batch_size if self.alpha_indices[0].size(0) <= minimum_sparsity * total_neuron_size: # Shape is the number of unstable neurons in this layer. alpha_shape = [self.alpha_indices[0].size(0)] # Skip the batch, spec dimension, and find the lower slopes for all unstable neurons. if len(self.alpha_indices) == 1: # This layer is after a linear layer. alpha_init = self.init_d[:, :, self.alpha_indices[0]] elif len(self.alpha_indices) == 3: # This layer is after a conv2d layer. alpha_init = self.init_d[ :, :, self.alpha_indices[0], self.alpha_indices[1], self.alpha_indices[2]] elif len(self.alpha_indices) == 2: # This layer is after a conv1d layer. alpha_init = self.init_d[ :, :, self.alpha_indices[0], self.alpha_indices[1]] else: raise ValueError if verbosity > 0: print(f'layer {self.name} using sparse-features alpha with shape {alpha_shape}; unstable size ' f'{self.alpha_indices[0].size(0)}; total size {total_neuron_size} ({list(ref.shape)})') else: alpha_shape = self.shape # Full alpha. 
alpha_init = self.init_d if verbosity > 0: print(f'layer {self.name} using full alpha with shape {alpha_shape}; unstable size ' f'{self.alpha_indices[0].size(0)}; total size {total_neuron_size} ({list(ref.shape)})') self.alpha_indices = None # Use full alpha. else: alpha_shape = self.shape # Full alpha. alpha_init = self.init_d # Now we start to create alphas for all start nodes. # When sparse-spec feature is enabled, alpha is created for only # unstable neurons in start node. for start_node in start_nodes: ns, output_shape, unstable_idx = start_node[:3] if isinstance(output_shape, (list, tuple)): if len(output_shape) > 1: size_s = prod(output_shape) # Conv layers. else: size_s = output_shape[0] else: size_s = output_shape # unstable_idx may be a tensor (dense layer or conv layer # with shared alpha), or tuple of 3-d tensors (conv layer with # non-sharing alpha). sparsity = float('inf') if unstable_idx is None else unstable_idx.size(0) if isinstance(unstable_idx, torch.Tensor) else unstable_idx[0].size(0) if sparsity <= minimum_sparsity * size_s and self.use_sparse_spec_alpha: # For fully connected layer, or conv layer with shared alpha per channel. # shape is (2, sparse_spec, batch, this_layer_shape) # We create sparse specification dimension, where the spec dimension of alpha only includes slopes for unstable neurons in start_node. self.alpha[ns] = torch.empty([self.alpha_size, sparsity + 1, batch_size, *alpha_shape], dtype=torch.float, device=ref.device, requires_grad=True) self.alpha[ns].data.copy_(alpha_init.data) # This will broadcast to (2, sparse_spec) dimensions. if verbosity > 0: print(f'layer {self.name} start_node {ns} using sparse-spec alpha {list(self.alpha[ns].size())}' f' with unstable size {sparsity} total_size {size_s} output_shape {output_shape}') # unstable_idx is a list of used neurons (or channels for BoundConv) for the start_node. assert unstable_idx.ndim == 1 if isinstance(unstable_idx, torch.Tensor) else unstable_idx[0].ndim == 1 # We only need to the alpha for the unstable neurons in start_node. indices = torch.arange(1, sparsity + 1, device=alpha_init.device, dtype=torch.long) if isinstance(output_shape, int) or len(output_shape) == 1: # Fully connected layers, or conv layer in patches mode with partially shared alpha (pixels in the same channel use the same alpha). self.alpha_lookup_idx[ns] = torch.zeros(size_s, dtype=torch.long, device=alpha_init.device) # This lookup table maps the unstable_idx to the actual alpha location in self.alpha[ns]. # Note that self.alpha[ns][:,0] is reserved for any unstable neurons that are not found in the lookup table. This usually should not # happen, unless reference bounds are not properly set. self.alpha_lookup_idx[ns].data[unstable_idx] = indices else: # conv layer in matrix mode, or in patches mode but with non-shared alpha. The lookup table is 3-d. assert len(output_shape) == 3 self.alpha_lookup_idx[ns] = torch.zeros(output_shape, dtype=torch.long, device=alpha_init.device) if isinstance(unstable_idx, torch.Tensor): # Convert the unstable index from flattend 1-d to 3-d. (matrix mode). unstable_idx_3d = unravel_index(unstable_idx, output_shape) else: # Patches mode with non-shared alpha, unstable_idx is already 3d. unstable_idx_3d = unstable_idx # Build look-up table. self.alpha_lookup_idx[ns].data[unstable_idx_3d[0], unstable_idx_3d[1], unstable_idx_3d[2]] = indices else: # alpha shape is (2, spec, batch, this_layer_shape). "this_layer_shape" may still be sparse. 
self.alpha[ns] = torch.empty([self.alpha_size, size_s, batch_size, *alpha_shape], dtype=torch.float, device=ref.device, requires_grad=True) self.alpha[ns].data.copy_(alpha_init.data) # This will broadcast to (2, spec) dimensions if verbosity > 0: print(f'layer {self.name} start_node {ns} using full alpha {list(self.alpha[ns].size())} with unstable ' f'size {sparsity if unstable_idx is not None else None} total_size {size_s} output_shape {output_shape}') # alpha_lookup_idx can be used for checking if sparse alpha is used or not. self.alpha_lookup_idx[ns] = None def select_alpha_by_idx(self, last_lA, last_uA, unstable_idx, start_node): # Each alpha has shape (2, output_shape, batch_size, *relu_node_shape]. # If slope is shared, output_shape will be 1. # The *relu_node_shape might be sparse (sparse-feature alpha), where the non-zero values are indicated by self.alpha_indices. # The out_shape might be sparse (sparse-spec alpha), where the non-zero values are indexed by self.alpha_lookup_idx. if unstable_idx is not None: # print(f'relu layer {self.name}, start_node {start_node}, unstable_idx {type(unstable_idx)} alpha idx {self.alpha_lookup_idx[start_node.name].size()}') if self.alpha_lookup_idx is not None: alpha_lookup_idx = self.alpha_lookup_idx[start_node.name] else: alpha_lookup_idx = None if isinstance(unstable_idx, tuple): # Start node is a conv node. selected_alpha = self.alpha[start_node.name] if isinstance(last_lA, Tensor) or isinstance(last_uA, Tensor): # Start node is a conv node but we received tensors as A matrices. # Patches mode converted to matrix, or matrix mode used. Need to select accross the spec dimension. # For this node, since it is in matrix mode, the spec dimension is out_c * out_h * out_w # Shape is [2, spec, batch, *this_layer_shape] if alpha_lookup_idx is None: if self.options['optimize_bound_args'].get('use_shared_alpha', False): # alpha is shared, and its spec dimension is always 1. In this case we do not need to select. # selected_alpha will have shape [2, 1, batch, *this_layer_shape] pass else: # alpha is not shared, so it has shape [2, spec, batch, *this_layer_shape] # Reshape the spec dimension to c*h*w so we can select used alphas based on unstable index. # Shape becomes [2, out_c, out_h, out_w, batch, *this_layer_shape] selected_alpha = selected_alpha.view(selected_alpha.size(0), *start_node.output_shape[1:], *selected_alpha.shape[2:]) selected_alpha = selected_alpha[:, unstable_idx[0], unstable_idx[1], unstable_idx[2]] else: assert alpha_lookup_idx.ndim == 3 # We only stored some alphas, and A is also sparse, so the unstable_idx must be first translated to real indices. # alpha shape is (2, sparse_spec_shape, batch_size, *relu_node_shape) where relu_node_shape can also be sparse. # We use sparse-spec alphas. Need to convert these unstable_idx[0], unstable_idx[1], unstable_idx[0] using lookup table. _unstable_idx = alpha_lookup_idx[unstable_idx[0], unstable_idx[1], unstable_idx[2]] selected_alpha = self.non_deter_index_select(selected_alpha, index=_unstable_idx, dim=1) else: # Patches mode. Alpha must be selected after unfolding, so cannot be done here. # Selection is deferred to maybe_unfold() using alpha_lookup_idx. # For partially shared alpha, its shape is (2, out_c, batch_size, *relu_node_shape). # For full alpha, its shape is (2, out_c*out_h*out_w, batch_size, *relu_node_shape). # Both the spec dimension and relu_node_shape dimensions can be sparse. pass elif unstable_idx.ndim == 1: # Start node is a FC node. 
# Only unstable neurons of the start_node neurons are used. assert alpha_lookup_idx is None or alpha_lookup_idx.ndim == 1 if self.options['optimize_bound_args'].get('use_shared_alpha', False): # Shared alpha is used, all output specs use the same alpha. No selection is needed. # The spec dim is 1 and will be broadcast. selected_alpha = self.alpha[start_node.name] else: _unstable_idx = alpha_lookup_idx[unstable_idx] if alpha_lookup_idx is not None else unstable_idx selected_alpha = self.non_deter_index_select(self.alpha[start_node.name], index=_unstable_idx, dim=1) elif unstable_idx.ndim == 2: assert alpha_lookup_idx is None, "sparse spec alpha has not been implemented yet." # Each element in the batch selects different neurons. selected_alpha = batched_index_select(self.alpha[start_node.name], index=unstable_idx, dim=1) else: raise ValueError else: # Spec dimension is dense. Alpha must not be created sparsely. assert self.alpha_lookup_idx is None or self.alpha_lookup_idx[start_node.name] is None selected_alpha = self.alpha[start_node.name] alpha_lookup_idx = None return selected_alpha, alpha_lookup_idx def reconstruct_full_alpha(self, sparse_alpha, full_alpha_shape, alpha_indices): full_alpha = torch.zeros(full_alpha_shape, dtype=sparse_alpha.dtype, device=sparse_alpha.device) if len(alpha_indices) == 1: # Relu after a dense layer. full_alpha[:, :, alpha_indices[0]] = sparse_alpha elif len(alpha_indices) == 3: # Relu after a conv2d layer. full_alpha[:, :, alpha_indices[0], alpha_indices[1], alpha_indices[2]] = sparse_alpha elif len(alpha_indices) == 2: # Relu after a conv1d layer. full_alpha[:, :, alpha_indices[0], alpha_indices[1]] = sparse_alpha else: raise ValueError return full_alpha def bound_backward(self, last_lA, last_uA, x=None, start_node=None, unstable_idx=None, reduce_bias=True, **kwargs): """ start_node: the name of the layer where the backward bound propagation starts. Can be the output layer or an intermediate layer. unstable_idx: indices for the unstable neurons, whose bounds need to be computed. Either be a tuple (for patches) or a 1-D tensor. """ lower = x.lower upper = x.upper # Get element-wise CROWN linear relaxations. (upper_d, upper_b, lower_d, lower_b, lb_lower_d, ub_lower_d, lb_upper_d, ub_upper_d, lb_upper_b, ub_upper_b, alpha_lookup_idx) = \ self._backward_relaxation(last_lA, last_uA, x, start_node, unstable_idx) # save for calculate babsr score self.d = upper_d self.lA = last_lA # Save for initialization bounds. self.init_d = lower_d # Choose upper or lower bounds based on the sign of last_A def _bound_oneside(last_A, d_pos, d_neg, b_pos, b_neg): if last_A is None: return None, 0 # Obtain the new linear relaxation coefficients based on the signs in last_A. same_slope = True if self.relu_options == "same-slope" else False _A, _bias = multiply_by_A_signs( last_A, d_pos, d_neg, b_pos, b_neg, reduce_bias=reduce_bias, same_slope=same_slope) if isinstance(last_A, Patches): # Save the patch size, which will be used in init_alpha() to determine the number of optimizable parameters. A_prod = _A.patches if start_node is not None: if last_A.unstable_idx is not None: # Sparse patches, we need to construct the full patch size: (out_c, batch, out_h, out_w, c, h, w). self.patch_size[start_node.name] = [ last_A.output_shape[1], A_prod.size(1), last_A.output_shape[2], last_A.output_shape[3], A_prod.size(-3), A_prod.size(-2), A_prod.size(-1)] else: # Regular patches. 
self.patch_size[start_node.name] = A_prod.size() return _A, _bias ######## A problem with patches mode for cut constraint start ########## # There are cases that the node that is in the constraint but not selected by the patches for the output node # trick: only count the small patches that have all the split node coeffs[ci].sum() equal to coeffs_unfolded[ci][out_h, out_w, -1].sum() # we should force these beta to be 0 to disable the effect of these constraints A = last_lA if last_lA is not None else last_uA current_layer_shape = lower.size()[1:] if self.cut_used and type(A) is Patches: self.cut_module.patch_trick(start_node, self.name, A, current_layer_shape) ######## A problem with patches mode for cut constraint end ########## if self.cut_used: if self.leaky_alpha > 0: raise NotImplementedError # propagate postrelu node in cut constraints last_lA, last_uA = self.cut_module.relu_cut( start_node, self.name, last_lA, last_uA, current_layer_shape, unstable_idx, batch_mask=self.inputs[0].alpha_beta_update_mask) # In patches mode we might need an unfold. # lower_d, upper_d, lower_b, upper_b: 1, batch, current_c, current_w, current_h or None upper_d = maybe_unfold_patches(upper_d, last_lA if last_lA is not None else last_uA) lower_d = maybe_unfold_patches(lower_d, last_lA if last_lA is not None else last_uA) upper_b = maybe_unfold_patches(upper_b, last_lA if last_lA is not None else last_uA) lower_b = maybe_unfold_patches(lower_b, last_lA if last_lA is not None else last_uA) # for ReLU it is always None; keeping it here for completeness. # ub_lower_d and lb_lower_d might have sparse spec dimension, so they may need alpha_lookup_idx to convert to actual spec dim. ub_lower_d = maybe_unfold_patches(ub_lower_d, last_uA, alpha_lookup_idx=alpha_lookup_idx) ub_upper_d = maybe_unfold_patches(ub_upper_d, last_uA, alpha_lookup_idx=alpha_lookup_idx) # optimizable slope lb_lower_d: spec (only channels in spec layer), batch, current_c, current_w, current_h # patches mode lb_lower_d after unfold: unstable, batch, in_C, H, W lb_lower_d = maybe_unfold_patches(lb_lower_d, last_lA, alpha_lookup_idx=alpha_lookup_idx) lb_upper_d = maybe_unfold_patches(lb_upper_d, last_lA, alpha_lookup_idx=alpha_lookup_idx) # ub_upper_b and lb_upper_b can also be optimizable variables, just like ub/lb_upper/lower_d. # This is only possible when alpha is optimized in the "same-slope" setting, where we move the linear upper bound together with the lower bound. ub_upper_b = maybe_unfold_patches(ub_upper_b, last_lA, alpha_lookup_idx=alpha_lookup_idx) lb_upper_b = maybe_unfold_patches(lb_upper_b, last_lA, alpha_lookup_idx=alpha_lookup_idx) if self.cut_used: assert reduce_bias # Here, we create a tuple includes 3 masks: # unstable_indicators. unstable neuron mask. # positive_indicators. previous unstable now split on z = 1. # negative_indicators. previous unstable now split on z = 0. 
unstable_indicators = (lower < 0) * (upper > 0) positive_indicators = ~(lower < 0) & self.gcp_unstable_relu_indicators negative_indicators = ~(upper > 0) & self.gcp_unstable_relu_indicators relu_indicators = (unstable_indicators, positive_indicators, negative_indicators) # propagate integer var of relu neuron (arelu) in cut constraints through relu layer lA, uA, lbias, ubias = self.cut_module.arelu_cut( start_node, self.name, last_lA, last_uA, lower_d, upper_d, lower_b, upper_b, lb_lower_d, ub_lower_d, relu_indicators, x, self.patch_size, current_layer_shape, unstable_idx, batch_mask=self.inputs[0].alpha_beta_update_mask) else: uA, ubias = _bound_oneside( last_uA, ub_upper_d if upper_d is None else upper_d, ub_lower_d if lower_d is None else lower_d, ub_upper_b if ub_upper_b is not None else upper_b, lower_b) lA, lbias = _bound_oneside( last_lA, lb_lower_d if lower_d is None else lower_d, lb_upper_d if upper_d is None else upper_d, lower_b, lb_upper_b if lb_upper_b is not None else upper_b) if self.cut_used: # propagate prerelu node in cut constraints lA, uA = self.cut_module.pre_cut( start_node, self.name, lA, uA, current_layer_shape, unstable_idx, batch_mask=self.inputs[0].alpha_beta_update_mask) self.masked_beta_lower = self.masked_beta_upper = None return [(lA, uA)], lbias, ubias def _transfer_alpha_lookup_idx(self, alpha_lookup_idx, device=None, dtype=None, non_blocking=False): if alpha_lookup_idx is None: return None alpha_lookup_idx = {spec_name: transfer(idx, device=device, dtype=dtype, non_blocking=non_blocking) if idx is not None else None for spec_name, idx in alpha_lookup_idx.items()} return alpha_lookup_idx def _transfer_alpha_indices(self, alpha_indices, device=None, dtype=None, non_blocking=False): if alpha_indices is None: return None alpha_indices = [transfer(indices, device=device, dtype=dtype, non_blocking=non_blocking) for indices in alpha_indices] return alpha_indices def dump_alpha(self, device=None, dtype=None, non_blocking=False): ret = {'alpha': self._transfer_alpha(self.alpha, device=device, dtype=dtype, non_blocking=non_blocking, require_grad=False)} if self.use_sparse_spec_alpha: ret['alpha_lookup_idx'] = self._transfer_alpha_lookup_idx(self.alpha_lookup_idx, device=device, dtype=None, non_blocking=non_blocking) if self.use_sparse_features_alpha: ret['alpha_indices'] = self._transfer_alpha_indices(self.alpha_indices, device=device, dtype=None, non_blocking=non_blocking) return ret def restore_alpha(self, alpha, device=None, dtype=None, non_blocking=False): self.alpha = self._transfer_alpha(alpha['alpha'], device=device, dtype=dtype, non_blocking=non_blocking, require_grad=True) if self.use_sparse_spec_alpha: self.alpha_lookup_idx = self._transfer_alpha_lookup_idx(alpha['alpha_lookup_idx'], device=device, dtype=None, non_blocking=non_blocking) if self.use_sparse_features_alpha: self.alpha_indices = self._transfer_alpha_indices(alpha['alpha_indices'], device=device, dtype=None, non_blocking=non_blocking) def drop_unused_alpha(self, keep_nodes): for spec_name in list(self.alpha.keys()): # If the spec_name is not in keep_nodes, we delete it. if spec_name not in keep_nodes: del self.alpha[spec_name] # if use_sparse_spec_alpha is True, we also delete the alpha_lookup_idx if needed. if self.use_sparse_spec_alpha: del self.alpha_lookup_idx[spec_name] # if there is no alpha left and use_sparse_features_alpha is True, # we also delete the alpha_indices. 
if not self.alpha and self.use_sparse_features_alpha: self.alpha_indices = None class BoundRelu(BoundTwoPieceLinear): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) if attr is None: attr = {} self.leaky_alpha = attr.get('alpha', 0) self.alpha_size = 2 # Alpha dimension is (2, output_shape, batch, *shape) for ReLU. def get_unstable_idx(self): self.alpha_indices = torch.logical_and( self.inputs[0].lower < 0, self.inputs[0].upper > 0).any(dim=0).nonzero(as_tuple=True) def clip_alpha(self): for v in self.alpha.values(): v.data = torch.clamp(v.data, self.leaky_alpha, 1.) def forward(self, x): self.shape = x.shape[1:] if self.flattened_nodes is None: self.flattened_nodes = x[0].reshape(-1).shape[0] if self.leaky_alpha > 0: return F.leaky_relu(x, negative_slope=self.leaky_alpha) else: return F.relu(x) def _relu_lower_bound_init(self, upper_k): """Return the initial lower bound without relaxation.""" if self.relu_options == "same-slope": # the same slope for upper and lower lower_k = upper_k elif self.relu_options == "zero-lb": # Always use slope 0 as lower bound. Any value between 0 and 1 is a valid lower bound for CROWN lower_k = torch.zeros_like(upper_k) lower_k = (upper_k >= 1.0).to(upper_k) if self.leaky_alpha > 0: lower_k += (upper_k < 1.0).to(upper_k) * self.leaky_alpha elif self.relu_options == "one-lb": # Always use slope 1 as lower bound lower_k = ((upper_k > self.leaky_alpha).to(upper_k) + (upper_k <= self.leaky_alpha).to(upper_k) * self.leaky_alpha) else: # adaptive if self.leaky_alpha == 0: lower_k = (upper_k > 0.5).to(upper_k) else: # FIXME this may not be optimal for leaky relu lower_k = ((upper_k > 0.5).to(upper_k) + (upper_k <= 0.5).to(upper_k) * self.leaky_alpha) return lower_k def _relu_upper_opt_same_slope(self, lb_lower_d, ub_lower_d, upper_d, lower, upper): """ When the "same-slope" option is enabled in the CROWN-Optimized method, lower_d is obtained directly from the optimizable parameters, so we force upper_d to be the same as lower_d. We want the same-slope upper bound to be as tight as possible, so it should pass through one of the vertices of the triangular convex hull of ReLU. upper_d holds the slopes of the upper bounds computed with the normal triangle relaxation. For a single element: - lb_lower_d > upper_d => The same-slope upper bound should pass through the left endpoint of relu; - lb_lower_d < upper_d => The same-slope upper bound should pass through the right endpoint of relu. """ lower_y = F.relu(lower) upper_y = F.relu(upper) if lb_lower_d is None: lb_upper_d = lb_upper_b = None else: lb_upper_d = lb_lower_d b_left = lower_y - lb_upper_d * lower b_right = upper_y - lb_upper_d * upper use_left_end = (lb_lower_d >= upper_d) lb_upper_b = use_left_end * b_left + ~use_left_end * b_right if ub_lower_d is None: ub_upper_d = ub_upper_b = None else: ub_upper_d = ub_lower_d b_left = lower_y - ub_upper_d * lower b_right = upper_y - ub_upper_d * upper use_left_end = (ub_lower_d >= upper_d) ub_upper_b = use_left_end * b_left + ~use_left_end * b_right return lb_upper_d, lb_upper_b, ub_upper_d, ub_upper_b def _forward_relaxation(self, x): self._init_masks(x) self.mask_pos = self.mask_pos.to(x.lower) self.mask_both = self.mask_both.to(x.lower) upper_k, upper_b = self._relu_upper_bound( x.lower, x.upper, self.leaky_alpha) self.uw = self.mask_pos + self.mask_both * upper_k self.ub = self.mask_both * upper_b if self.opt_stage in ['opt', 'reuse']: # Each actual alpha in the forward mode has shape (batch_size, *relu_node_shape].
# But self.alpha has shape (2, output_shape, batch_size, *relu_node_shape] # and we do not need its first two dimensions. lower_k = self.alpha['_forward'][0, 0] else: lower_k = self._relu_lower_bound_init(upper_k) # NOTE #FIXME Saved for initialization bounds for optimization. # In the backward mode, same-slope bounds are used. # But here it is using adaptive bounds which seem to be better # for nn4sys benchmark with loose input bounds. Need confirmation # for other cases. self.lower_d = lower_k.detach() # saved for initializing optimized bounds self.lw = self.mask_both * lower_k + self.mask_pos def bound_dynamic_forward(self, x, max_dim=None, offset=0): if self.leaky_alpha > 0: raise NotImplementedError if not hasattr(self, 'upper_k'): # x.lower and x.upper remain same all the time, # so the following only need to do once self.upper_k, self.upper_b = self._relu_upper_bound( x.lower, x.upper, self.leaky_alpha) self.upper_b /= 2 self.device = x.lw.device self.batch_size = x.lower.shape[0] self.unstable = torch.logical_and(x.lower < 0, x.upper > 0).view(self.batch_size, -1).to(torch.int) self.tot_dim = x.tot_dim + int(self.unstable.sum(dim=-1).max()) self.b_new = self.upper_k * x.lb + self.upper_b b_new = self.b_new batch_size = self.batch_size device = self.device unstable = self.unstable if x.lw.shape[1]: # Compute only when x.lw is not empty w_new = self.upper_k.unsqueeze(1) * x.lw else: w_new = torch.empty_like(x.lw) if offset + w_new.shape[1] < x.tot_dim: return LinearBound( w_new, b_new, w_new, b_new, x_L=x.x_L, x_U=x.x_U, tot_dim=self.tot_dim) # Create new variables for unstable ReLU index = torch.cumsum(unstable, dim=-1).to(torch.int64) index = (index - (offset + w_new.shape[1] - x.tot_dim)).clamp(min=0) num_new_dim = int(index.max()) num_new_dim_actual = min(num_new_dim, max_dim - w_new.shape[1]) index = index.clamp(max=num_new_dim_actual+1) w_unstable = torch.zeros(batch_size, num_new_dim_actual + 2, unstable.size(-1), device=device) x_L_unstable = -torch.ones(batch_size, num_new_dim_actual, device=device) x_U_unstable = torch.ones(batch_size, num_new_dim_actual, device=device) w_unstable.scatter_(dim=1, index=index.unsqueeze(1), src=self.upper_b.view(batch_size, 1, -1), reduce='add') w_unstable = w_unstable[:, 1:-1].view(batch_size, num_new_dim_actual, *w_new.shape[2:]) w_new = torch.cat([w_new, w_unstable], dim=1) x_L_new = torch.cat([x.x_L, x_L_unstable], dim=-1) x_U_new = torch.cat([x.x_U, x_U_unstable], dim=-1) return LinearBound( w_new, b_new, w_new, b_new, x_L=x_L_new, x_U=x_U_new, tot_dim=self.tot_dim) def bound_forward(self, dim_in, x): self._forward_relaxation(x) lb = self.lw * x.lb ub = self.uw * x.ub + self.ub lw = (self.lw.unsqueeze(1) * x.lw) if x.lw is not None else None uw = (self.uw.unsqueeze(1) * x.uw) if x.uw is not None else None if not lw.requires_grad: del self.mask_both, self.mask_pos del self.lw, self.uw, self.ub return LinearBound(lw, lb, uw, ub) @staticmethod @torch.jit.script def _relu_upper_bound(lb, ub, leaky_alpha: float): """Upper bound slope and intercept according to CROWN relaxation.""" lb_r = lb.clamp(max=0) ub_r = ub.clamp(min=0) ub_r = torch.max(ub_r, lb_r + 1e-8) if leaky_alpha > 0: upper_d = (ub_r - leaky_alpha * lb_r) / (ub_r - lb_r) upper_b = - lb_r * upper_d + leaky_alpha * lb_r else: upper_d = ub_r / (ub_r - lb_r) upper_b = - lb_r * upper_d return upper_d, upper_b @staticmethod def _relu_mask_alpha(lower, upper, lb_lower_d : Optional[Tensor], ub_lower_d : Optional[Tensor], leaky_alpha : float = 0, ) -> Tuple[Optional[Tensor], 
Optional[Tensor], Tensor]: lower_mask = (lower >= 0).requires_grad_(False).to(lower.dtype) upper_mask = (upper <= 0).requires_grad_(False) if leaky_alpha > 0: zero_coeffs = False else: zero_coeffs = upper_mask.all() no_mask = (1. - lower_mask) * (1. - upper_mask.to(upper.dtype)) if lb_lower_d is not None: lb_lower_d = ( torch.clamp(lb_lower_d, min=leaky_alpha, max=1.) * no_mask + lower_mask) if leaky_alpha > 0: lb_lower_d += upper_mask * leaky_alpha if ub_lower_d is not None: ub_lower_d = ( torch.clamp(ub_lower_d, min=leaky_alpha, max=1.) * no_mask + lower_mask) if leaky_alpha > 0: ub_lower_d += upper_mask * leaky_alpha return lb_lower_d, ub_lower_d, zero_coeffs def _backward_relaxation(self, last_lA, last_uA, x, start_node, unstable_idx): # Usage of output constraints requires access to bounds of the previous iteration # (see _clear_and_set_new) if x is not None: lower = x.lower upper = x.upper else: lower = self.lower upper = self.upper # Upper bound slope and intercept according to CROWN relaxation. upper_d, upper_b = self._relu_upper_bound(lower, upper, self.leaky_alpha) flag_expand = False ub_lower_d = lb_lower_d = None ub_upper_d = lb_upper_d = None ub_upper_b = lb_upper_b = None lower_b = None # ReLU does not have lower bound intercept (=0). alpha_lookup_idx = None # For sparse-spec alpha. if self.opt_stage in ['opt', 'reuse']: # Alpha-CROWN. lower_d = None selected_alpha, alpha_lookup_idx = self.select_alpha_by_idx( last_lA, last_uA, unstable_idx, start_node) # The first dimension is lower/upper intermediate bound. if last_lA is not None: lb_lower_d = selected_alpha[0] if last_uA is not None: ub_lower_d = selected_alpha[1] if self.alpha_indices is not None: # Sparse alpha on the hwc dimension. We store slopes for unstable neurons in this layer only. # Recover to full alpha first. sparse_alpha_shape = lb_lower_d.shape if lb_lower_d is not None else ub_lower_d.shape full_alpha_shape = sparse_alpha_shape[:-1] + self.shape if lb_lower_d is not None: lb_lower_d = self.reconstruct_full_alpha( lb_lower_d, full_alpha_shape, self.alpha_indices) if ub_lower_d is not None: ub_lower_d = self.reconstruct_full_alpha( ub_lower_d, full_alpha_shape, self.alpha_indices) lb_lower_d, ub_lower_d, zero_coeffs = self._relu_mask_alpha(lower, upper, lb_lower_d, ub_lower_d, leaky_alpha=self.leaky_alpha) self.zero_backward_coeffs_l = self.zero_backward_coeffs_u = zero_coeffs flag_expand = True # we already have the spec dimension. if self.relu_options == "same-slope": # same-slope with optimized lower_d # We force upper_d to be the same as lower_d, and compute the corresponding upper_b lb_upper_d, lb_upper_b, ub_upper_d, ub_upper_b = self._relu_upper_opt_same_slope(lb_lower_d, ub_lower_d, upper_d, lower, upper) else: # FIXME: the shape can be incorrect if unstable_idx is not None. # This will cause problem if some ReLU layers are optimized, some are not. lower_d = self._relu_lower_bound_init(upper_d) # Upper bound always needs an extra specification dimension, since they only depend on lb and ub. upper_d = upper_d.unsqueeze(0) upper_b = upper_b.unsqueeze(0) if not flag_expand: # FIXME: The following lines seem unused since # flag_expand must be true when self.optstage in ['opt, 'reuse'] if self.opt_stage in ['opt', 'reuse']: # We have different slopes for lower and upper bounds propagation. 
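# Illustrative sketch (not part of this file): the element-wise ReLU relaxation used above
# (see _relu_upper_bound and _relu_lower_bound_init) upper-bounds relu(x) on an unstable interval
# [l, u] (l < 0 < u) by the chord with slope u / (u - l) and intercept -l * u / (u - l), while any
# slope in [0, 1] through the origin is a valid lower bound (the range alpha is clamped to in
# clip_alpha when leaky_alpha == 0). The concrete numbers below are made up for illustration only.
import torch
l = torch.tensor(-1.0)
u = torch.tensor(2.0)
upper_d = u / (u - l)          # chord slope for a plain ReLU (leaky_alpha == 0)
upper_b = -l * upper_d         # chord intercept
xs = torch.linspace(l.item(), u.item(), steps=101)
assert torch.all(upper_d * xs + upper_b >= torch.relu(xs) - 1e-6)
for lower_d in (0.0, 0.5, 1.0):
    assert torch.all(lower_d * xs <= torch.relu(xs) + 1e-6)
# End of sketch.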
lb_lower_d = lb_lower_d.unsqueeze(0) if last_lA is not None else None ub_lower_d = ub_lower_d.unsqueeze(0) if last_uA is not None else None if self.relu_options == "same-slope": upper_d = None lb_upper_d = lb_upper_d.unsqueeze(0) if last_lA is not None else None lb_upper_b = lb_upper_b.unsqueeze(0) if last_lA is not None else None ub_upper_d = ub_upper_d.unsqueeze(0) if last_uA is not None else None ub_upper_b = ub_upper_b.unsqueeze(0) if last_uA is not None else None else: lower_d = lower_d.unsqueeze(0) if self.opt_stage in ['opt', 'reuse'] and self.relu_options == "same-slope": # Remove upper_d and upper_b to avoid confusion later upper_d = None upper_b = None return (upper_d, upper_b, lower_d, lower_b, lb_lower_d, ub_lower_d, lb_upper_d, ub_upper_d, lb_upper_b, ub_upper_b, alpha_lookup_idx) def interval_propagate(self, *v): h_L, h_U = v[0][0], v[0][1] return self.forward(h_L), self.forward(h_U) def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): if self.leaky_alpha > 0: raise NotImplementedError # e.g., last layer input gurobi vars (8,16,16) gvars_array = np.array(v[0]) this_layer_shape = gvars_array.shape assert gvars_array.shape == self.output_shape[1:] pre_lbs = self.inputs[0].lower.cpu().detach().numpy().reshape(-1) pre_ubs = self.inputs[0].upper.cpu().detach().numpy().reshape(-1) new_layer_gurobi_vars = [] relu_integer_vars = [] new_relu_layer_constrs = [] # predefined zero variable shared in the whole solver model zero_var = model.getVarByName("zero") for neuron_idx, pre_var in enumerate(gvars_array.reshape(-1)): pre_ub = pre_ubs[neuron_idx] pre_lb = pre_lbs[neuron_idx] if pre_lb >= 0: # ReLU is always passing var = pre_var elif pre_ub <= 0: var = zero_var else: ub = pre_ub var = model.addVar(ub=ub, lb=0, obj=0, vtype=grb.GRB.CONTINUOUS, name=f'ReLU{self.name}_{neuron_idx}') if model_type == "mip" or model_type == "lp_integer": # binary indicator if model_type == "mip": a = model.addVar(vtype=grb.GRB.BINARY, name=f'aReLU{self.name}_{neuron_idx}') elif model_type == "lp_integer": a = model.addVar(ub=1, lb=0, vtype=grb.GRB.CONTINUOUS, name=f'aReLU{self.name}_{neuron_idx}') relu_integer_vars.append(a) new_relu_layer_constrs.append( model.addConstr(pre_var - pre_lb * (1 - a) >= var, name=f'ReLU{self.name}_{neuron_idx}_a_0')) new_relu_layer_constrs.append( model.addConstr(var >= pre_var, name=f'ReLU{self.name}_{neuron_idx}_a_1')) new_relu_layer_constrs.append( model.addConstr(pre_ub * a >= var, name=f'ReLU{self.name}_{neuron_idx}_a_2')) elif model_type == "lp": new_relu_layer_constrs.append( model.addConstr(var >= pre_var, name=f'ReLU{self.name}_{neuron_idx}_a_0')) new_relu_layer_constrs.append(model.addConstr( pre_ub * pre_var - (pre_ub - pre_lb) * var >= pre_ub * pre_lb, name=f'ReLU{self.name}_{neuron_idx}_a_1')) else: print(f"gurobi model type {model_type} not supported!") new_layer_gurobi_vars.append(var) new_layer_gurobi_vars = np.array(new_layer_gurobi_vars).reshape(this_layer_shape).tolist() if model_type in ["mip", "lp_integer"]: self.integer_vars = relu_integer_vars self.solver_vars = new_layer_gurobi_vars self.solver_constrs = new_relu_layer_constrs model.update() def build_gradient_node(self, grad_upstream): if self.leaky_alpha > 0: raise NotImplementedError node_grad = ReLUGrad() grad_input = (grad_upstream, self.inputs[0].forward_value) # An extra node is needed to consider the state of ReLU activation grad_extra_nodes = [self.inputs[0]] return [(node_grad, grad_input, grad_extra_nodes)] def get_split_mask(self, lower, upper, input_index): 
assert input_index == 0 return torch.logical_and(lower < 0, upper > 0) # Return unstable mask to determine which neuron should use constraints_solving concretization def get_unstable_mask(self, lower, upper): """Return a mask to indicate if each neuron is unstable. 0: Stable (linear) neuron; 1: unstable (nonlinear) neuron. """ return torch.logical_and(lower < 0, upper > 0) # Return heuristic to select which neuron should use constraints_solving concretization def compute_bound_improvement_heuristics(self, lower, upper): """Return a heuristic score for each lower-upper bound pair. It indicates the possible bound improvement for each neuron. We will then choose if a neuron's bound needs further tightened based on the heuristic """ # This heuristic is actually BaBSR-interception-only. return (-lower * upper).clamp(min=0) / (upper - lower + 1e-8).abs() class BoundLeakyRelu(BoundRelu): pass class BoundSign(BoundActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.splittable = True def forward(self, x): return torch.sign(x) def bound_relax(self, x, init=False): if init: self.init_linear_relaxation(x) mask_0 = torch.logical_and(x.lower == 0, x.upper == 0) mask_pos_0 = torch.logical_and(x.lower == 0, x.upper > 0) mask_neg_0 = torch.logical_and(x.lower < 0, x.upper == 0) mask_pos = x.lower > 0 mask_neg = x.upper < 0 mask_both = torch.logical_not(torch.logical_or(torch.logical_or( mask_0, torch.logical_or(mask_pos, mask_pos_0)), torch.logical_or(mask_neg, mask_neg_0))) self.add_linear_relaxation(mask=mask_0, type='lower', k=0, x0=torch.zeros_like(x.upper, requires_grad=True), y0=0) self.add_linear_relaxation(mask=mask_0, type='upper', k=0, x0=torch.zeros_like(x.upper, requires_grad=True), y0=0) self.add_linear_relaxation(mask=mask_pos_0, type='lower', k=1/x.upper.clamp(min=1e-8), x0=torch.zeros_like(x.upper), y0=0) self.add_linear_relaxation(mask=torch.logical_or(mask_pos_0, mask_pos), type='upper', k=0, x0=torch.zeros_like(x.upper, requires_grad=True), y0=1) self.add_linear_relaxation(mask=torch.logical_or(mask_neg_0, mask_neg), type='lower', k=0, x0=torch.zeros_like(x.upper, requires_grad=True), y0=-1) self.add_linear_relaxation(mask=mask_neg_0, type='upper', k=-1/x.lower.clamp(max=-1e-8), x0=torch.zeros_like(x.upper), y0=0) self.add_linear_relaxation(mask=mask_pos, type='lower', k=0, x0=torch.zeros_like(x.upper, requires_grad=True), y0=1) self.add_linear_relaxation(mask=mask_neg, type='upper', k=0, x0=torch.zeros_like(x.upper, requires_grad=True), y0=-1) self.add_linear_relaxation(mask=mask_both, type='lower', k=0, x0=torch.zeros_like(x.upper, requires_grad=True), y0=-1) self.add_linear_relaxation(mask=mask_both, type='upper', k=0, x0=torch.zeros_like(x.upper, requires_grad=True), y0=1) class SignMergeFunction_loose(torch.autograd.Function): # Modified SignMerge operator. # Change its backward function so that the "gradient" can be used for pgd attack @staticmethod def forward(ctx, input): ctx.save_for_backward(input) output = torch.sign(torch.sign(input) + 1e-1) return output @staticmethod def backward(ctx, grad_output): eps = 5 # should be carefully chosen input, = ctx.saved_tensors grad_input = grad_output.clone() grad_input[abs(input) >= eps] = 0 grad_input /= eps return grad_input class SignMergeFunction_tight(torch.autograd.Function): # Modified SignMerge operator. 
# Change its backward function so that the "gradient" can be used for pgd attack @staticmethod def forward(ctx, input): ctx.save_for_backward(input) output = torch.sign(torch.sign(input) + 1e-1) return output @staticmethod def backward(ctx, grad_output): eps = 0.1 # should be carefully chosen input, = ctx.saved_tensors grad_input = grad_output.clone() grad_input[abs(input) >= eps] = 0 grad_input /= eps return grad_input class BoundSignMerge(BoundTwoPieceLinear): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.alpha_size = 4 self.loose_function = SignMergeFunction_loose self.tight_function = SignMergeFunction_tight self.signmergefunction = self.tight_function # default def get_unstable_idx(self): self.alpha_indices = torch.logical_and( self.inputs[0].lower < 0, self.inputs[0].upper >= 0).any(dim=0).nonzero(as_tuple=True) def forward(self, x): self.shape = x.shape[1:] return self.signmergefunction.apply(x) def _mask_alpha(self, lower, upper, lb_lower_d, ub_lower_d, lb_upper_d, ub_upper_d): lower_mask = (lower >= 0.).requires_grad_(False).to(lower.dtype) upper_mask = (upper < 0.).requires_grad_(False).to(upper.dtype) no_mask = 1. - (lower_mask + upper_mask) if lb_lower_d is not None: lb_lower_d = torch.min(lb_lower_d, 2/upper.clamp(min=1e-8)) lb_lower_d = torch.clamp(lb_lower_d, min=0) * no_mask lb_upper_d = torch.min(lb_upper_d, -2/lower.clamp(max=-1e-8)) lb_upper_d = torch.clamp(lb_upper_d, min=0) * no_mask if ub_lower_d is not None: ub_lower_d = torch.min(ub_lower_d, 2/upper.clamp(min=1e-8)) ub_lower_d = torch.clamp(ub_lower_d, min=0) * no_mask ub_upper_d = torch.min(ub_upper_d, -2/lower.clamp(max=-1e-8)) ub_upper_d = torch.clamp(ub_upper_d, min=0) * no_mask return lb_lower_d, ub_lower_d, lb_upper_d, ub_upper_d def _backward_relaxation(self, last_lA, last_uA, x, start_node, unstable_idx): if x is not None: lower, upper = x.lower, x.upper else: lower, upper = self.lower, self.upper flag_expand = False ub_lower_d = lb_lower_d = lb_upper_d = ub_upper_d = None alpha_lookup_idx = None # For sparse-spec alpha. if self.opt_stage in ['opt', 'reuse']: # Alpha-CROWN. upper_d = lower_d = None selected_alpha, alpha_lookup_idx = self.select_alpha_by_idx( last_lA, last_uA, unstable_idx, start_node) # The first dimension is lower/upper intermediate bound. if last_lA is not None: lb_lower_d = selected_alpha[0] lb_upper_d = selected_alpha[2] if last_uA is not None: ub_lower_d = selected_alpha[1] ub_upper_d = selected_alpha[3] if self.alpha_indices is not None: # Sparse alpha on the hwc dimension. We store slopes for unstable neurons in this layer only. # Recover to full alpha first. sparse_alpha_shape = lb_lower_d.shape if lb_lower_d is not None else ub_lower_d.shape full_alpha_shape = sparse_alpha_shape[:-1] + self.shape if lb_lower_d is not None: lb_lower_d = self.reconstruct_full_alpha( lb_lower_d, full_alpha_shape, self.alpha_indices) lb_upper_d = self.reconstruct_full_alpha( lb_upper_d, full_alpha_shape, self.alpha_indices) if ub_lower_d is not None: ub_lower_d = self.reconstruct_full_alpha( ub_lower_d, full_alpha_shape, self.alpha_indices) ub_upper_d = self.reconstruct_full_alpha( ub_upper_d, full_alpha_shape, self.alpha_indices) lb_lower_d, ub_lower_d, lb_upper_d, ub_upper_d = self._mask_alpha(lower, upper, lb_lower_d, ub_lower_d, lb_upper_d, ub_upper_d) flag_expand = True # we already have the spec dimension. 
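# Without alpha optimization (the branch below), the relaxation of the +/-1 step
# degenerates to constant bounds: both slopes are zero and the offsets come from the
# stability masks. For example (hypothetical bounds), a neuron with lower >= 0 gets
# lower_b = upper_b = +1, a neuron with upper < 0 gets -1, and an unstable neuron is
# enclosed by the constant interval [-1, +1].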
else: lower_d = torch.zeros_like(upper, requires_grad=True) upper_d = torch.zeros_like(upper, requires_grad=True) mask_pos = (x.lower >= 0.).requires_grad_(False).to(x.lower.dtype) mask_neg = (x.upper < 0.).requires_grad_(False).to(x.upper.dtype) lower_b = (-1 * (1 - mask_pos) + mask_pos).unsqueeze(0) upper_b = (-1 * mask_neg + (1 - mask_neg)).unsqueeze(0) # Upper bound always needs an extra specification dimension, since they only depend on lb and ub. if not flag_expand: if self.opt_stage in ['opt', 'reuse']: # We have different slopes for lower and upper bounds propagation. lb_lower_d = lb_lower_d.unsqueeze(0) if last_lA is not None else None ub_lower_d = ub_lower_d.unsqueeze(0) if last_uA is not None else None lb_upper_d = lb_lower_d.unsqueeze(0) if last_lA is not None else None ub_upper_d = ub_lower_d.unsqueeze(0) if last_uA is not None else None else: lower_d = lower_d.unsqueeze(0) upper_d = upper_d.unsqueeze(0) return (upper_d, upper_b, lower_d, lower_b, lb_lower_d, ub_lower_d, lb_upper_d, ub_upper_d, None, None, alpha_lookup_idx) def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): # e.g., last layer input gurobi vars (8,16,16) gvars_array = np.array(v[0]) this_layer_shape = gvars_array.shape assert gvars_array.shape == self.output_shape[1:] pre_lbs = self.inputs[0].lower.cpu().detach().numpy().reshape(-1) pre_ubs = self.inputs[0].upper.cpu().detach().numpy().reshape(-1) new_layer_gurobi_vars = [] integer_vars = [] layer_constrs = [] # predefined zero variable shared in the whole solver model one_var = model.getVarByName("one") neg_one_var = model.getVarByName("neg_one") for neuron_idx, pre_var in enumerate(gvars_array.reshape(-1)): pre_ub = pre_ubs[neuron_idx] pre_lb = pre_lbs[neuron_idx] if pre_lb >= 0: var = one_var elif pre_ub < 0: var = neg_one_var else: ub = pre_ub var = model.addVar(ub=ub, lb=pre_lb, obj=0, vtype=grb.GRB.CONTINUOUS, name=f'Sign{self.name}_{neuron_idx}') a = model.addVar(vtype=grb.GRB.BINARY, name=f'aSign{self.name}_{neuron_idx}') integer_vars.append(a) layer_constrs.append( model.addConstr(pre_lb * a <= pre_var, name=f'Sign{self.name}_{neuron_idx}_a_0')) layer_constrs.append( model.addConstr(pre_ub * (1 - a) >= pre_var, name=f'Sign{self.name}_{neuron_idx}_a_1')) layer_constrs.append( model.addConstr(var == 1 - 2*a, name=f'Sign{self.name}_{neuron_idx}_a_2')) new_layer_gurobi_vars.append(var) new_layer_gurobi_vars = np.array(new_layer_gurobi_vars).reshape(this_layer_shape).tolist() if model_type in ["mip", "lp_integer"]: self.integer_vars = integer_vars self.solver_vars = new_layer_gurobi_vars self.solver_constrs = layer_constrs model.update() def relu_grad(preact): return (preact > 0).float() class ReLUGradOp(Function): """ Local gradient of ReLU. Not including multiplication with gradients from other layers. """ @staticmethod def symbolic(_, g, g_relu, g_relu_rev, preact): return _.op('grad::Relu', g, g_relu, g_relu_rev, preact).setType(g.type()) @staticmethod def forward(ctx, g, g_relu, g_relu_rev, preact): return g * relu_grad(preact) class ReLUGrad(Module): def forward(self, g, preact): g_relu = F.relu(g) g_relu_rev = -F.relu(-g) return ReLUGradOp.apply(g, g_relu, g_relu_rev, preact) # FIXME reuse the function from auto_LiRPA.patches def _maybe_unfold(d_tensor, last_A): if d_tensor is None: return None #[batch, out_dim, in_c, in_H, in_W] d_shape = d_tensor.size() # Reshape to 4-D tensor to unfold. #[batch, out_dim*in_c, in_H, in_W] d_tensor = d_tensor.view(d_shape[0], -1, *d_shape[-2:]) # unfold the slope matrix as patches. 
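# The unfold below extracts the same sliding windows (kernel size, stride, padding)
# that produced last_A's patches, so a dense per-pixel mask or slope tensor can be
# multiplied elementwise onto last_A.patches in patch space.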
# Patch shape is [batch, out_h, out_w, out_dim*in_c, H, W). d_unfolded = inplace_unfold( d_tensor, kernel_size=last_A.patches.shape[-2:], stride=last_A.stride, padding=last_A.padding) # Reshape to [batch, out_H, out_W, out_dim, in_C, H, W] d_unfolded_r = d_unfolded.view( *d_unfolded.shape[:3], d_shape[1], *d_unfolded.shape[-2:]) if last_A.unstable_idx is not None: if len(last_A.unstable_idx) == 4: # [batch, out_H, out_W, out_dim, in_C, H, W] # to [out_H, out_W, batch, out_dim, in_C, H, W] d_unfolded_r = d_unfolded_r.permute(1, 2, 0, 3, 4, 5, 6) d_unfolded_r = d_unfolded_r[ last_A.unstable_idx[2], last_A.unstable_idx[3]] else: raise NotImplementedError # For sparse patches, the shape after unfold is # (unstable_size, batch_size, in_c, H, W). # For regular patches, the shape after unfold is # (spec, batch, out_h, out_w, in_c, H, W). return d_unfolded_r class BoundReluGrad(BoundActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.requires_input_bounds = [3] self.recurjac = options.get('recurjac', False) @staticmethod def relu_grad(preact): return (preact > 0).float() def forward(self, g, g_relu, g_relu_rev, preact): if g.ndim == preact.ndim + 1: preact = preact.unsqueeze(1) return g * relu_grad(preact) def interval_propagate(self, *v): g_lower, g_upper = v[0] preact_lower, preact_upper = v[3] relu_grad_lower = relu_grad(preact_lower) relu_grad_upper = relu_grad(preact_upper) if g_lower.ndim == relu_grad_lower.ndim + 1: relu_grad_lower = relu_grad_lower.unsqueeze(1) relu_grad_upper = relu_grad_upper.unsqueeze(1) lower = torch.min(g_lower * relu_grad_lower, g_lower * relu_grad_upper) upper = torch.max(g_upper * relu_grad_lower, g_upper * relu_grad_upper) return lower, upper def bound_backward(self, last_lA, last_uA, g, g_relu, g_relu_rev, preact, **kwargs): mask_active = (preact.lower > 0).float() mask_inactive = (preact.upper < 0).float() mask_unstable = 1 - mask_active - mask_inactive if self.recurjac and self.inputs[0].perturbed: upper_grad = preact.upper >= 0 lower_interval = self.inputs[0].lower * upper_grad upper_interval = self.inputs[0].upper * upper_grad else: lower_interval = upper_interval = None def _bound_oneside(last_A, pos_interval=None, neg_interval=None): if last_A is None: return None, None, None, 0 if isinstance(last_A, torch.Tensor): if self.recurjac and self.inputs[0].perturbed: mask_unstable_grad = ( (self.inputs[0].lower < 0) * (self.inputs[0].upper > 0)) last_A_unstable = last_A * mask_unstable_grad bias = ( last_A_unstable.clamp(min=0) * pos_interval + last_A_unstable.clamp(max=0) * neg_interval) bias = bias.reshape( bias.shape[0], bias.shape[1], -1).sum(dim=-1) last_A = last_A * torch.logical_not(mask_unstable_grad) else: bias = 0 A = last_A * mask_active A_pos = last_A.clamp(min=0) * mask_unstable A_neg = last_A.clamp(max=0) * mask_unstable return A, A_pos, A_neg, bias elif isinstance(last_A, Patches): last_A_patches = last_A.patches if self.recurjac and self.inputs[0].perturbed: mask_unstable_grad = ( (self.inputs[0].lower < 0) * (self.inputs[0].upper > 0)) mask_unstable_grad_unfold = _maybe_unfold( mask_unstable_grad, last_A) last_A_unstable = ( last_A.to_matrix(mask_unstable_grad.shape) * mask_unstable_grad) bias = ( last_A_unstable.clamp(min=0) * pos_interval + last_A_unstable.clamp(max=0) * neg_interval) # FIXME Clean up patches. This implementation does not seem # to support general shapes. 
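# As in the dense-tensor branch above, coefficients whose upstream gradient interval
# straddles zero are concretized into a bias term via interval arithmetic and removed
# from the patches; the remaining coefficients are split into an "active" part and
# clamped "unstable" parts that are propagated through g_relu and g_relu_rev.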
assert bias.ndim == 5 bias = bias.sum(dim=[-1, -2, -3]).view(-1, 1) last_A_patches = ( last_A_patches * torch.logical_not(mask_unstable_grad_unfold)) else: bias = 0 # need to unfold mask_active and mask_unstable # [batch, 1, in_c, in_H, in_W] mask_active_unfold = _maybe_unfold(mask_active, last_A) mask_unstable_unfold = _maybe_unfold(mask_unstable, last_A) # [spec, batch, 1, in_c, in_H, in_W] mask_active_unfold = mask_active_unfold.expand(last_A.shape) mask_unstable_unfold = mask_unstable_unfold.expand(last_A.shape) A = Patches( last_A_patches * mask_active_unfold, last_A.stride, last_A.padding, last_A.shape, last_A.identity, last_A.unstable_idx, last_A.output_shape) A_pos_patches = last_A_patches.clamp(min=0) * mask_unstable_unfold A_neg_patches = last_A_patches.clamp(max=0) * mask_unstable_unfold A_pos = Patches( A_pos_patches, last_A.stride, last_A.padding, last_A.shape, last_A.identity, last_A.unstable_idx, last_A.output_shape) A_neg = Patches( A_neg_patches, last_A.stride, last_A.padding, last_A.shape, last_A.identity, last_A.unstable_idx, last_A.output_shape) return A, A_pos, A_neg, bias lA, lA_pos, lA_neg, lbias = _bound_oneside( last_lA, pos_interval=lower_interval, neg_interval=upper_interval) uA, uA_pos, uA_neg, ubias = _bound_oneside( last_uA, pos_interval=upper_interval, neg_interval=lower_interval) return ( [(lA, uA), (lA_neg, uA_pos), (lA_pos, uA_neg), (None, None)], lbias, ubias) ================================================ FILE: auto_LiRPA/operators/reshape.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### from torch.nn import Module from .base import * from ..patches import Patches, patches_to_matrix from .linear import BoundLinear from .constant import BoundConstant class BoundReshape(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) # It can be set to `view`, so that `view` instead of `reshape` will be used. self.option = options.get('reshape', 'reshape') def forward(self, x, shape): shape = list(shape) for i in range(len(shape)): if shape[i] == -1: shape[i] = prod(x.shape) // int(prod(shape[:i]) * prod(shape[(i + 1):])) self.shape = shape if self.option == 'view': return x.contiguous().view(shape) else: return x.reshape(shape) def bound_backward(self, last_lA, last_uA, x, shape, **kwargs): def _bound_oneside(A): if A is None: return None if type(A) == Patches: # output shape should be [batch, in_c, in_H, in_W] since it's followed by Conv2d assert len(self.output_shape) == 4 if type(self.inputs[0]) == BoundLinear: # Save the shape and it will be converted to matrix in Linear layer. 
return A.create_similar(input_shape=self.output_shape) if A.unstable_idx is None: patches = A.patches # non-sparse: [batch, out_dim, out_c, out_H, out_W, out_dim, in_c, H, W] # [out_dim*out_c, batch, out_H, out_W, out_dim*in_c, H, W] # expected next_A shape [batch, spec, in_c, in_H , in_W]. next_A = patches_to_matrix( pieces=patches, input_shape=self.output_shape, stride=A.stride, padding=A.padding) else: # sparse: [spec, batch, in_c, patch_H, patch_W] (specs depends on the number of unstable neurons). patches = A.patches # expected next_A shape [batch, spec, input_c, in_H, in_W]. next_A = patches_to_matrix( pieces=patches, input_shape=self.output_shape, stride=A.stride, padding=A.padding, output_shape=A.output_shape, unstable_idx=A.unstable_idx) # Reshape it to [spec, batch, *input_shape] (input_shape is the shape before Reshape operation). return next_A.transpose(0, 1).reshape(-1, A.shape[1], *self.input_shape[1:]) else: return A.reshape(A.shape[0], A.shape[1], *self.input_shape[1:]) #FIXME check reshape or view return [(_bound_oneside(last_lA), _bound_oneside(last_uA)), (None, None)], 0, 0 def bound_forward(self, dim_in, x, shape): batch_size = x.lw.shape[0] lw = x.lw.reshape(batch_size, dim_in, *self.shape[1:]) uw = x.uw.reshape(batch_size, dim_in, *self.shape[1:]) lb = x.lb.reshape(batch_size, *self.shape[1:]) ub = x.ub.reshape(batch_size, *self.shape[1:]) return LinearBound(lw, lb, uw, ub) def bound_dynamic_forward(self, x, shape, max_dim=None, offset=0): w = x.lw.reshape(x.lw.shape[0], x.lw.shape[1], *self.shape[1:]) b = x.lb.reshape(x.lb.shape[0], *self.shape[1:]) return LinearBound(w, b, w, b, x_L=x.x_L, x_U=x.x_U, tot_dim=x.tot_dim) def interval_propagate(self, *v): return Interval.make_interval( self.forward(v[0][0], v[1][0]), self.forward(v[0][1], v[1][0]), v[0]) def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): if isinstance(v[0], Tensor): self.solver_vars = self.forward(*v) return gvar_array = np.array(v[0]) gvar_array = gvar_array.reshape(v[1].detach().cpu().numpy())[0] self.solver_vars = gvar_array.tolist() def build_gradient_node(self, grad_upstream): node_grad = ReshapeGrad() grad_input = (grad_upstream, self.inputs[0].forward_value) return [(node_grad, grad_input, [])] class BoundUnsqueeze(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.use_default_ibp = True if 'axes' in attr: self.axes = attr['axes'] assert len(self.axes) == 1 self.axes = self.axes[0] else: self.axes = None def forward(self, *x): data = x[0] if self.axes is not None: axes = self.axes else: axes = x[1].item() self.axes = axes return data.unsqueeze(axes) def bound_backward(self, last_lA, last_uA, *x, **kwargs): if self.axes is not None: axes = self.make_axis_non_negative(self.axes, 'output') else: axes = self.make_axis_non_negative(x[1].value.item(), 'output') if axes == 0: raise ValueError("Unsqueezing with axes == 0 is not allowed") else: def squeeze_A(last_A): if type(last_A) == Patches: return Patches( last_A.patches.squeeze(axes - 5), last_A.stride, last_A.padding, last_A.shape, last_A.identity, last_A.unstable_idx, last_A.output_shape) elif last_A is not None: return last_A.squeeze(axes + 1) else: return None lA = squeeze_A(last_lA) uA = squeeze_A(last_uA) return [(lA, uA), (None, None)], 0, 0 def bound_forward(self, dim_in, *x): axes = self.make_axis_non_negative( self.axes if self.axes is not None else x[1].lb.item(), 'output') x = x[0] if len(self.input_shape) == 0: lw, lb 
= x.lw.unsqueeze(1), x.lb.unsqueeze(0) uw, ub = x.uw.unsqueeze(1), x.ub.unsqueeze(0) else: lw, lb = x.lw.unsqueeze(axes + 1), x.lb.unsqueeze(axes) uw, ub = x.uw.unsqueeze(axes + 1), x.ub.unsqueeze(axes) return LinearBound(lw, lb, uw, ub) def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): self.solver_vars = self.forward(v[0]) def build_gradient_node(self, grad_upstream): axes = self.make_axis_non_negative(self.axes, 'output') if axes == 0: raise ValueError("Unsqueezing with axes == 0 is not allowed") node_grad = UnsqueezeGrad(axes) return [(node_grad, (grad_upstream,), [])] class UnsqueezeGrad(Module): def __init__(self, axes): super().__init__() self.axes = axes def forward(self, grad_last): return grad_last.squeeze(self.axes + 1) class BoundExpand(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.use_default_ibp = True def forward(self, x, y): y = y.clone() assert y.ndim == 1 n, m = x.ndim, y.shape[0] assert n <= m for i in range(n): if y[m - n + i] == 1: y[m - n + i] = x.shape[i] else: assert x.shape[i] == 1 or x.shape[i] == y[m - n + i] return x.expand(*list(y)) def bound_backward(self, last_lA, last_uA, *x, **kwargs): assert not self.is_input_perturbed(1) # Although torch.expand supports prepending dimensions, # bound computation doesn't since we must always keep # the batch dimension at the beginning assert ( len(x[0].output_shape) == len(self.output_shape) ), "BoundExpand with changed ndim is not supported by bound computation" n = len(self.output_shape) def _bound_oneside(A): if A is None: return None dims_to_sum = [i + 1 for i in range(1, n) if x[0].output_shape[i] == 1 and A.shape[i + 1] > 1] return A.sum(dim=dims_to_sum, keepdim=True) if dims_to_sum else A return [(_bound_oneside(last_lA), _bound_oneside(last_uA)), (None, None)], 0, 0 def bound_forward(self, dim_in, *x): # It doesn't support the general Expand operator. # This is just for the Expand operator converted from torch.repeat, and here # it should just be an identical operator. shape = x[1].lb if not (len(x[0].lb.shape) == len(shape) and (shape == 1).all()): raise NotImplementedError("General onnx::Expand is not supported") return x[0] def build_gradient_node(self, grad_upstream): shape = self.inputs[1].forward_value if not (len(self.inputs[0].output_shape) == len(shape) and (shape == 1).all()): raise NotImplementedError("General onnx::Expand is not supported") return [(ExpandGrad(shape), (grad_upstream,), []), None] class ExpandGrad(Module): # It doesn't support the general Expand operator. # This is just for the Expand operator converted from torch.repeat, and here # it should just be an identical operator.
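# For example (hypothetical shapes), an Expand traced from torch.repeat has a target
# shape tensor of all ones: x of shape (1, 3, 4) expanded with [1, 1, 1] is returned
# unchanged, so the gradient module can simply pass the upstream gradient through.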
def __init__(self, shape): super().__init__() self.shape = shape def forward(self, grad_last): return grad_last class BoundSqueeze(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.use_default_ibp = True if 'axes' in attr: self.axes = attr['axes'] assert len(self.axes) == 1 self.axes = self.axes[0] else: self.axes = None def forward(self, *x): data = x[0] if self.axes is not None: axes = self.axes else: axes = x[1].item() return data.squeeze(axes) def bound_backward(self, last_lA, last_uA, *x, **kwargs): if self.axes is not None: axes = self.axes else: axes = self.make_axis_non_negative(x[1].value.item(), 'input') if axes == 0: raise ValueError("Squeezing with axes == 0 is not allowed") return [(last_lA.unsqueeze(axes + 1) if last_lA is not None else None, last_uA.unsqueeze(axes + 1) if last_uA is not None else None), (None, None)], 0, 0 def bound_forward(self, dim_in, *x): if self.axes is not None: axes = self.axes else: axes = self.make_axis_non_negative(x[1].lb.item(), 'input') x = x[0] return LinearBound( x.lw.squeeze(axes + 1), x.lb.squeeze(axes), x.uw.squeeze(axes + 1), x.ub.squeeze(axes) ) def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): self.solver_vars = self.forward(v[0]) class BoundFlatten(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.use_default_ibp = True self.axis = attr['axis'] def forward(self, x): return torch.flatten(x, self.axis) def bound_backward(self, last_lA, last_uA, x, **kwargs): def _bound_oneside(A): if A is None: return None return A.reshape(A.shape[0], A.shape[1], *self.input_shape[1:]) return [(_bound_oneside(last_lA), _bound_oneside(last_uA)), (None, None)], 0, 0 def bound_dynamic_forward(self, x, max_dim=None, offset=0): w = torch.flatten(x.lw, self.axis + 1) b = torch.flatten(x.lb, self.axis) return LinearBound(w, b, w, b, x_L=x.x_L, x_U=x.x_U, tot_dim=x.tot_dim) def bound_forward(self, dim_in, x): self.axis = self.make_axis_non_negative(self.axis) assert self.axis > 0 return LinearBound( torch.flatten(x.lw, self.axis + 1), torch.flatten(x.lb, self.axis), torch.flatten(x.uw, self.axis + 1), torch.flatten(x.ub, self.axis), ) def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): # e.g., v[0] input shape (16, 8, 8) => output shape (1024,) self.solver_vars = np.array(v[0]).reshape(-1).tolist() model.update() def build_gradient_node(self, grad_upstream): node_grad = ReshapeGrad() grad_input = (grad_upstream, self.inputs[0].forward_value) return [(node_grad, grad_input, [])] class BoundATenUnflatten(BoundReshape): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) def forward(self, x, dim, sizes): self.dim = dim.item() self.sizes = sizes.tolist() fval = torch.unflatten(x, self.dim, self.sizes) self.shape = fval.shape return fval def bound_backward(self, last_lA, last_uA, *x, **kwargs): A, lbias, ubias = super().bound_backward(last_lA, last_uA, x[0], shape=None, kwargs=kwargs) # One more input for Unflatten A.append((None, None)) return A, lbias, ubias def bound_forward(self, dim_in, *x): return super().bound_forward(dim_in=dim_in, x=x[0], shape=None) def bound_dynamic_forward(self, *x, max_dim=None, offset=0): return super().bound_dynamic_forward(x=x[0], shape=None, max_dim=max_dim, offset=offset) def interval_propagate(self, x, dim, sizes): 
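# IBP through Unflatten only reshapes the two endpoint tensors; dim and sizes are
# constants. For example (hypothetical shapes), bounds of shape (B, 6) unflattened at
# dim 1 with sizes [2, 3] become bounds of shape (B, 2, 3), element for element.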
return Interval.make_interval( self.forward(x[0], dim[0], sizes[0]), self.forward(x[1], dim[0], sizes[0]), x) def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): shape = torch.tensor(v[0].shape[0], *self.shape[1:]) return super().build_solver((v[0], shape), model=model, C=C, model_type=model_type, solver_pkg=solver_pkg) class ReshapeGrad(Module): def forward(self, grad_last, inp): if grad_last.numel() == inp.numel(): return grad_last.reshape(grad_last.shape[0], *inp.shape[1:]) else: return grad_last.reshape(*grad_last.shape[:2], *inp.shape[1:]) class BoundTranspose(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.perm = attr['perm'] self.perm_inv_inc_one = [-1] * (len(self.perm) + 1) self.perm_inv_inc_one[0] = 0 for i in range(len(self.perm)): self.perm_inv_inc_one[self.perm[i] + 1] = i + 1 self.use_default_ibp = True self.ibp_intermediate = True def forward(self, x): return x.permute(*self.perm) def bound_backward(self, last_lA, last_uA, x, **kwargs): def _bound_oneside(last_A): if last_A is None: return None return last_A.permute(self.perm_inv_inc_one) return [(_bound_oneside(last_lA), _bound_oneside(last_uA))], 0, 0 def bound_forward(self, dim_in, x): if self.input_shape[0] != 1: perm = [0] + [(p + 1) for p in self.perm] else: assert (self.perm[0] == 0) perm = [0, 1] + [(p + 1) for p in self.perm[1:]] lw, lb = x.lw.permute(*perm), x.lb.permute(self.perm) uw, ub = x.uw.permute(*perm), x.ub.permute(self.perm) return LinearBound(lw, lb, uw, ub) def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): self.solver_vars = self.forward(*v) def build_gradient_node(self, grad_upstream): node_grad = TransposeGrad(self.perm_inv_inc_one) grad_input = (grad_upstream,) return [(node_grad, grad_input, [])] class TransposeGrad(Module): def __init__(self, perm_inv): super().__init__() self.perm_inv = perm_inv def forward(self, grad_last): return grad_last.permute(*self.perm_inv) ================================================ FILE: auto_LiRPA/operators/resize.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """ Resize operator """ import torch from .base import * import numpy as np from .solver_utils import grb from ..patches import unify_shape, create_valid_mask, is_shape_used class BoundResize(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) # only support nearest mode for now assert attr["mode"] == "nearest" self.mode = attr["mode"] self.scale_factor = None def forward(self, x, size=None, scale_factor=None): # currently, forwarding size is not supported. assert isinstance(size, torch.Tensor) and len(size.tolist()) == 0 # currently, only support enlarge tensor size by an integer factor. 
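# For example (hypothetical values), a 4-D NCHW input with scale_factor [1, 1, 2, 2]
# doubles H and W by nearest-neighbor upsampling, equivalent to
# F.interpolate(x, scale_factor=(2, 2), mode='nearest'). Non-integer factors, scaling
# on the batch/channel dims, or an explicit target size are rejected by the asserts below.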
assert len(scale_factor.tolist()) == 4 and np.array([tmp.is_integer() and tmp > 0 for tmp in scale_factor.tolist()]).all() assert (scale_factor[0:2].to(torch.long) == 1).all(), 'only support resize on the H and W dim' self.scale_factor = tuple([int(tmp) for tmp in scale_factor][2:]) if x.ndim == 4: final = F.interpolate( x, None, self.scale_factor, mode=self.mode) else: raise NotImplementedError( "Interpolation in 3D or interpolation with parameter size has not been implmented.") return final def interval_propagate(self, *v): l, u = zip(*v) return Interval.make_interval(self.forward(*l), self.forward(*u), v[0]) def bound_forward(self, dim_in, *inp): x = inp[0] lw, lb, uw, ub = x.lw, x.lb, x.uw, x.ub new_lw, new_lb, new_uw, new_ub = \ torch.nn.functional.upsample(lw, scale_factor=([1] * (lw.ndim - 4)) + list(self.scale_factor), mode=self.mode), \ torch.nn.functional.upsample(lb, scale_factor=([1] * (lb.ndim - 4)) + list(self.scale_factor), mode=self.mode), \ torch.nn.functional.upsample(uw, scale_factor=([1] * (uw.ndim - 4)) + list(self.scale_factor), mode=self.mode), \ torch.nn.functional.upsample(ub, scale_factor=([1] * (ub.ndim - 4)) + list(self.scale_factor), mode=self.mode) return LinearBound( lw = new_lw, lb = new_lb, uw = new_uw, ub = new_ub) def bound_backward(self, last_lA, last_uA, *x, **kwargs): def _bound_oneside(last_A): if last_A is None: return None assert type(last_A) is Patches or last_A.ndim == 5 # in case the kernel size cannot be divided by scale_factor, we round up the shape split_shape = tuple((torch.tensor( last_A.shape)[-2:] / torch.tensor(self.scale_factor)).ceil().to(torch.long).tolist()) new_shape = last_A.shape[:-2] + split_shape if not type(last_A) is Patches: # classical mode is simple to handle by # sum the grid elements by using avg_pool2d with divisor_override=1 return torch.nn.functional.avg_pool2d( last_A.reshape(-1, *last_A.shape[-2:]), kernel_size=self.scale_factor, stride=self.scale_factor, divisor_override=1).reshape(new_shape) else: # for patches mode assert type(last_A) is Patches assert self.scale_factor[0] == self.scale_factor[1] if self.scale_factor[0] == 1: # identity upsampling return last_A if isinstance(last_A.padding, int) and last_A.padding % self.scale_factor[0] == 0 and last_A.stride % self.scale_factor[0] == 0 and last_A.inserted_zeros == 0: # an easy case where patch sliding windows coincides with the nearest sampling scaling windows # in this case, we divide each patch to size of scale_factor sub-matrices, # and sum up each sub-matrices respectively # print(last_A.shape) padding = last_A.shape[-1] % self.scale_factor[-1] new_patches = torch.nn.functional.pad(last_A.patches, (0, padding, 0, padding)) new_patches = torch.nn.functional.avg_pool2d( new_patches.reshape(-1, *new_patches.shape[-2:]), kernel_size=self.scale_factor, stride=self.scale_factor, divisor_override=1).reshape(new_shape) return last_A.create_similar(patches=new_patches, stride=last_A.stride//self.scale_factor[0], padding=last_A.padding//self.scale_factor[0], ) else: """ The following part is created and mainly maintained by Linyi Time complexity = O(A.numel * scale_factor + outH * kerH + outW * kerW + A.numel * kerH * kerW) With Python loop complexity = O(outH + outW + kerH * kerW * scale_factor^2) """ # preparation: unify shape if last_A.padding: padding = unify_shape(last_A.padding) else: padding = (0,0,0,0) # padding = (left, right, top, bottom) if last_A.output_padding: output_padding = unify_shape(last_A.output_padding) else: output_padding = (0,0,0,0) # 
output_padding = (left, right, top, bottom) """ Step 0: filter out valid entries that maps to real cells of input Like with inserted zeros = 2, [x 0 0 x 0 0 x]. Only "x" cells are kept Borrowed from one_d generation from Conv patches """ one_d_unfolded_r = create_valid_mask(self.output_shape, last_A.patches.device, last_A.patches.dtype, last_A.patches.shape[-2:], last_A.stride, last_A.inserted_zeros, last_A.padding, last_A.output_padding, last_A.unstable_idx) patches = last_A.patches * one_d_unfolded_r """ Step 1: compute the coordinate mapping from patch coordinates to input coordinates Time complexity: O(outH + outW) note: last_A shape is [outC, batch, outH, outW, inC, kerH, kerW] We create H_idx_map and W_idx_map of shape [outH] and [outW] respectively, recording the start idx of row/column for patches at position [.,.,.,.,.,i,j] in H_idx_map[i] and W_idx_map[j] """ ker_size_h, ker_size_w = last_A.shape[-2], last_A.shape[-1] if last_A.unstable_idx is None: # we can get the real output H and W from shape[2] and shape [3] out_h, out_w = last_A.shape[2], last_A.shape[3] else: # it seems to be stored in output_shape out_h, out_w = last_A.output_shape[-2], last_A.output_shape[-1] h_idx_map = torch.arange(0, out_h) * last_A.stride - padding[-2] + output_padding[-2] * last_A.stride h_idx_map = h_idx_map.to(last_A.device) w_idx_map = torch.arange(0, out_w) * last_A.stride - padding[-4] + output_padding[-4] * last_A.stride w_idx_map = w_idx_map.to(last_A.device) r""" Step 2: compute the compressed patches Time complexity: O(outH * kerH + outW * kerW + A.numel * kerH * kerW) Upsampling needs to sum up A cells in scale_factor * scale_factor sub-blocks Example: when scale factor is 2 [ a b c d e f g h ---\ [ a+b+e+f c+d+g+h i j k l ---/ i+j+m+n k+l+o+p] m n o p] In patches mode, we need to sum up cells in each patch accordingly. The summing mechanism could change at different locations. For each spatial dimension, we create a binary sum_mask tensor [outH, ker_size_h, new_ker_size_h] to select the cells to sum up Example: For [a b c d] -> [a+b c+d], with 3x3 patch covering [0..2] and [2..4]. 
The first patch needs to sum to [a+b c]; the second patch needs to sum to [b c+d] So we have sum_mask [ for patch 1: [[1, 1, 0], (first entry sums up index 0 and 1) [0, 0, 1]]^T, (second entry sums up index 2) for patch 2: [[1, 0, 0], (first entry sums up index 0) [0, 1, 1]]^T (second entry sums up index 1 and 2) ] With the mask, we can now compute the new patches with einsum: [outC, batch, outH, outW, inC, kerH, kerW] * [outH, kerH, new_kerH] -> [outC, batch, outH, outW, inC, new_kerH, kerW] """ tot_scale_fac = ((last_A.inserted_zeros + 1) * self.scale_factor[0], (last_A.inserted_zeros + 1) * self.scale_factor[1]) new_ker_size_h, new_ker_size_w = \ (tot_scale_fac[0] + ker_size_h - 2) // tot_scale_fac[0] + 1, \ (tot_scale_fac[1] + ker_size_w - 2) // tot_scale_fac[1] + 1 min_h_idx, max_h_idx = h_idx_map[0], h_idx_map[-1] + ker_size_h shrank_h_idx = (torch.arange(min_h_idx, max_h_idx) + last_A.inserted_zeros).div(tot_scale_fac[0], rounding_mode='floor') if last_A.unstable_idx is None: # with nonsparse index, create full-sized sum musk for rows ker_h_indexer = torch.arange(0, ker_size_h).to(last_A.device) sum_mask_h = torch.zeros(last_A.shape[2], ker_size_h, new_ker_size_h).to(last_A.device) for i in range(last_A.shape[2]): sum_mask_h[i, ker_h_indexer, \ shrank_h_idx[h_idx_map[i] - min_h_idx: h_idx_map[i] - min_h_idx + ker_size_h] - shrank_h_idx[h_idx_map[i] - min_h_idx]] = 1 # set zero to those in padding area padding_place_mask = (ker_h_indexer + h_idx_map[i] < 0) sum_mask_h[i, padding_place_mask] = 0 else: # with sparse index, create sparse sum musk sum_mask_h = torch.zeros(last_A.shape[0], ker_size_h, new_ker_size_h).to(last_A.device) row_nos = last_A.unstable_idx[1] unstable_loc_indexer = torch.arange(0, row_nos.shape[0]).to(last_A.device) for k in range(ker_size_h): place_in_new_ker = shrank_h_idx[h_idx_map[row_nos] - min_h_idx + k] - shrank_h_idx[h_idx_map[row_nos] - min_h_idx] sum_mask_h[unstable_loc_indexer, k, place_in_new_ker] = 1 # set zero to those in padding area padding_place_mask = (h_idx_map[row_nos] + k < 0) sum_mask_h[padding_place_mask, k] = 0 min_w_idx, max_w_idx = w_idx_map[0], w_idx_map[-1] + ker_size_w shrank_w_idx = (torch.arange(min_w_idx, max_w_idx) + last_A.inserted_zeros).div(tot_scale_fac[1], rounding_mode='floor') if last_A.unstable_idx is None: # with nonsparse index, create full-sized sum musk for columns ker_w_indexer = torch.arange(0, ker_size_w).to(last_A.device) sum_mask_w = torch.zeros(last_A.shape[3], ker_size_w, new_ker_size_w).to(last_A.device) for i in range(last_A.shape[3]): sum_mask_w[i, ker_w_indexer, \ shrank_w_idx[w_idx_map[i] - min_w_idx: w_idx_map[i] - min_w_idx + ker_size_w] - shrank_w_idx[w_idx_map[i] - min_w_idx]] = 1 # set zero to those in padding area padding_place_mask = (ker_w_indexer + w_idx_map[i] < 0) sum_mask_w[i, padding_place_mask] = 0 else: # with sparse index, create sparse sum musk sum_mask_w = torch.zeros(last_A.shape[0], ker_size_w, new_ker_size_w).to(last_A.device) col_nos = last_A.unstable_idx[2] unstable_loc_indexer = torch.arange(0, col_nos.shape[0]).to(last_A.device) for k in range(ker_size_w): place_in_new_ker = shrank_w_idx[w_idx_map[col_nos] - min_w_idx + k] - shrank_w_idx[w_idx_map[col_nos] - min_w_idx] sum_mask_w[unstable_loc_indexer, k, place_in_new_ker] = 1 # set zero to those in padding area padding_place_mask = (w_idx_map[col_nos] + k < 0) sum_mask_w[padding_place_mask, k] = 0 if last_A.unstable_idx is None: # nonsparse aggregation new_patches = torch.einsum("ObhwIij,hix,wjy->ObhwIxy", patches, sum_mask_h, 
sum_mask_w) else: # sparse aggregation new_patches = torch.einsum("NbIij,Nix,Njy->NbIxy", patches, sum_mask_h, sum_mask_w) """ Step 3: broadcasting the new_patches by repeating elements, since later we would need to apply insert_zeros For example, scale_factor = 3, repeat patch [a,b] to [a,a,a,b,b,b] Time complexity: O(A.numel * scale_factor) """ ext_new_ker_size_h, ext_new_ker_size_w = \ new_ker_size_h * tot_scale_fac[0], new_ker_size_w * tot_scale_fac[1] ext_new_patches = torch.zeros(list(new_patches.shape[:-2]) + [ext_new_ker_size_h, ext_new_ker_size_w], device=new_patches.device) for i in range(ext_new_ker_size_h): for j in range(ext_new_ker_size_w): ext_new_patches[..., i, j] = new_patches[..., i // tot_scale_fac[0], j // tot_scale_fac[1]] """ Step 4: compute new padding, stride, shape, insert_zeros, and output_padding """ # stride should be the same after upsampling, stride is an integer # new_stride = last_A.stride # padding can change much, the beginning should extend by (scale - 1) entries, # the ending should extend by (ext_new_ker_size - ker_size) entries # padding = (left, right, top, bottom) new_padding = (padding[0] + (self.scale_factor[1] - 1) * (last_A.inserted_zeros + 1), padding[1] + ext_new_ker_size_w - ker_size_w, padding[2] + (self.scale_factor[0] - 1) * (last_A.inserted_zeros + 1), padding[3] + ext_new_ker_size_h - ker_size_h) if new_padding[0] == new_padding[1] and new_padding[1] == new_padding[2] and new_padding[2] == new_padding[3]: # simplify to an int new_padding = new_padding[0] # only support uniform scaling on H and W now, i.e., self.scale_factor[0] == self.scale_factor[1] inserted_zeros = tot_scale_fac[0] - 1 # output padding seems not to change # new_output_padding = last_A.output_padding """ Package and create """ # sparse tensor doesn't support einsum which is necessary for subsequent computes, so deprecated # if inserted_zeros >= 3: # # mask unused cells # input_shape = list(self.output_shape) # input_shape[-2], input_shape[-1] = input_shape[-2] // self.scale_factor[-2], \ # input_shape[-1] // self.scale_factor[-1] # one_unfolded = create_valid_mask(input_shape, ext_new_patches.device, # ext_new_patches.dtype, ext_new_patches.shape[-2:], # last_A.stride, inserted_zeros, new_padding, # last_A.output_padding, # last_A.unstable_idx if last_A.unstable_idx else None) # ext_new_patches = (ext_new_patches * one_unfolded).to_sparse() # print the shape change after upsampling, if needed # print(f'After upsampling, ' # f'{last_A.patches.shape} (pad={padding}, iz={last_A.inserted_zeros}, s={last_A.stride}) -> ' # f'{ext_new_patches.shape} (pad={new_padding}, iz={inserted_zeros}, s={last_A.stride})') ret_patches_A = last_A.create_similar(patches=ext_new_patches, padding=new_padding, inserted_zeros=inserted_zeros) if self.input_shape[-2] < ret_patches_A.shape[-2] and self.input_shape[-1] < ret_patches_A.shape[-2] \ and not is_shape_used(ret_patches_A.output_padding): # using matrix mode could be more memory efficient ret_matrix_A = ret_patches_A.to_matrix(self.input_shape) # print(f'After upsampling, to_matrix: {ret_matrix_A.shape}') ret_matrix_A = ret_matrix_A.transpose(0, 1) return ret_matrix_A else: return ret_patches_A last_lA = _bound_oneside(last_lA) last_uA = _bound_oneside(last_uA) return [(last_lA, last_uA), (None, None), (None, None)], 0, 0 ================================================ FILE: auto_LiRPA/operators/rnn.py ================================================ ######################################################################### ## This file is 
part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """RNN.""" from .base import * class BoundRNN(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.complex = True self.output_index = output_index raise NotImplementedError( 'torch.nn.RNN is not supported at this time.' 'Please implement your RNN with torch.nn.RNNCell and a manual for-loop.' 'See an example of LSTM:' 'https://github.com/Verified-Intelligence/auto_LiRPA/blob/10a9b30/examples/sequence/lstm.py#L9') def forward(self, x, weight_input, weight_recurrent, bias, sequence_length, initial_h): assert (torch.sum(torch.abs(initial_h)) == 0) self.input_size = x.shape[-1] self.hidden_size = weight_input.shape[-2] class BoundRNNImpl(nn.Module): def __init__(self, input_size, hidden_size, weight_input, weight_recurrent, bias, output_index): super().__init__() self.input_size = input_size self.hidden_size = hidden_size self.cell = torch.nn.RNNCell( input_size=input_size, hidden_size=hidden_size ) self.cell.weight_ih.data.copy_(weight_input.squeeze(0).data) self.cell.weight_hh.data.copy_(weight_recurrent.squeeze(0).data) self.cell.bias_ih.data.copy_((bias.squeeze(0))[:hidden_size].data) self.cell.bias_hh.data.copy_((bias.squeeze(0))[hidden_size:].data) self.output_index = output_index def forward(self, x, hidden): length = x.shape[0] outputs = [] for i in range(length): hidden = self.cell(x[i, :], hidden) outputs.append(hidden.unsqueeze(0)) outputs = torch.cat(outputs, dim=0) if self.output_index == 0: return outputs else: return hidden self.model = BoundRNNImpl( self.input_size, self.hidden_size, weight_input, weight_recurrent, bias, self.output_index) self.input = (x, initial_h) return self.model(*self.input) ================================================ FILE: auto_LiRPA/operators/s_shaped.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """S-shaped base class, activation functions, and relevant ops.""" import torch from torch.nn import Module from torch.autograd import Function from .base import * from .activation_base import BoundOptimizableActivation class BoundSShaped(BoundOptimizableActivation): """ Base class for computing output bounds of globally and partially s-shaped nonlinear functions (e.g., sigmoid, tanh, sin, cos) over given input intervals. 
""" def __init__(self, attr=None, inputs=None, output_index=0, options=None, activation=(None, None, None), precompute=False): super().__init__(attr, inputs, output_index, options) if options is None: options = {} self.splittable = True self.inverse_s_shape = False self.ibp_intermediate = True self.activation = activation self.activation_name = activation[0] self.act_func = activation[1] self.d_act_func = activation[2] self.step_pre = 0.01 if precompute: self.precompute_relaxation(self.act_func, self.d_act_func) self.precompute_dfunc_values(self.act_func, self.d_act_func) # TODO make them configurable when implementing a general nonlinear activation. # Neurons whose gap between pre-activation bounds is smaller than this # threshold will be masked and don't need branching. self.split_min_gap = 1e-2 # 1e-4 # Neurons whose pre-activation bounds don't overlap with this range # are considered as stable (with values either 0 or 1) and don't need # branching. self.split_range = (self.range_l, self.range_u) # The initialization will be adjusted if the pre-activation bounds are too loose. self.loose_threshold = options.get(self.activation_name, {}).get( 'loose_threshold', None) self.convex_concave = None self.activation_bound_option = options.get('activation_bound_option', 'adaptive') self.inflections = [0.] self.extremes = [] self.sigmoid_like_mask = None # FIXME: Smoothness enhancement for s-shaped functions should be enabled by default. # This enhancement makes the linear bounds change smoothly between different cases. # We provide this option only to reproduce results from previous papers. self.disable_smoothness_enhancement = options.get( 's_shaped_disable_smoothness_enhancement', False) def opt_init(self): super().opt_init() self.tp_both_lower_init = {} self.tp_both_upper_init = {} def branch_input_domain(self, lb, ub): # For functions that are only partially s-shaped, such as sin and cos, the non-s-shaped intervals are identified # and masked here. sigmoid_like_mask marks the strictly s-shaped intervals, and branch_mask marks the non-s- # shaped ones. For globally s-shaped functions like tanh and sigmoid, sigmoid_like_mask stores all 1s and # branch_mask stores all 0s. self.sigmoid_like_mask = torch.ones_like(lb, dtype=torch.bool) self.branch_mask = torch.zeros_like(lb, dtype=torch.bool) def _init_opt_parameters_impl(self, size_spec, name_start, num_params=10): """Implementation of init_opt_parameters for each start_node.""" l, u = self.inputs[0].lower, self.inputs[0].upper shape = l.shape # Alpha dimension is (num_params, output_shape, batch, *shape) for the s-shaped activation function. alpha = torch.empty(num_params, size_spec, *shape, device=l.device) alpha.data[:4] = (l + u) / 2 alpha.data[4:6] = self.tp_both_lower_init[name_start] alpha.data[6:8] = self.tp_both_upper_init[name_start] if num_params > 8: alpha.data[8:] = 0 return alpha @torch.no_grad() def precompute_relaxation(self, func, dfunc, x_limit=500): """ This function precomputes the tangent lines that will be used as lower/upper bounds for S-shaped functions centered at 0 along the x-axis. """ self.x_limit = x_limit self.num_points_pre = int(self.x_limit / self.step_pre) max_iter = 100 logger.debug('Precomputing relaxation for %s (pre-activation limit: %f)', self.__class__.__name__, x_limit) def check_lower(upper, d): """Given two points upper, d (d <= upper), check if the slope at d will be less than f(upper) at upper.""" k = dfunc(d) # Return True if the slope is a lower bound. 
return k * (upper - d) + func(d) <= func(upper) def check_upper(lower, d): """Given two points lower, d (d >= lower), check if the slope at d will be greater than f(lower) at lower.""" k = dfunc(d) # Return True if the slope is a upper bound. return k * (lower - d) + func(d) >= func(lower) # Given an upper bound point (>=0), find a line that is guaranteed to be a lower bound of this function. upper = self.step_pre * torch.arange(0, self.num_points_pre + 5, device=self.device) r = torch.zeros_like(upper) # Initial guess, the tangent line is at -1. l = -torch.ones_like(upper) while True: # Check if the tangent line at the guessed point is an lower bound at f(upper). checked = check_lower(upper, l).int() # If the initial guess is not smaller enough, then double it (-2, -4, etc). l = checked * l + (1 - checked) * (l * 2) if checked.sum() == l.numel(): break # Now we have starting point at l, its tangent line is guaranteed to be an lower bound at f(upper). # We want to further tighten this bound by moving it closer to 0. for _ in range(max_iter): # Binary search. m = (l + r) / 2 checked = check_lower(upper, m).int() l = checked * m + (1 - checked) * l r = checked * r + (1 - checked) * m # At upper, a line with slope l is guaranteed to lower bound the function. self.d_lower = l.clone() # Do the same again: # Given an lower bound point (<=0), find a line that is guaranteed to be an upper bound of this function. lower = -self.step_pre * torch.arange(0, self.num_points_pre + 5, device=self.device) l = torch.zeros_like(upper) r = torch.ones_like(upper) while True: checked = check_upper(lower, r).int() r = checked * r + (1 - checked) * (r * 2) if checked.sum() == l.numel(): break for _ in range(max_iter): m = (l + r) / 2 checked = check_upper(lower, m).int() l = (1 - checked) * m + checked * l r = (1 - checked) * r + checked * m self.d_upper = r.clone() logger.debug('Done') def precompute_dfunc_values(self, func, dfunc, x_limit=500): """ This function precomputes a list of values for dfunc. """ upper = self.step_pre * torch.arange(0, self.num_points_pre + 5, device=self.device) self.dfunc_values = dfunc(upper) def forward(self, x): return self.act_func(x) def retrieve_from_precompute(self, precomputed_d, input_bound, default_d): """ precomputed_d: The precomputed tangent points. input_bound: The input bound of the function. default_d: If input bound goes out of precompute range, we will use default_d. All of the inputs should share the same shape. """ # divide input bound into number of steps to the inflection point (at x=0) index = torch.max( torch.zeros(input_bound.numel(), dtype=torch.long, device=input_bound.device), (input_bound / self.step_pre).to(torch.long).reshape(-1) ) + 1 # If precompute range is smaller than input, tangent points will be taken from default. # The default value should be a guaranteed bound if index.max() >= precomputed_d.numel(): warnings.warn(f'Pre-activation bounds are too loose for {self}') return torch.where( (index < precomputed_d.numel()).view(input_bound.shape), torch.index_select( precomputed_d, 0, index.clamp(max=precomputed_d.numel() - 1) ).view(input_bound.shape), default_d, ).view(input_bound.shape) else: return torch.index_select(precomputed_d, 0, index).view(input_bound.shape) def generate_d_lower_upper(self, lower, upper): # Indices of neurons with input upper bound >=0, whose optimal slope to # lower bound the function was pre-computed. # Note that for neurons with also input lower bound >=0, # they will be masked later. 
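# The lookup below maps the input upper bound to a table index (derived from
# upper / step_pre) and returns the precomputed tangent point d_lower <= 0; d_upper is
# obtained symmetrically from -lower. If a bound falls outside the precomputed range
# (x_limit, 500 by default), the corresponding interval endpoint is used as a safe fallback.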
d_lower = self.retrieve_from_precompute(self.d_lower, upper, lower) # Indices of neurons with lower bound <=0, whose optimal slope to upper # bound the function was pre-computed. d_upper = self.retrieve_from_precompute(self.d_upper, -lower, upper) return d_lower, d_upper def retrieve_d_from_k(self, k, func): d_indices = torch.searchsorted(torch.flip(self.dfunc_values, [0]), k, right=False) d_indices = self.num_points_pre - d_indices + 4 d_left = d_indices * self.step_pre d_right = d_left + self.step_pre y_left = func(d_left) y_right = func(d_right) k_left = self.dfunc_values[d_indices] k_right = self.dfunc_values[torch.clamp(d_indices+1, max=self.dfunc_values.shape[0]-1)] # We choose the intersection of two tangent lines d_return = (k_left * d_left - k_right * d_right - y_left + y_right) / (k_left - k_right).clamp(min=1e-8) mask_almost_the_same = abs(k_left - k_right) < 1e-5 d_return[mask_almost_the_same] = d_left[mask_almost_the_same] y_d = k_left * (d_return - d_left) + y_left return d_return, y_d def bound_relax_impl_same_slope(self, x, func, dfunc): lower, upper = x.lower, x.upper y_l, y_u = func(lower), func(upper) # k_direct is the slope of the line directly connect (lower, func(lower)), (upper, func(upper)). k_direct = k = (y_u - y_l) / (upper - lower).clamp(min=1e-8) mask_almost_the_same = abs(upper - lower) < 1e-4 k_direct[mask_almost_the_same] = dfunc(lower)[mask_almost_the_same] mask_direct_lower = k_direct <= dfunc(lower) mask_direct_upper = k_direct <= dfunc(upper) # We now find the tangent line with the same slope of k_direct # In the case of "mask_direct_lower(or upper)", there should be only one possible tangent point # at which we obtain the same slope within the interval [lower, upper] d, y_d = self.retrieve_d_from_k(k_direct, func) d[lower + upper < 0] *= -1 # This is the case "direct upper" y_d[lower + upper < 0] = 2 * func(torch.tensor(0)) - y_d[lower + upper < 0] d_clamped = torch.clamp(d, min=lower, max=upper) y_d[d_clamped != d] = func(d_clamped[d_clamped != d]) self.add_linear_relaxation( mask=mask_direct_lower, type='lower', k=k_direct, x0=lower, y0=y_l ) self.add_linear_relaxation( mask=mask_direct_lower, type='upper', k=k_direct, x0=d_clamped, y0=y_d ) self.add_linear_relaxation( mask=mask_direct_upper, type='upper', k=k_direct, x0=upper, y0=y_u ) self.add_linear_relaxation( mask=mask_direct_upper, type='lower', k=k_direct, x0=d_clamped, y0=y_d ) # Now we turn to the case where no direct line can be used d_lower, d_upper = self.generate_d_lower_upper(lower, upper) mask_both = torch.logical_not(mask_direct_upper + mask_direct_lower) # To make sure upper and lower bounds have the same slope, # we need the two tangents to be symmetrical d_same_slope = torch.max(torch.abs(d_lower), torch.abs(d_upper)) k = dfunc(d_same_slope) y_d_same_slope = func(d_same_slope) y_d_same_slope_opposite = 2*func(torch.tensor(0)) - y_d_same_slope self.add_linear_relaxation( mask=mask_both, type='upper', k=k, x0=d_same_slope, y0=y_d_same_slope ) self.add_linear_relaxation( mask=mask_both, type='lower', k=k, x0=-d_same_slope, y0=y_d_same_slope_opposite ) def bound_relax_impl(self, x, func, dfunc): lower, upper = x.lower, x.upper y_l, y_u = func(lower), func(upper) # k_direct is the slope of the line directly connecting the two endpoints of the function inside the interval: # (lower, func(lower)) and (upper, func(upper)). k_direct = k = (y_u - y_l) / (upper - lower).clamp(min=1e-8) # Fixed bounds that cannot be optimized. 
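# As a concrete example (hypothetical bounds), for sigmoid on [1.0, 3.0] the chord slope
# is k_direct = (sigmoid(3) - sigmoid(1)) / 2 ~ (0.953 - 0.731) / 2 ~ 0.111; the whole
# interval lies in the concave region, so this chord is a valid linear lower bound and a
# tangent line is used for the upper bound.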
# self.mask_neg are the masks for neurons with upper bound <= 0, i.e., the whole input interval lies below 0. # self.mask_pos are the masks for neurons with lower bound >= 0, i.e., the whole input interval lies above 0. # For negative intervals, we can derive the linear upper bound by connecting the two endpoints, # i.e., starting from (lower, func(lower)) and setting the slope to k_direct. self.add_linear_relaxation( mask=self.mask_neg, type='upper', k=k_direct, x0=lower, y0=y_l) # For positive intervals, we connect the two endpoints to find the linear lower bound instead. self.add_linear_relaxation( mask=self.mask_pos, type='lower', k=k_direct, x0=lower, y0=y_l) # Store the x-coordinates of the points of tangencies. # d_lower is the closest value to upper such that the tangent line at (d_lower, func(d_lower)) still lower- # bounds the function in interval (lower, upper). # d_upper is the closest value to lower such that the tangent line at (d_lower, func(d_lower)) still upper- # bounds the function in interval (lower, upper). # d_lower and d_upper can be regarded as the default points of tangencies to draw linear bounds through. d_lower, d_upper = self.generate_d_lower_upper(lower, upper) # self.mask_both is the masks for neurons where lower < 0 < upper, i.e., the input interval contains 0. # mask_direct_lower is the masks for neurons whose input interval contains zero and whose linear lower bound can # be derived by connecting the two endpoints. # mask_direct_upper is the masks for neurons whose input interval contains zero and whose linear upper bound can # be derived by connecting the two endpoints. if self.convex_concave is None: mask_direct_lower = k_direct < dfunc(lower) mask_direct_upper = k_direct < dfunc(upper) else: mask_direct_lower = torch.where( self.convex_concave, k_direct < dfunc(lower), k_direct > dfunc(upper)) mask_direct_upper = torch.where( self.convex_concave, k_direct < dfunc(upper), k_direct > dfunc(lower)) mask_direct_lower = torch.logical_and(mask_direct_lower, self.mask_both) mask_direct_upper = torch.logical_and(mask_direct_upper, self.mask_both) if self.opt_stage in ['opt', 'reuse']: if not hasattr(self, 'alpha'): # Raise an error if alpha is not created. self._no_bound_parameters() ns = self._start # Clamping is done here rather than after `opt.step()` call # because it depends on pre-activation bounds self.alpha[ns].data[0:2] = torch.max( torch.min(self.alpha[ns][0:2], upper), lower) self.alpha[ns].data[2:4] = torch.max( torch.min(self.alpha[ns][2:4], upper), lower) if self.convex_concave is None: self.alpha[ns].data[4:6] = torch.min( self.alpha[ns][4:6], d_lower) self.alpha[ns].data[6:8] = torch.max( self.alpha[ns][6:8], d_upper) else: self.alpha[ns].data[4:6, :] = torch.where( self.convex_concave, torch.max(lower, torch.min(self.alpha[ns][4:6, :], d_lower)), torch.min(upper, torch.max(self.alpha[ns][4:6, :], d_lower)) ) self.alpha[ns].data[6:8, :] = torch.where( self.convex_concave, torch.min(upper, torch.max(self.alpha[ns][6:8, :], d_upper)), torch.max(lower, torch.min(self.alpha[ns][6:8, :], d_upper)) ) # shape [2, out_c, n, c, h, w]. tp_pos = self.alpha[ns][0:2] # For upper bound relaxation tp_neg = self.alpha[ns][2:4] # For lower bound relaxation tp_both_lower = self.alpha[ns][4:6] tp_both_upper = self.alpha[ns][6:8] # No need to use tangent line, when the tangent point is at the left # side of the preactivation lower bound. Simply connect the two sides. 
self.add_linear_relaxation( mask=mask_direct_lower, type='lower', k=k_direct, x0=lower, y0=y_l) self.add_linear_relaxation( mask=torch.logical_xor(self.mask_both, mask_direct_lower), type='lower', k=dfunc(tp_both_lower), x0=tp_both_lower, y0=func(tp_both_lower)) self.add_linear_relaxation( mask=mask_direct_upper, type='upper', k=k_direct, x0=lower, y0=y_l) self.add_linear_relaxation( mask=torch.logical_xor(self.mask_both, mask_direct_upper), type='upper', k=dfunc(tp_both_upper), x0=tp_both_upper, y0=func(tp_both_upper)) self.add_linear_relaxation( mask=self.mask_neg, type='lower', k=dfunc(tp_neg), x0=tp_neg, y0=func(tp_neg)) self.add_linear_relaxation( mask=self.mask_pos, type='upper', k=dfunc(tp_pos), x0=tp_pos, y0=func(tp_pos)) else: if self.opt_stage == 'init': # Initialize optimizable slope. tp_both_lower_init = d_lower.detach() tp_both_upper_init = d_upper.detach() if self.loose_threshold is not None: # We will modify d_lower and d_upper inplace. # So make a copy for these two. tp_both_lower_init = tp_both_lower_init.clone() tp_both_upper_init = tp_both_upper_init.clone() # A different initialization if the pre-activation bounds # are too loose loose = torch.logical_or(lower < -self.loose_threshold, upper > self.loose_threshold) d_lower[loose] = lower[loose] d_upper[loose] = upper[loose] ns = self._start self.tp_both_lower_init[ns] = tp_both_lower_init self.tp_both_upper_init[ns] = tp_both_upper_init # Not optimized (vanilla CROWN bound). # Use the middle point slope as the lower/upper bound. Not optimized. m = (lower + upper) / 2 y_m = func(m) k_m = dfunc(m) # Lower bound is the middle point slope for the case input upper bound <= 0. # Note that the upper bound in this case is the direct line between (lower, func(lower)) and (upper, func(upper)). self.add_linear_relaxation(mask=self.mask_neg, type='lower', k=k_m, x0=m, y0=y_m) # Upper bound is the middle point slope for the case input lower bound >= 0. # Note that the lower bound in this case is the direct line between (lower, func(lower)) and (upper, func(upper)). self.add_linear_relaxation(mask=self.mask_pos, type='upper', k=k_m, x0=m, y0=y_m) # Now handle the case where input lower bound <=0 and upper bound >= 0. # A tangent line starting at d_lower is guaranteed to be a lower bound given the input upper bound. k = dfunc(d_lower) # Another possibility is to use the direct line as the lower bound, when this direct line does not intersect with f. # This is only valid when the slope at the input lower bound has a slope greater than the direct line. self.add_linear_relaxation(mask=mask_direct_lower, type='lower', k=k_direct, x0=lower, y0=y_l) # Otherwise (i.e., when the input interval cross zero and mask_direct_lower is not true), # we do not use the direct line, we use the d_lower slope. self.add_linear_relaxation( mask=torch.logical_xor(self.mask_both, mask_direct_lower), type='lower', k=k, x0=d_lower, y0=func(d_lower)) # Do the same for the upper bound side when input lower bound <=0 and upper bound >= 0. k = dfunc(d_upper) self.add_linear_relaxation( mask=mask_direct_upper, type='upper', k=k_direct, x0=lower, y0=y_l) self.add_linear_relaxation( mask=torch.logical_xor(self.mask_both, mask_direct_upper), type='upper', k=k, x0=d_upper, y0=func(d_upper)) if self.disable_smoothness_enhancement: return # Partially modify the linear bound computation for intervals that contains 0 so that the linear bound # changes smoothly w.r.t to the input bounds. 
For example, when we fix the input lower bound and drag the # input upper bound, we do not expect the linear bound to change abruptly at any point. # Therefore, under certain conditions, we do not use the above heuristics. Instead, we draw a tangent line # through the middle point (m, func(m)) where m = (lower + upper) / 2 and use it as a linear bound. if self.inverse_s_shape: # When the function has an inverse s-shape (such as pow3), we switch to drawing a tangent line through # the middle point as the lower bound when the default point of tangency is on the left of the middle # point. Otherwise, the lower bound will be too loose on the side of the input upper bound. The change # will make the bound on the other side a little bit looser as a tradeoff for overall tightness. self.add_linear_relaxation( mask=torch.logical_and(self.mask_both, d_lower < m), type='lower', k=k_m, x0=m, y0=y_m) # We make a similar change to the linear upper bound when the default point of tangency is on # the right of the middle point. self.add_linear_relaxation( mask=torch.logical_and(self.mask_both, d_upper >= m), type='upper', k=k_m, x0=m, y0=y_m) elif self.sigmoid_like_mask is not None: # self.sigmoid_like_mask is originally defined for periodic functions like sin and cos. It marks # intervals on the s-shaped or flipped-s-shaped parts of the function. Whether the part is flipped-s- # shaped is determined by comparing func(lower) and func(upper). Currently, some overall s-shaped # function, such as tanh and sigmoid, also has this mask. In the future, we will make it default for # both completely and partially s-shaped functions to reduce branching in the code. y_l = func(lower) y_u = func(upper) # If the input interval is on the s-shaped part of the function, we switch to drawing a tangent line # through the middle point as the lower bound when the default point of tangency is on the right of the # middle point. self.add_linear_relaxation( mask=torch.logical_and(torch.logical_and(self.sigmoid_like_mask, y_l < y_u), d_lower >= m), type='lower', k=k_m, x0=m, y0=y_m) # We switch to drawing a tangent line through the middle point as the upper bound when the default point # of tangency is on the left of the middle point. self.add_linear_relaxation( mask=torch.logical_and(torch.logical_and(self.sigmoid_like_mask, y_l < y_u), d_upper < m), type='upper', k=k_m, x0=m, y0=y_m) # If the input interval is on the flipped-s-shaped part of the function, we flip the condition as well # as whether we change the lower or upper bound. self.add_linear_relaxation( mask=torch.logical_and(torch.logical_and(self.sigmoid_like_mask, y_l >= y_u), d_lower < m), type='lower', k=k_m, x0=m, y0=y_m) self.add_linear_relaxation( mask=torch.logical_and(torch.logical_and(self.sigmoid_like_mask, y_l >= y_u), d_upper >= m), type='upper', k=k_m, x0=m, y0=y_m) else: # Handle simple cases where the function has the most common s shape. Now it serves as a safeguard # against any child operator class whose self.sigmoid_like_mask is uninitialized. Here self.mask_both is # equivalent to self.sigmoid_like_mask & (y_l < y_u) in the case above. self.add_linear_relaxation( mask=torch.logical_and(self.mask_both, d_lower >= m), type='lower', k=k_m, x0=m, y0=y_m) self.add_linear_relaxation( mask=torch.logical_and(self.mask_both, d_upper < m), type='upper', k=k_m, x0=m, y0=y_m) def bound_relax_branch(self, lb, ub): # For functions that are only partially s-shaped, such as sin and cos, the non-s-shaped intervals are re-bounded # here. 
This method returns the linear bound coefficients (lower_slope, lower_bias, upper_slope, upper_bias) of # the non-s-shaped intervals. For globally s-shaped functions like tanh and sigmoid, the method returns 0s. return 0., 0., 0., 0. def bound_relax(self, x, init=False, dim_opt=None): if init: self.init_linear_relaxation(x, dim_opt) lb = x.lower ub = x.upper self.branch_input_domain(lb, ub) if self.activation_bound_option == 'same-slope': self.bound_relax_impl_same_slope(x, self.act_func, self.d_act_func) else: self.bound_relax_impl(x, self.act_func, self.d_act_func) lower_slope, lower_bias, upper_slope, upper_bias = self.bound_relax_branch(lb, ub) self.lw = self.lw * self.sigmoid_like_mask + self.branch_mask * lower_slope self.lb = self.lb * self.sigmoid_like_mask + self.branch_mask * lower_bias self.uw = self.uw * self.sigmoid_like_mask + self.branch_mask * upper_slope self.ub = self.ub * self.sigmoid_like_mask + self.branch_mask * upper_bias def get_split_mask(self, lower, upper, input_index): assert input_index == 0 return torch.logical_and( upper - lower >= self.split_min_gap, torch.logical_or(upper >= self.split_range[0], lower <= self.split_range[1]) ) class BoundPow(BoundSShaped): def __init__(self, attr=None, inputs=None, output_index=0, options=None): self.exponent = 2 super().__init__(attr, inputs, output_index, options) self.ibp_intermediate = False self.has_constraint = True def act_func(x): return torch.pow(x, self.exponent) self.act_func = act_func def d_act_func(x): return self.exponent * torch.pow(x, self.exponent - 1) self.d_act_func = d_act_func def d2_act_func(x): return self.exponent * (self.exponent - 1) * torch.pow(x, self.exponent - 2) self.d2_act_func = d2_act_func def generate_d_lower_upper(self, lower, upper): if self.exponent % 2: # Indices of neurons with input upper bound >=0, # whose optimal slope to lower bound the function was pre-computed. # Note that for neurons with also input lower bound >=0, they will be masked later. d_upper = self.retrieve_from_precompute(self.d_upper, upper, lower) # Indices of neurons with lower bound <=0, # whose optimal slope to upper bound the function was pre-computed. d_lower = self.retrieve_from_precompute(self.d_lower, -lower, upper) return d_lower, d_upper else: return torch.zeros_like(upper), torch.zeros_like(upper) def branch_input_domain(self, lb, ub): lower = lb upper = ub num_inflection = torch.zeros_like(lower) inflection_mat = lower for inflection in self.inflections: num_inflection += torch.logical_and( lower <= inflection, upper >= inflection) inflection_mat = torch.where( torch.logical_and(lower <= inflection, upper >= inflection), torch.tensor(inflection, device=lb.device), inflection_mat) inflection_mask = num_inflection <= 1. 
extreme_mask = torch.ones_like(lower) for extreme in self.extremes: extreme_mask *= torch.logical_or(lower >= extreme, upper <= extreme) self.sigmoid_like_mask = torch.logical_and(inflection_mask, extreme_mask) self.branch_mask = torch.logical_xor(torch.ones_like(lower), self.sigmoid_like_mask) self.inflection_mat = torch.where(self.sigmoid_like_mask, inflection_mat, lower) self.mask_neg = torch.logical_and((self.d2_act_func(lower) >= 0), torch.logical_and((self.d2_act_func(upper) >= 0), self.sigmoid_like_mask)) self.mask_pos = torch.logical_and((self.d2_act_func(lower) < 0), torch.logical_and((self.d2_act_func(upper) < 0), self.sigmoid_like_mask)) self.mask_both = torch.logical_xor(self.sigmoid_like_mask, torch.logical_or(self.mask_neg, self.mask_pos)) self.convex_concave = self.d2_act_func(lower) >= 0 @torch.no_grad() def precompute_relaxation(self, func, dfunc, x_limit = 500): """ This function precomputes the tangent lines that will be used as lower/upper bounds for S-shapes functions. """ self.x_limit = x_limit self.num_points_pre = int(self.x_limit / self.step_pre) max_iter = 100 def check_lower(upper, d): """Given two points upper, d (d <= upper), check if the slope at d will be less than f(upper) at upper.""" k = dfunc(d) # Return True if the slope is a lower bound. return k * (upper - d) + func(d) <= func(upper) def check_upper(lower, d): """Given two points lower, d (d >= lower), check if the slope at d will be greater than f(lower) at lower.""" k = dfunc(d) # Return True if the slope is a upper bound. return k * (lower - d) + func(d) >= func(lower) # Given an upper bound point (>=0), find a line that is guaranteed to # be a lower bound of this function. upper = self.step_pre * torch.arange( 0, self.num_points_pre + 5, device=self.device) r = torch.zeros_like(upper) # Initial guess, the tangent line is at -1. l = -torch.ones_like(upper) while True: # Check if the tangent line at the guessed point is an lower bound at f(upper). checked = check_upper(upper, l).int() # If the initial guess is not smaller enough, then double it (-2, -4, etc). l = checked * l + (1 - checked) * (l * 2) if checked.sum() == l.numel(): break # Now we have starting point at l, its tangent line is guaranteed to # be an lower bound at f(upper). # We want to further tighten this bound by moving it closer to 0. for _ in range(max_iter): # Binary search. m = (l + r) / 2 checked = check_upper(upper, m).int() l = checked * m + (1 - checked) * l r = checked * r + (1 - checked) * m # At upper, a line with slope l is guaranteed to lower bound the function. self.d_upper = l.clone() # Do the same again: # Given an lower bound point (<=0), find a line that is guaranteed to # be an upper bound of this function. 
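# --- Illustrative aside (not part of the library source) --------------------
# Before the symmetric search below, a standalone scalar sketch of the
# doubling-plus-bisection scheme both searches use.  torch.tanh stands in for
# the generic `func`, and u is one made-up grid point: we look for the tangent
# point closest to 0 whose tangent line still lower-bounds f at u.
import torch
f = torch.tanh
df = lambda x: 1 - torch.tanh(x) ** 2
u = torch.tensor(2.0)
def tangent_stays_below(d):
    return bool(df(d) * (u - d) + f(d) <= f(u))
l, r = torch.tensor(-1.0), torch.tensor(0.0)
while not tangent_stays_below(l):        # doubling phase: move left until valid
    l = 2 * l
for _ in range(100):                     # bisection: push the point toward 0
    m = (l + r) / 2
    l, r = (m, r) if tangent_stays_below(m) else (l, m)
print(float(l))                          # tightest valid tangent point for this u
# -----------------------------------------------------------------------------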
lower = -self.step_pre * torch.arange( 0, self.num_points_pre + 5, device=self.device) l = torch.zeros_like(upper) r = torch.ones_like(upper) while True: checked = check_lower(lower, r).int() r = checked * r + (1 - checked) * (r * 2) if checked.sum() == l.numel(): break for _ in range(max_iter): m = (l + r) / 2 checked = check_lower(lower, m).int() l = (1 - checked) * m + checked * l r = (1 - checked) * r + checked * m self.d_lower = r.clone() def forward(self, x, y): return torch.pow(x, y) def bound_backward(self, last_lA, last_uA, x, y, start_node=None, start_shape=None, **kwargs): assert not self.is_input_perturbed(1) self._start = start_node.name if start_node is not None else None y = y.value if y == int(y): x.upper = torch.max(x.upper, x.lower + 1e-8) self.exponent = int(y) assert self.exponent >= 2 if self.exponent % 2: self.precompute_relaxation(self.act_func, self.d_act_func) As, lbias, ubias = super().bound_backward( last_lA, last_uA, x, start_node, start_shape, **kwargs) return [As[0], (None, None)], lbias, ubias else: raise NotImplementedError('Exponent is not supported yet') def bound_forward(self, dim_in, x, y): assert y.lower == y.upper == int(y.lower) y = y.lower x.upper = torch.max(x.upper, x.lower + 1e-8) self.exponent = int(y) assert self.exponent >= 2 if self.exponent % 2: self.precompute_relaxation(self.act_func, self.d_act_func) return super().bound_forward(dim_in, x) def bound_relax_branch(self, lb, ub): if self.opt_stage in ['opt', 'reuse']: if not hasattr(self, 'alpha'): # Raise an error if alpha is not created. self._no_bound_parameters() ns = self._start self.alpha[ns].data[8:10] = torch.max( torch.min(self.alpha[ns][8:10], ub), lb) lb_point = self.alpha[ns][8:10] lower_slope = self.d_act_func(lb_point) lower_bias = self.act_func(lb_point) - lower_slope * lb_point else: lower_slope = 0 lower_bias = 0 upper_slope = (self.act_func(ub) - self.act_func(lb)) / (ub - lb).clamp(min=1e-8) upper_bias = self.act_func(ub) - ub * upper_slope return lower_slope, lower_bias, upper_slope, upper_bias def bound_relax(self, x, init=False, dim_opt=None): # For powers with odd exponents, such as x^3, the overall shape is inverse S-like. self.inverse_s_shape = self.exponent % 2 == 1 if self.exponent % 2: self.inflections = [0.] else: self.extremes = [0.] super().bound_relax(x, init, dim_opt) def interval_propagate(self, *v): assert not self.is_input_perturbed(1) exp = v[1][0] assert exp == int(exp) exp = int(exp) pl, pu = torch.pow(v[0][0], exp), torch.pow(v[0][1], exp) if exp % 2 == 1: return pl, pu else: pl, pu = torch.min(pl, pu), torch.max(pl, pu) mask = 1 - ((v[0][0] < 0) * (v[0][1] > 0)).to(pl.dtype) return pl * mask, pu def clamp_interim_bounds(self): if self.exponent % 2 == 0: self.cstr_lower = self.lower.clamp(min=0) self.cstr_upper = self.upper.clamp(min=0) self.cstr_interval = (self.cstr_lower, self.cstr_upper) def dtanh(x): return 1 - torch.tanh(x).pow(2) def dsigmoid(x): return torch.sigmoid(x) * (1 - torch.sigmoid(x)) def darctan(x): return (x.square() + 1.).reciprocal() def d2tanh(x): return -2 * torch.tanh(x) * (1 - torch.tanh(x).pow(2)) def d2sigmoid(x): return dsigmoid(x) * (1 - 2 * torch.sigmoid(x)) class BoundTanh(BoundSShaped): """ BoundTanh is based on the S-shaped BoundSShaped. In the meantime, it works as the base class for other globally S-shaped functions such as Sigmoid and Atan. 
""" def __init__(self, attr=None, inputs=None, output_index=0, options=None, activation=('tanh', torch.tanh, dtanh), precompute=True): super().__init__(attr, inputs, output_index, options, activation, precompute) def _init_opt_parameters_impl(self, size_spec, name_start): """Implementation of init_opt_parameters for each start_node.""" return super()._init_opt_parameters_impl(size_spec, name_start, num_params=8) def build_gradient_node(self, grad_upstream): node_grad = TanhGrad() grad_input = (grad_upstream, self.inputs[0].forward_value) grad_extra_nodes = [self.inputs[0]] return [(node_grad, grad_input, grad_extra_nodes)] class TanhGradOp(Function): @staticmethod def symbolic(_, preact): return _.op('grad::Tanh', preact).setType(preact.type()) @staticmethod def forward(ctx, preact): return 1 - torch.tanh(preact)**2 class TanhGrad(Module): def forward(self, g, preact): return g * TanhGradOp.apply(preact).unsqueeze(1) class BoundTanhGrad(BoundOptimizableActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None, activation=('tanh', dtanh, d2tanh), precompute=True): super().__init__(attr, inputs, output_index, options) self.requires_input_bounds = [0] # The inflection point is where d2f/dx2 = 0. self.inflection_point = 0.6585026 self.func = activation[1] self.dfunc = activation[2] if precompute: self.precompute_relaxation() def forward(self, x): return self.func(x) def interval_propagate(self, *v): lower, upper = v[0] f_lower = self.func(lower) f_upper = self.func(upper) next_lower = torch.min(f_lower, f_upper) next_upper = torch.max(f_lower, f_upper) mask_both = torch.logical_and(lower < 0, upper > 0) next_upper[mask_both] = self.func(torch.tensor(0)) return next_lower, next_upper def bound_relax(self, x, init=False, dim_opt=None): if init: self.init_linear_relaxation(x, dim_opt) return self.bound_relax_impl(x) def precompute_relaxation(self, x_limit=500): """ This function precomputes the tangent lines that will be used as the lower/upper bounds for bell-shaped functions. Three tensors are precomputed: - self.precompute_x: The x values of the upper preactivation bound. - self.d_lower: The tangent points of the lower bound. - self.d_upper: The tangent points of the upper bound. """ self.x_limit = x_limit self.step_pre = 0.01 self.num_points_pre = int(self.x_limit / self.step_pre) max_iter = 100 func, dfunc = self.func, self.dfunc logger.debug('Precomputing relaxation for %s (pre-activation limit: %f)', self.__class__.__name__, x_limit) def check_lower(upper, d): """Given two points upper, d (d <= upper), check if the slope at d will be less than f(upper) at upper.""" k = dfunc(d) # Return True if the slope is a lower bound. return k * (upper - d) + func(d) <= func(upper) def check_upper(lower, d): """Given two points lower, d (d <= lower), check if the slope at d will be greater than f(lower) at lower.""" k = dfunc(d) # Return True if the slope is a upper bound. return k * (lower - d) + func(d) >= func(lower) self.precompute_x = torch.arange(-self.x_limit, self.x_limit + self.step_pre, self.step_pre, device=self.device) self.d_lower = torch.zeros_like(self.precompute_x) self.d_upper = torch.zeros_like(self.precompute_x) # upper point that needs lower precomputed tangent line mask_need_d_lower = self.precompute_x >= -self.inflection_point upper = self.precompute_x[mask_need_d_lower] # 1. 
Initial guess, the tangent is at -2*inflection_point (should be between (-inf, -inflection_point)) r = -self.inflection_point * torch.ones_like(upper) l = -2 * self.inflection_point * torch.ones_like(upper) while True: # Check if the tangent line at the guessed point is an lower bound at f(upper). checked = check_lower(upper, l).int() # If the initial guess is not smaller enough, then double it (-2, -4, etc). l = checked * l + (1 - checked) * (l * 2) if checked.sum() == l.numel(): break # Now we have starting point at l, its tangent line is guaranteed to be an lower bound at f(upper). # We want to further tighten this bound by moving it closer to upper. for _ in range(max_iter): # Binary search. m = (l + r) / 2 checked = check_lower(upper, m).int() l = checked * m + (1 - checked) * l r = checked * r + (1 - checked) * m # At upper, a line with slope l is guaranteed to lower bound the function. self.d_lower[mask_need_d_lower] = l.clone() # upper point that needs upper precomputed tangent line mask_need_upper_d = self.precompute_x >= self.inflection_point upper = self.precompute_x[mask_need_upper_d] # 1. Initial guess, the tangent is at inflection_point/2 (should be between (0, inflection_point)) r = self.inflection_point * torch.ones_like(upper) l = self.inflection_point / 2 * torch.ones_like(upper) while True: # Check if the tangent line at the guessed point is an upper bound at f(upper). checked = check_upper(upper, l).int() # If the initial guess is not smaller enough, then reduce it. l = checked * l + (1 - checked) * (l / 2) if checked.sum() == l.numel(): break # Now we have starting point at l, its tangent line is guaranteed to be an upper bound at f(upper). # We want to further tighten this bound by moving it closer to upper. for _ in range(max_iter): # Binary search. m = (l + r) / 2 checked = check_upper(upper, m).int() l = checked * m + (1 - checked) * l r = checked * r + (1 - checked) * m # At upper, a line with slope l is guaranteed to upper bound the function. self.d_upper[mask_need_upper_d] = l.clone() def retrieve_from_precompute(self, x, flip=False): if not flip: if x.max() > self.x_limit: warnings.warn(f'Pre-activation bounds are too loose for {self}') # Take the left endpoint of the interval x_indices = torch.searchsorted(self.precompute_x, x, right=True) - 1 return self.d_lower[x_indices], self.d_upper[x_indices] else: if x.min() < -self.x_limit: warnings.warn(f'Pre-activation bounds are too loose for {self}') # Take the right endpoint of the interval x_indices = torch.searchsorted(self.precompute_x, -x, right=False) return -self.d_lower[x_indices], -self.d_upper[x_indices] def bound_relax_impl(self, x): lower, upper = x.lower, x.upper func, dfunc = self.func, self.dfunc y_l, y_u = func(lower), func(upper) # k_direct is the slope of the line directly connect (lower, func(lower)), (upper, func(upper)). k_direct = (y_u - y_l) / (upper - lower).clamp(min=1e-8) # The tangent line at the midpoint can be a good approximation midpoint = (lower + upper) / 2 k_midpoint = dfunc(midpoint) y_midpoint = func(midpoint) # If -inflection_point <= lower < upper <= inflection_point, # we call it "completely concave" region. 
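# --- Illustrative aside (not part of the library source) --------------------
# A standalone check of where the constant self.inflection_point = 0.6585026
# used by this class comes from.  The bounded function is f(x) = dtanh(x)
# = 1 - tanh(x)^2, whose curvature f''(x) = -2*(1 - tanh(x)^2)*(1 - 3*tanh(x)^2)
# changes sign where tanh(x)^2 = 1/3, i.e. at x = atanh(1/sqrt(3)).  Inside
# [-x*, x*] the function is concave, which is the "completely concave" region
# mentioned above.
import torch
x_star = torch.atanh(torch.tensor(1.0 / 3.0).sqrt())
print(float(x_star))                                                    # ~0.6585026
curv = lambda x: -2 * (1 - torch.tanh(x)**2) * (1 - 3 * torch.tanh(x)**2)
print(float(curv(torch.tensor(0.5))), float(curv(torch.tensor(0.8))))   # sign flip
# -----------------------------------------------------------------------------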
mask_completely_concave = torch.logical_and( lower >= -self.inflection_point, upper <= self.inflection_point ) self.add_linear_relaxation( mask=mask_completely_concave, type='lower', k=k_direct, x0=lower, y0=y_l) self.add_linear_relaxation( mask=mask_completely_concave, type='upper', k=k_midpoint, x0=midpoint, y0=y_midpoint) # From now on, we assume at least one of the bounds is outside the completely concave region. # Without loss of generality, we assume upper > inflection_point (indicated by mask_right). mask_right = lower + upper >= 0 dl, du = self.retrieve_from_precompute(upper, flip=False) dl_, du_ = self.retrieve_from_precompute(lower, flip=True) # Case 1: Similar to a convex function mask_case1 = torch.logical_or( torch.logical_and(mask_right, lower >= self.inflection_point), torch.logical_and(torch.logical_not(mask_right), upper <= -self.inflection_point) ) self.add_linear_relaxation( mask=mask_case1, type='upper', k=k_direct, x0=lower, y0=y_l) self.add_linear_relaxation( mask=mask_case1, type='lower', k=k_midpoint, x0=midpoint, y0=y_midpoint) # Case 2: Similar to a S-shaped function mask_case2_right = torch.logical_and(mask_right, torch.logical_and( upper > self.inflection_point, lower < self.inflection_point)) # The upper tangent point is lineraly interpolated between 0 and du, # given lower ranging between -upper and du. d_mask_case2_right_upper = du * (lower + upper) / (du + upper) k_mask_case2_right_upper = dfunc(d_mask_case2_right_upper) y_mask_case2_right_upper = func(d_mask_case2_right_upper) self.add_linear_relaxation( mask=mask_case2_right, type='upper', k=k_mask_case2_right_upper, x0=d_mask_case2_right_upper, y0=y_mask_case2_right_upper) # The lower tangent point is found based on lower. d_mask_case2_right_lower = (dl_ + upper) / 2 k_mask_case2_right_lower = dfunc(d_mask_case2_right_lower) y_mask_case2_right_lower = func(d_mask_case2_right_lower) self.add_linear_relaxation( mask=torch.logical_and(mask_case2_right, dl_ < upper), type='lower', k=k_mask_case2_right_lower, x0=d_mask_case2_right_lower, y0=y_mask_case2_right_lower) self.add_linear_relaxation( mask=torch.logical_and(mask_case2_right, dl_ >= upper), type='lower', k=k_direct, x0=lower, y0=y_l) mask_case2_left = torch.logical_and(torch.logical_not(mask_right), torch.logical_and( lower < -self.inflection_point, upper > -self.inflection_point)) # The upper tangent point is lineraly interpolated between du_ and 0, # given upper ranging between du_ and -lower. d_mask_case2_left_upper = du_ * (upper + lower) / (du_ + lower) k_mask_case2_left_upper = dfunc(d_mask_case2_left_upper) y_mask_case2_left_upper = func(d_mask_case2_left_upper) self.add_linear_relaxation( mask=mask_case2_left, type='upper', k=k_mask_case2_left_upper, x0=d_mask_case2_left_upper, y0=y_mask_case2_left_upper) # The lower tangent point is found based on upper. d_mask_case2_left_lower = (dl + lower) / 2 k_mask_case2_left_lower = dfunc(d_mask_case2_left_lower) y_mask_case2_left_lower = func(d_mask_case2_left_lower) self.add_linear_relaxation( mask=torch.logical_and(mask_case2_left, dl > lower), type='lower', k=k_mask_case2_left_lower, x0=d_mask_case2_left_lower, y0=y_mask_case2_left_lower) self.add_linear_relaxation( mask=torch.logical_and(mask_case2_left, dl <= lower), type='lower', k=k_direct, x0=upper, y0=y_u) # If the lower and upper bounds are too close, we just use IBP bounds to avoid numerical issues. 
mask_very_close = upper - lower < 1e-6 if mask_very_close.any(): self.add_linear_relaxation( mask=torch.logical_and(mask_very_close, self.mask_neg), type='lower', k=0, x0=lower, y0=y_l) self.add_linear_relaxation( mask=torch.logical_and(mask_very_close, self.mask_neg), type='upper', k=0, x0=upper, y0=y_u) self.add_linear_relaxation( mask=torch.logical_and(mask_very_close, self.mask_pos), type='lower', k=0, x0=upper, y0=y_u) self.add_linear_relaxation( mask=torch.logical_and(mask_very_close, self.mask_pos), type='upper', k=0, x0=lower, y0=y_l) self.add_linear_relaxation( mask=torch.logical_and(mask_very_close, self.mask_both), type='lower', k=0, x0=lower, y0=torch.min(y_l, y_u)) self.add_linear_relaxation( mask=torch.logical_and(mask_very_close, self.mask_both), type='upper', k=0, x0=upper, y0=torch.full_like(y_l, func(torch.tensor(0)))) class BoundSigmoid(BoundTanh): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options, activation=('sigmoid', torch.sigmoid, dsigmoid)) def build_gradient_node(self, grad_upstream): node_grad = SigmoidGrad() grad_input = (grad_upstream, self.inputs[0].forward_value) grad_extra_nodes = [self.inputs[0]] return [(node_grad, grad_input, grad_extra_nodes)] class SigmoidGradOp(Function): @staticmethod def symbolic(_, preact): return _.op('grad::Sigmoid', preact).setType(preact.type()) @staticmethod def forward(ctx, preact): sigmoid_x = torch.sigmoid(preact) return sigmoid_x * (1 - sigmoid_x) class SigmoidGrad(Module): def forward(self, g, preact): return g * SigmoidGradOp.apply(preact).unsqueeze(1) class BoundSigmoidGrad(BoundTanhGrad): def __init__(self, attr=None, inputs=None, output_index=0, options=None, activation=('sigmoid', dsigmoid, d2sigmoid), precompute=True): super().__init__(attr, inputs, output_index, options, activation, precompute=False) self.inflection_point = 1.3169614 if precompute: self.precompute_relaxation() class BoundAtan(BoundTanh): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options, activation=('arctan', torch.arctan, darctan)) self.split_range = (-torch.inf, torch.inf) def build_gradient_node(self, grad_upstream): node_grad = AtanGrad() grad_input = (grad_upstream, self.inputs[0].forward_value) grad_extra_nodes = [self.inputs[0]] return [(node_grad, grad_input, grad_extra_nodes)] class AtanGrad(Module): def forward(self, g, preact): # arctan'(x) = 1 / (1 + x^2) return g / (1 + preact.square()).unsqueeze(1) class BoundTan(BoundAtan): """ The implementation of BoundTan is based on the S-shaped BoundAtan. We use the bounds from its inverse function and directly convert the bounds of the inverse function to bounds of the original function. This trick allows us to quickly implement bounds on inverse functions. """ def forward(self, x): return torch.tan(x) def _check_bounds(self, lower, upper): # Lower and upper bounds must be within the same [-½π, ½π] region. lower_periods = torch.floor((lower + 0.5 * torch.pi) / torch.pi) upper_periods = torch.floor((upper + 0.5 * torch.pi) / torch.pi) if not torch.allclose(lower_periods, upper_periods): print('Tan preactivation lower bounds:\n', lower) print('Tan preactivation upper bounds:\n', upper) raise ValueError("BoundTan received pre-activation bounds that produce infinity. " "The preactivation bounds are too loose. Try to reduce perturbation region.") # Return the period number for each neuron. 
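# --- Illustrative aside (not part of the library source) --------------------
# What the period number computed above looks like on a few made-up values:
# floor((x + pi/2) / pi) indexes which branch of tan the input lies on.
import torch
x = torch.tensor([0.0, 3.0, -3.0])
print(torch.floor((x + 0.5 * torch.pi) / torch.pi))    # tensor([0., 1., -1.])
# -----------------------------------------------------------------------------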
# Period is 0 => bounds are within [-½π, ½π], # Period is 1 => bounds are within [-½π + π, ½π + π] # Period is -1 => bounds are within [-½π - π, ½π - π] return lower_periods def _init_masks(self, x): # The masks now must consider the periodicity. lower = torch.remainder(x.lower + 0.5 * torch.pi, torch.pi) - 0.5 * torch.pi upper = torch.remainder(x.upper + 0.5 * torch.pi, torch.pi) - 0.5 * torch.pi self.mask_pos = lower >= 0 self.mask_neg = upper <= 0 self.mask_both = torch.logical_not(torch.logical_or(self.mask_pos, self.mask_neg)) def interval_propagate(self, *v): # We need to check if the input lower and upper bounds are within the same period. # Otherwise the bounds become infinity. concrete_lower, concrete_upper = v[0][0], v[0][1] self._check_bounds(concrete_lower, concrete_upper) return super().interval_propagate(*v) def bound_relax(self, x, init=False, dim_opt=None): if init: self.init_linear_relaxation(x, dim_opt) periods = self._check_bounds(x.lower, x.upper) periods = torch.pi * periods # Create a fake x with inversed lower and upper. inverse_x = lambda: None inverse_x.lower = torch.tan(x.lower) inverse_x.upper = torch.tan(x.upper) super().bound_relax(inverse_x, init=init, dim_opt=dim_opt) # Lower slope, lower bias, upper slope and upper bias are saved to # self.lw, self.lb, self.uw, self.ub. We need to reverse them. # E.g., y = self.lw * x + self.lb, now becomes x = 1./self.lw * y - self.lb / self.lw # Additionally, we need to add the missing ½π periods. new_upper_slope = 1. / self.lw new_upper_bias = - self.lb / self.lw - periods / self.lw new_lower_slope = 1. / self.uw new_lower_bias = - self.ub / self.uw - periods / self.uw # NaN can happen if lw=0 or uw=0 when the pre-activation bounds are too close # Replace the bounds with interval bounds. if (self.lw == 0).any(): mask = self.lw == 0 new_upper_slope[mask] = 0 new_upper_bias[mask] = inverse_x.upper[mask] if (self.uw == 0).any(): mask = self.uw == 0 new_lower_slope[mask] = 0 new_lower_bias[mask] = inverse_x.lower[mask] self.lw = new_lower_slope self.lb = new_lower_bias self.uw = new_upper_slope self.ub = new_upper_bias ================================================ FILE: auto_LiRPA/operators/shape.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### """ Shape operators """ from .base import * class BoundShape(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.never_perturbed = True @staticmethod def shape(x): return x.shape if isinstance(x, Tensor) else torch.tensor(x).shape def forward(self, x): self.from_input = False return BoundShape.shape(x) def bound_forward(self, dim_in, x): return self.forward_value def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): if not isinstance(v[0], Tensor): # e.g., v[0] input shape (8, 7, 7) => output its shape (1, 8, 7, 7) gvars_array = np.array(v[0]) self.solver_vars = torch.tensor(np.expand_dims(gvars_array, axis=0).shape).long() else: self.solver_vars = torch.tensor(self.forward(v[0])).long() ================================================ FILE: auto_LiRPA/operators/slice_concat.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """ Shape operators """ from torch.nn import Module from torch.autograd import Function from .base import * from ..patches import Patches from .constant import BoundConstant class BoundConcat(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.axis = attr['axis'] self.IBP_rets = None self.ibp_intermediate = True def forward(self, *x): # x is a list of tensors x = [(item if isinstance(item, Tensor) else torch.tensor(item)) for item in x] self.input_size = [item.shape[self.axis] for item in x] self.axis = self.make_axis_non_negative(self.axis) return torch.cat(x, dim=int(self.axis)) def interval_propagate(self, *v): norms = [] eps = [] # Collect perturbation information for all inputs. for i, _v in enumerate(v): if self.is_input_perturbed(i): n, e = Interval.get_perturbation(_v) norms.append(n) eps.append(e) else: norms.append(None) eps.append(0.0) eps = np.array(eps) # Supporting two cases: all inputs are Linf norm, or all inputs are L2 norm perturbed. # Some inputs can be constants without perturbations. all_inf = all(map(lambda x: x is None or x == torch.inf, norms)) all_2 = all(map(lambda x: x is None or x == 2, norms)) h_L = [_v[0] for _v in v] h_U = [_v[1] for _v in v] if all_inf: # Simply returns a tuple. Every subtensor has its own lower and upper bounds. return self.forward(*h_L), self.forward(*h_U) elif all_2: # Sum the L2 norm over all subtensors, and use that value as the new L2 norm. # This will be an over-approximation of the original perturbation (we can prove it). max_eps = np.sqrt(np.sum(eps * eps)) # For L2 norm perturbed inputs, lb=ub and for constants lb=ub. Just propagate one object. 
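# --- Illustrative aside (not part of the library source) --------------------
# Why the combined eps above is sound, checked on made-up sizes: if
# ||d1||_2 <= e1 and ||d2||_2 <= e2, then the concatenated perturbation
# satisfies ||(d1, d2)||_2 = sqrt(||d1||^2 + ||d2||^2) <= sqrt(e1^2 + e2^2).
import torch
e1, e2 = 0.3, 0.4
print((e1 ** 2 + e2 ** 2) ** 0.5)                     # 0.5, the combined eps
d1 = torch.randn(5); d1 = e1 * d1 / d1.norm()         # ||d1|| == e1
d2 = torch.randn(7); d2 = e2 * d2 / d2.norm()         # ||d2|| == e2
print(float(torch.cat([d1, d2]).norm()))              # 0.5 up to rounding
# -----------------------------------------------------------------------------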
r = self.forward(*h_L) ptb = PerturbationLpNorm(norm=2, eps=max_eps) return Interval(r, r, ptb=ptb) else: raise RuntimeError(f"BoundConcat does not support inputs with norm {norms}") def bound_backward(self, last_lA, last_uA, *x, **kwargs): self.axis = self.make_axis_non_negative(self.axis, 'output') assert self.axis > 0 def _bound_oneside(last_A): if last_A is None: return None if isinstance(last_A, torch.Tensor): ret = list(torch.split(last_A, self.input_size, dim=self.axis + 1)) # Skip unused input nodes to reduce the cost of computing unused intermediate bounds for i in range(len(ret)): if (ret[i] == 0).all(): ret[i] = None return ret elif isinstance(last_A, Patches): assert len(self.input_shape) == 4 and self.axis == 1, "Split channel dimension is supported; others are unimplemented." # Patches shape can be [out_c, batch, out_h, out_w, in_c, patch_h, patch_w] # Or [spec, batch, in_c, patch_h, patch_w] (sparse) new_patches = torch.split(last_A.patches, self.input_size, dim=-3) # split the in_c dimension is easy. return [last_A.create_similar(p) for p in new_patches] else: raise RuntimeError(f'Unsupported type for last_A: {type(last_A)}') uA = _bound_oneside(last_uA) lA = _bound_oneside(last_lA) if uA is None: return [(lA[i] if lA is not None else None, None) for i in range(len(lA))], 0, 0 if lA is None: return [(None, uA[i] if uA is not None else None) for i in range(len(uA))], 0, 0 # To avoid issues in other parts of the code, we prune unused # lA and uA only when they are both unused. for i in range(len(lA)): if lA[i] is None and uA[i] is not None: lA[i] = torch.zeros_like(uA[i]) elif lA[i] is not None and uA[i] is None: uA[i] = torch.zeros_like(lA[i]) return [(lA[i], uA[i]) for i in range(len(lA))], 0, 0 def bound_forward(self, dim_in, *x): self.axis = self.make_axis_non_negative(self.axis) assert (self.axis == 0 and not self.from_input or self.from_input) # Concatenate each input's bounds along the axis. # If x[i].lw and x[i].uw is None, it means the input is a constant, # so we concatenate a tensor of zeros with the corresponding shape. 
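# --- Illustrative aside (not part of the library source) --------------------
# The reason bound_backward above can simply split last_A along the concat
# axis, checked on made-up shapes: splitting the coefficients is the adjoint
# of concatenating the inputs, so <A, cat(x1, x2)> == <A1, x1> + <A2, x2>.
import torch
x1, x2 = torch.randn(3), torch.randn(2)
A = torch.randn(5)
A1, A2 = torch.split(A, [3, 2])
print(float((A * torch.cat([x1, x2])).sum()),
      float((A1 * x1).sum() + (A2 * x2).sum()))       # equal up to rounding
# -----------------------------------------------------------------------------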
lw = torch.cat([item.lw if item.lw is not None else torch.zeros(item.lb.shape[0], dim_in, *item.lb.shape[1:], device=item.lb.device) for item in x], dim=self.axis + 1) lb = torch.cat([item.lb for item in x], dim=self.axis) uw = torch.cat([item.uw if item.uw is not None else torch.zeros(item.ub.shape[0], dim_in, *item.ub.shape[1:], device=item.ub.device) for item in x], dim=self.axis + 1) ub = torch.cat([item.ub for item in x], dim=self.axis) return LinearBound(lw, lb, uw, ub) def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): self.solver_vars = self.forward(*v) def build_gradient_node(self, grad_upstream): ret = [] for i in range(len(self.inputs)): node_grad = ConcatGrad(self.axis, i) grad_input = (grad_upstream, ) + tuple(inp.forward_value for inp in self.inputs) ret.append((node_grad, grad_input, [])) return ret BoundConcatFromSequence = BoundConcat class BoundSlice(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.start = attr["starts"][0] if "starts" in attr else None self.end = attr["ends"][0] if "ends" in attr else None self.axes = attr["axes"][0] if "axes" in attr else None self.use_default_ibp = False self.ibp_intermediate = True def __repr__(self): attrs = {} if (len(self.inputs) == 5 and all(isinstance(item, BoundConstant) and item.value.numel() == 1 for item in self.inputs[1:])): attrs['start'] = self.inputs[1].value.item() attrs['end'] = self.inputs[2].value.item() attrs['axes'] = self.inputs[3].value.item() attrs['step'] = self.inputs[4].value.item() return super().__repr__(attrs) def _fixup_params(self, shape, start, end, axes, steps): if start < 0: start += shape[axes] if end < 0: if end == -9223372036854775807: # -inf in ONNX end = 0 # only possible when step == -1 else: end += shape[axes] if steps == -1: start, end = end, start + 1 # TODO: more test more negative step size. end = min(end, shape[axes]) return start, end # Older Pytorch version only passes steps as input. def forward(self, x, start=None, end=None, axes=None, steps=1): start = self.start if start is None else start end = self.end if end is None else end axes = self.axes if axes is None else axes assert (steps == 1 or steps == -1) and axes == int(axes) and start == int(start) and end == int(end) shape = x.shape if isinstance(x, Tensor) else [len(x)] start, end = self._fixup_params(shape, start, end, axes, steps) final = torch.narrow(x, dim=int(axes), start=int(start), length=int(end - start)) if steps == -1: final = torch.flip(final, dims=tuple(axes)) return final def interval_propagate(self, *v): lb = tuple(map(lambda x:x[0],v)) ub = tuple(map(lambda x:x[1],v)) return Interval.make_interval(self.forward(*lb), self.forward(*ub)) def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): self.solver_vars = self.forward(*v) def bound_backward(self, last_lA, last_uA, *x, **kwargs): def _bound_oneside(A, start, end, axes, steps): if A is None: return None if isinstance(A, torch.Tensor): # Reuse the batch and spec dimension of A, and replace other shapes with input. A_shape = A.shape[:2] + self.input_shape[1:] new_A = torch.zeros(size=A_shape, device=A.device, requires_grad=A.requires_grad) # Fill part of the new_A based on start, end, axes and steps. # Skip the spec dimension at the front (axes + 1). 
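# --- Illustrative aside (not part of the library source) --------------------
# A minimal sketch of the "un-slice" performed below, with made-up shapes: the
# coefficients A refer to the sliced output, so they are written back into a
# zero tensor shaped like the full input, and <A, x[start:end]> == <new_A, x>.
import torch
x = torch.arange(6.0)                                # pretend full input
start, end = 2, 5
A = torch.ones(1, 1, end - start)                    # [spec, batch, sliced_len]
new_A = torch.zeros(1, 1, x.numel())
new_A = torch.index_copy(new_A, dim=2, index=torch.arange(start, end), source=A)
print(float((A * x[start:end]).sum()), float((new_A * x).sum()))   # both 9.0
# -----------------------------------------------------------------------------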
dim = axes if axes < 0 else axes + 1 indices = torch.arange(start, end, device=A.device) new_A = torch.index_copy(new_A, dim=dim, index=indices, source=A) elif isinstance(A, Patches): assert A.unstable_idx is None assert len(self.input_shape) == 4 and axes == 1, "Slice is only supported on channel dimension." patches = A.patches # patches shape is [out_c, batch, out_h, out_w, in_c, patch_h, patch_w]. new_patches_shape = patches.shape[:4] + (self.input_shape[1], ) + patches.shape[-2:] new_patches = torch.zeros( size=new_patches_shape, device=patches.device, requires_grad=patches.requires_grad) indices = torch.arange(start, end, device=patches.device) new_patches = torch.index_copy(new_patches, dim=-3, index=indices, source=patches) # Only the in_c dimension is changed. new_A = A.create_similar(new_patches) else: raise ValueError(f'Unsupport A type {type(A)}') return new_A start, end, axes = x[1].value.item(), x[2].value.item(), x[3].value.item() steps = x[4].value.item() if len(x) == 5 else 1 # If step is not specified, it is 1. # Other step size untested, do not enable for now. assert steps == 1 and axes == int(axes) and start == int(start) and end == int(end) start, end = self._fixup_params(self.input_shape, start, end, axes, steps) # Find the original shape of A. lA = _bound_oneside(last_lA, start, end, axes, steps) uA = _bound_oneside(last_uA, start, end, axes, steps) return [(lA, uA), (None, None), (None, None), (None, None), (None, None)], 0, 0 def bound_forward(self, dim_in, *inputs): assert len(inputs) == 5 or len(inputs) == 4 start = inputs[1].lb.item() end = inputs[2].lb.item() axis = self.make_axis_non_negative(inputs[3].lb.item()) assert axis > 0, "Slicing along the batch dimension is not supported yet" steps = inputs[4].lb.item() if len(inputs) == 5 else 1 # If step is not specified, it is 1. 
assert steps in [1, -1] x = inputs[0] shape = x.lb.shape start, end = self._fixup_params(shape, start, end, axis, steps) lw = torch.narrow(x.lw, dim=axis+1, start=start, length=end - start) uw = torch.narrow(x.uw, dim=axis+1, start=start, length=end - start) lb = torch.narrow(x.lb, dim=axis, start=start, length=end - start) ub = torch.narrow(x.ub, dim=axis, start=start, length=end - start) if steps == -1: lw = torch.flip(lw, dims=tuple(axis+1)) uw = torch.flip(uw, dims=tuple(axis+1)) lb = torch.flip(lb, dims=tuple(axis)) ub = torch.flip(ub, dims=tuple(axis)) return LinearBound(lw, lb, uw, ub) def build_gradient_node(self, grad_upstream): assert len(self.inputs) == 5 start = self.inputs[1].value.item() end = self.inputs[2].value.item() axes = self.inputs[3].value.item() steps = self.inputs[4].value.item() assert steps == 1 node_grad = SliceGrad(start, end, axes, steps) grad_input = (grad_upstream, self.inputs[0].forward_value) return [(node_grad, grad_input, [])] class BoundSplit(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.axis = attr['axis'] self.use_default_ibp = True if 'split' in attr: self.split = attr['split'] else: self.split = None def forward(self, *x): data = x[0] split = self.split if self.split is not None else x[1].tolist() if self.axis == -1: self.axis = len(data.shape) - 1 return torch.split(data, split, dim=self.axis)[self.output_index] def bound_backward(self, last_lA, last_uA, *x, **kwargs): assert self.axis > 0 split = self.split if self.split is not None else x[1].value.tolist() pre = sum(split[:self.output_index]) suc = sum(split[(self.output_index + 1):]) def _bound_oneside(last_A): if last_A is None: return None A = [] if pre > 0: A.append(torch.zeros( *last_A.shape[:(self.axis + 1)], pre, *last_A.shape[(self.axis + 2):], device=last_A.device)) A.append(last_A) if suc > 0: A.append(torch.zeros( *last_A.shape[:(self.axis + 1)], suc, *last_A.shape[(self.axis + 2):], device=last_A.device)) return torch.cat(A, dim=self.axis + 1) return [(_bound_oneside(last_lA), _bound_oneside(last_uA)), (None, None)], 0, 0 def bound_forward(self, dim_in, *x): assert self.axis > 0 and self.from_input split = self.split if self.split is not None else x[1].lb.tolist() x = x[0] lw = torch.split(x.lw, split, dim=self.axis + 1)[self.output_index] uw = torch.split(x.uw, split, dim=self.axis + 1)[self.output_index] lb = torch.split(x.lb, split, dim=self.axis)[self.output_index] ub = torch.split(x.ub, split, dim=self.axis)[self.output_index] return LinearBound(lw, lb, uw, ub) def build_solver(self, *v, model, C=None, model_type="mip", solver_pkg="gurobi"): self.solver_vars = self.forward(v[0]) def slice_grad(x, input_shape, start, end, axes, steps): assert steps == 1 assert axes > 0 out = torch.zeros(*x.shape[:2], *input_shape[1:]).to(x) end = min(end, input_shape[axes]) index = torch.arange(start, end, device=x.device) # Make index.ndim == x.ndim index = index.view( *((1,) * (axes + 1)), end - start, *((1,) * (x.ndim - axes - 2))) # Make index.shape == x.shape index = index.repeat( *x.shape[:axes + 1], 1, *x.shape[axes + 2:] ) out.scatter_(axes + 1, index, x) return out class SliceGradOp(Function): """ Local gradient of BoundSlice. Not including multiplication with gradients from other layers. 
""" @staticmethod def symbolic(_, grad_last, input, start=None, end=None, axes=None, steps=1): return _.op( 'grad::Slice', grad_last, input, start_i=start, end_i=end, axes_i=axes, steps_i=steps ).setType(grad_last.type()) @staticmethod def forward(ctx, grad_last, input, start, end, axes, steps): return slice_grad(grad_last, input.shape, start, end, axes, steps) class SliceGrad(Module): def __init__(self, start, end, axes, steps): super().__init__() self.start = start self.end = end self.axes = axes self.steps = steps def forward(self, grad_last, input): return SliceGradOp.apply( grad_last, input, self.start, self.end, self.axes, self.steps) class BoundSliceGrad(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.start = attr['start'] self.end = attr['end'] self.axes = attr['axes'] self.steps = attr['steps'] self.use_default_ibp = True def forward(self, grad_last, input): return slice_grad(grad_last, input.shape, self.start, self.end, self.axes, self.steps) def bound_backward(self, last_lA, last_uA, *args, **kwargs): def _bound_oneside(last_A): if last_A is None: return None assert self.axes > 0 last_A_ = last_A.reshape(-1, *self.inputs[1].output_shape[self.axes:]) last_A_ = last_A_[:, self.start:self.end] last_A = last_A_.reshape( *last_A.shape[:self.axes+2], -1, *self.inputs[1].output_shape[self.axes+1:]) return last_A return [(_bound_oneside(last_lA), _bound_oneside(last_uA)), (None, None)], 0, 0 def concat_grad(x, axis, input_index, *inputs): cur = 0 for i in range(input_index): cur += inputs[i].shape[axis] x_ = x.reshape(-1, *x.shape[axis + 1:]) ret = x_[:, cur:cur+inputs[input_index].shape[axis]] ret = ret.reshape(*x.shape[:axis + 1], *ret.shape[1:]) return ret class ConcatGradOp(Function): @staticmethod def symbolic(_, grad_last, axis, input_index, *inputs): return _.op('grad::Concat', grad_last, *inputs, axis_i=axis, input_index_i=input_index).setType(grad_last.type()) @staticmethod def forward(ctx, grad_last, axis, input_index, *inputs): return concat_grad(grad_last, axis, input_index, *inputs) class ConcatGrad(Module): def __init__(self, axis, input_index): super().__init__() self.input_index = input_index self.axis = axis def forward(self, grad_last, *input): return ConcatGradOp.apply(grad_last, self.axis, self.input_index, *input) class BoundConcatGrad(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.axis = attr['axis'] self.input_index = attr['input_index'] self.use_default_ibp = True def forward(self, grad_last, *inputs): return concat_grad(grad_last, self.axis, self.input_index, *inputs) def bound_backward(self, last_lA, last_uA, *args, **kwargs): def _bound_oneside(last_A): if last_A is None: return None assert self.axis > 0 start = sum([self.inputs[i + 1].output_shape[self.axis] for i in range(self.input_index)]) end = start + self.output_shape[self.axis+1] shape_behind = self.inputs[0].output_shape[self.axis+1:] A = torch.zeros(*last_A.shape[:self.axis+2], *shape_behind, device=last_A.device) A = A.view(-1, *shape_behind) A[:, start:end] = last_lA.reshape(-1, *last_A.shape[self.axis+2:]) A = A.view(*last_A.shape[:self.axis+2], *shape_behind) return A return ([(_bound_oneside(last_lA), _bound_oneside(last_uA))] + [(None, None)] * (len(self.inputs) - 1)), 0, 0 ================================================ FILE: auto_LiRPA/operators/softmax.py ================================================ 
######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """ Softmax """ from .base import * class BoundSoftmaxImpl(nn.Module): def __init__(self, axis): super().__init__() self.axis = axis assert self.axis == int(self.axis) def forward(self, x): max_x = torch.max(x, dim=self.axis).values x = torch.exp(x - max_x.unsqueeze(self.axis)) s = torch.sum(x, dim=self.axis, keepdim=True) return x / s # The `option != 'complex'` case is not used in the auto_LiRPA main paper. class BoundSoftmax(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.axis = attr['axis'] self.option = options.get('softmax', 'complex') if self.option == 'complex': self.complex = True else: self.max_input = 30 def forward(self, x): assert self.axis == int(self.axis) if self.option == 'complex': self.input = (x,) self.model = BoundSoftmaxImpl(self.axis) self.model.device = self.device return self.model(x) else: return F.softmax(x, dim=self.axis) def interval_propagate(self, *v): assert self.option != 'complex' assert self.perturbed h_L, h_U = v[0] shift = h_U.max(dim=self.axis, keepdim=True).values exp_L, exp_U = torch.exp(h_L - shift), torch.exp(h_U - shift) lower = exp_L / (torch.sum(exp_U, dim=self.axis, keepdim=True) - exp_U + exp_L + epsilon) upper = exp_U / (torch.sum(exp_L, dim=self.axis, keepdim=True) - exp_L + exp_U + epsilon) return lower, upper ================================================ FILE: auto_LiRPA/operators/solver_utils.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### class DummyGurobipyClass: """A dummy class with error message when gurobi is not installed.""" def __getattr__(self, attr): def _f(*args, **kwargs): raise RuntimeError(f"method {attr} not available because gurobipy module was not built.") return _f try: import gurobipy as grb except ModuleNotFoundError: grb = DummyGurobipyClass() ================================================ FILE: auto_LiRPA/operators/tile.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """BoundTile""" from torch.nn import Module from .base import * class BoundTile(Bound): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.use_default_ibp = True def forward(self, x, repeats): return x.repeat(repeats.tolist()) def bound_backward(self, last_lA, last_uA, *x, **kwargs): assert not self.is_input_perturbed(1) repeats = x[1].value def _bound_oneside(A): if A is None: return None # block_shape: (specs, d1/r1, r1, d2/r2, r2, ..., dn/rn, rn) # Reshaping A to block_shape and sum along the "r" dimensions # is equivalent to summing up all block fragments of A. block_shape = [A.shape[0]] axes_to_sum = [] for i in range(len(repeats)): block_shape.append(A.size(i + 1) // repeats[i].item()) block_shape.append(repeats[i].item()) axes_to_sum.append(2 * i + 2) reshaped_A = A.reshape(*block_shape) next_A = reshaped_A.sum(dim=axes_to_sum) return next_A return [(_bound_oneside(last_lA), _bound_oneside(last_uA)), (None, None)], 0, 0 def bound_forward(self, dim_in, *x): assert (x[1].lb == x[1].ub).all(), "repeats should be constant." repeats = x[1].lb.tolist() assert repeats[0] == 1, "shouldn't repeat on the batch dimension." # lb and ub have the same shape as x, so we repeat then with "repeats" lb = x[0].lb.repeat(repeats) ub = x[0].ub.repeat(repeats) # lw and uw have shape (batch_size, input_dim, *shape_of_the_current_layer) # so we need to repeat them with "repeats" as well, but we need to # insert 1 at the second position to keep the input dimension unchanged. repeats.insert(1, 1) lw = x[0].lw.repeat(repeats) uw = x[0].uw.repeat(repeats) return LinearBound(lw, lb, uw, ub) ================================================ FILE: auto_LiRPA/operators/trigonometric.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. 
## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### from types import SimpleNamespace import torch from torch.autograd import Function from .activation_base import BoundActivation from .s_shaped import BoundSShaped class BoundSin(BoundSShaped): # Lookup tables shared by all BoundSin classes. xl_lower_tb = None xl_upper_tb = None xu_lower_tb = None xu_upper_tb = None func, d_func = torch.sin, torch.cos n_table_entries = 1001 def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.ibp_intermediate = True self.act_func = torch.sin self.d_act_func = torch.cos # Bound limits used by IBP. self.ibp_max_point = torch.pi / 2 self.ibp_min_point = torch.pi * 3 / 2 self.all_table_x = torch.linspace( 0, 2 * torch.pi, BoundSin.n_table_entries, device=self.device) self.precompute_relaxation(self.act_func, self.d_act_func, x_limit = torch.pi / 2) if BoundSin.xl_lower_tb is None: # Generate look-up tables. BoundSin.xl_lower_tb = BoundSin.get_lower_left_bound(self.all_table_x) BoundSin.xl_upper_tb = BoundSin.get_upper_left_bound(self.all_table_x) BoundSin.xu_lower_tb = BoundSin.get_lower_right_bound(self.all_table_x) BoundSin.xu_upper_tb = BoundSin.get_upper_right_bound(self.all_table_x) def d2_act_func(self, x): return -torch.sin(x) def _init_opt_parameters_impl(self, size_spec, name_start): """Implementation of init_opt_parameters for each start_node.""" l, u = self.inputs[0].lower, self.inputs[0].upper shape = [size_spec] + list(l.shape) alpha = torch.empty(12, *shape, device=l.device) alpha.data[:4] = ((l + u) / 2).unsqueeze(0).expand(4, *shape) alpha.data[4:6] = self.tp_both_lower_init[name_start].expand(2, *shape) alpha.data[6:8] = self.tp_both_upper_init[name_start].expand(2, *shape) alpha.data[8:10] = self.tp_lower_init[name_start].expand(2, *shape) alpha.data[10:12] = self.tp_upper_init[name_start].expand(2, *shape) return alpha def opt_init(self): super().opt_init() self.tp_both_lower_init = {} self.tp_both_upper_init = {} self.tp_lower_init = {} self.tp_upper_init = {} def branch_input_domain(self, lb, ub): # Map all input lower and upper bounds to the [0, 2*pi] interval. lb_clamped = lb - torch.floor(lb / (2 * torch.pi)) * (2 * torch.pi) ub_clamped = ub - torch.floor(ub / (2 * torch.pi)) * (2 * torch.pi) # Mask the mapped lower and upper bounds according to whether they are in [0, 0.5*pi), [0.5*pi, pi), # [pi, 1.5*pi), or [1.5*pi, 2*pi). 
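# --- Illustrative aside (not part of the library source) --------------------
# What the wrap just above does on a few made-up inputs: each bound is shifted
# by a whole multiple of 2*pi into [0, 2*pi), which leaves sin unchanged, and
# only then is it assigned to one of the four quarter-period masks below.
import torch
x = torch.tensor([-1.0, 4.0, 7.0])
wrapped = x - torch.floor(x / (2 * torch.pi)) * (2 * torch.pi)
print(wrapped)                               # ~[5.283, 4.000, 0.717]
print(torch.sin(x) - torch.sin(wrapped))     # ~0 everywhere
# -----------------------------------------------------------------------------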
mask_lb_1 = torch.logical_and(lb_clamped >= 0, lb_clamped < torch.pi / 2) mask_lb_2 = torch.logical_and(lb_clamped >= torch.pi / 2, lb_clamped < torch.pi) mask_lb_3 = torch.logical_and(lb_clamped >= torch.pi, lb_clamped < 3 * torch.pi / 2) mask_lb_4 = torch.logical_and(lb_clamped >= 3 * torch.pi / 2, lb_clamped < 2 * torch.pi) mask_ub_1 = torch.logical_and(ub_clamped >= 0, ub_clamped < torch.pi / 2) mask_ub_2 = torch.logical_and(ub_clamped >= torch.pi / 2, ub_clamped < torch.pi) mask_ub_3 = torch.logical_and(ub_clamped >= torch.pi, ub_clamped < 3 * torch.pi / 2) mask_ub_4 = torch.logical_and(ub_clamped >= 3 * torch.pi / 2, ub_clamped < 2 * torch.pi) self.sigmoid_like_mask = torch.logical_and( ub - lb <= torch.pi, torch.logical_or( torch.logical_and( torch.logical_or(mask_lb_2, mask_lb_3), torch.logical_or(mask_ub_2, mask_ub_3) ), torch.logical_and( torch.logical_or(mask_lb_1, mask_lb_4), torch.logical_or(mask_ub_1, mask_ub_4) ) ) ) self.branch_mask = torch.logical_not(self.sigmoid_like_mask) self.mask_neg = torch.logical_and(torch.logical_or(mask_lb_3, mask_lb_4), torch.logical_and(torch.logical_or(mask_ub_3, mask_ub_4), self.sigmoid_like_mask)) self.mask_pos = torch.logical_and(torch.logical_or(mask_lb_1, mask_lb_2), torch.logical_and(torch.logical_or(mask_ub_1, mask_ub_2), self.sigmoid_like_mask)) self.mask_both = torch.logical_xor(self.sigmoid_like_mask, torch.logical_or(self.mask_neg, self.mask_pos)) self.convex_concave = self.d2_act_func(lb) >= 0 def generate_d_lower_upper(self, lower, upper): # Indices of neurons with input upper bound >=0, whose optimal slope to lower bound the function was pre-computed. # Note that for neurons with also input lower bound >=0, they will be masked later. k_tensor = torch.floor(upper / (2 * torch.pi)) upper_clamped = upper - k_tensor * (2 * torch.pi) case1_mask = torch.logical_and(upper_clamped >= 0, upper_clamped <= torch.pi / 2) upper_clamped_new = upper_clamped.clamp(min=0, max=torch.pi / 2) index = torch.max( torch.zeros(upper.numel(), dtype=torch.long, device=upper.device), (upper_clamped_new / self.step_pre).to(torch.long).reshape(-1) ) + 1 # Lookup the lower bound slope from the pre-computed table. d_lower = (torch.index_select(self.d_lower, 0, index).view(lower.shape) + k_tensor * 2 * torch.pi) * case1_mask case2_mask = torch.logical_and(upper_clamped >= torch.pi, upper_clamped <= 3 * torch.pi / 2) upper_clamped_new = upper_clamped.clamp(min=torch.pi, max=3 * torch.pi / 2) index = torch.max( torch.zeros(upper.numel(), dtype=torch.long, device=upper.device), ((torch.pi - upper_clamped_new) / -self.step_pre).to(torch.long).reshape(-1) ) + 1 # Lookup the lower bound slope from the pre-computed table. d_upper = (torch.pi - torch.index_select(self.d_upper, 0, index).view(lower.shape) + k_tensor * 2 * torch.pi) * case2_mask # Indices of neurons with lower bound <=0, whose optimal slope to upper bound the function was pre-computed. 
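A roughly equivalent way to read the sigmoid_like_mask computed above: up to boundary cases, sin is monotone on [lb, ub] exactly when the interval is no wider than pi and contains no critical point pi/2 + k*pi in its interior, and only then can the S-shaped style relaxation be used. A hypothetical helper (not part of the library) expressing that check directly:

import torch

def sin_is_monotone(lb: torch.Tensor, ub: torch.Tensor) -> torch.Tensor:
    """Hypothetical check: True where sin is monotone on [lb, ub]."""
    # First critical point pi/2 + k*pi that is >= lb.
    k = torch.ceil((lb - torch.pi / 2) / torch.pi)
    first_crit = torch.pi / 2 + k * torch.pi
    return (ub - lb <= torch.pi) & ~((first_crit > lb) & (first_crit < ub))

lb = torch.tensor([0.1, 2.0, 4.0])
ub = torch.tensor([1.0, 3.0, 5.5])
print(sin_is_monotone(lb, ub))   # tensor([ True,  True, False])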
k_tensor = torch.floor(lower / (2 * torch.pi)) lower_clamped = lower - k_tensor * (2 * torch.pi) case3_mask = torch.logical_and(lower_clamped >= 3 * torch.pi / 2, lower_clamped <= 2 * torch.pi) lower_clamped_new = lower_clamped.clamp(min=(3 * torch.pi / 2), max=2 * torch.pi) index = torch.max( torch.zeros(lower.numel(), dtype=torch.long, device=lower.device), ((lower_clamped_new - 2 * torch.pi) / -self.step_pre).to(torch.long).reshape(-1) ) + 1 d_upper += (torch.index_select(self.d_upper, 0, index).view(upper.shape) + (k_tensor + 1) * 2 * torch.pi) * case3_mask case4_mask = torch.logical_and(lower_clamped >= torch.pi / 2, lower_clamped <= torch.pi) lower_clamped_new = lower_clamped.clamp(min=(torch.pi / 2), max=3 * torch.pi) index = torch.max( torch.zeros(lower.numel(), dtype=torch.long, device=lower.device), ((torch.pi - lower_clamped_new) / self.step_pre).to(torch.long).reshape(-1) ) + 1 d_lower += (torch.pi - torch.index_select(self.d_lower, 0, index).view(upper.shape) + k_tensor * 2 * torch.pi) * case4_mask return d_lower, d_upper @staticmethod def arcsin(c): """Arcsin with gradient fixes. arcsin(-1) and arcsin(1) have pathological gradients and should be avoided. """ if c.min() > -1 and c.max() < 1: return torch.arcsin(c) c_ = c.clone() mask_neg = c == -1 mask_pos = c == 1 c_[mask_neg] = 0 c_[mask_pos] = 0 ret = torch.arcsin(c_) ret[mask_neg] = -torch.pi / 2 ret[mask_pos] = torch.pi / 2 return ret @staticmethod def get_intersection(start, end, c, theta=0.): """Get the number of intersections between y = sin(x + theta) and y = c between start and end.""" # Use arcsine to find the first 2 intersections. crossing1 = BoundSin.arcsin(c) - theta crossing2 = torch.pi - crossing1 - 2 * theta # Problematic at exact 1/2 pi, but ok in our case (happens only when lb=ub). return BoundSin.n_crossing(start, end, crossing1) + BoundSin.n_crossing(start, end, crossing2) @staticmethod def n_crossing(start, end, s): """Check how many times we will encounter value s + k*2*pi within start and end for any integer k.""" cycles = torch.floor((end - start) / (2 * torch.pi)) # Number of 2pi cycles. # Move s and end to the same 2 * pi cycle as start. dist = torch.floor((s - start) / (2 * torch.pi)) real_s = s - dist * 2 * torch.pi real_end = end - cycles * 2 * torch.pi return (real_s >= start).to(s) * (real_s <= real_end).to(s) + cycles @staticmethod def check_bound(tangent_point, x): """Check whether the tangent line at tangent_point is a valid lower/upper bound for x.""" # evaluate the value of the tangent line at x and see it is >= 0 or <=0. d = BoundSin.d_func(tangent_point) val = d * (x - tangent_point) + BoundSin.func(tangent_point) # We want a positive margin when finding a lower line, but as close to 0 as possible. # We want a negative margin when finding a upper line, but as close to 0 as possible. margin = BoundSin.func(x) - val return margin @staticmethod @torch.no_grad() def get_lower_left_bound(xl, steps=20): """Get a global lower bound given lower bound on x. Return slope and intercept.""" dtype = xl.dtype # Constrain xl into the -0.5 pi to 1.5 pi region. cycles = torch.floor((xl + 0.5 * torch.pi) / (2 * torch.pi)) * (2 * torch.pi) xl = xl - cycles use_tangent_line = (xl >= torch.pi).to(dtype) # Case 1: xl > pi, Lower tangent line is the only possible lower bound. # Case 2: Binary search needed. Testing from another tangent endpoint in [pi, 1.5*pi]. It must be in this region. 
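The counting in n_crossing above can also be written down directly: the number of points congruent to s modulo 2*pi inside [start, end] is floor((end - s) / (2*pi)) - ceil((start - s) / (2*pi)) + 1, clamped at zero. A brute-force reference formulation (illustrative only, not the library's implementation):

import torch

def n_crossing_ref(start, end, s):
    # Count integers k with start <= s + 2*pi*k <= end.
    k_min = torch.ceil((start - s) / (2 * torch.pi))
    k_max = torch.floor((end - s) / (2 * torch.pi))
    return torch.clamp(k_max - k_min + 1, min=0)

start, end, s = torch.tensor(1.0), torch.tensor(14.0), torch.tensor(0.5)
# Points 0.5 + 2*pi*k inside [1, 14]: approximately 6.78 and 13.07, so 2 crossings.
print(n_crossing_ref(start, end, s))   # tensor(2.)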
left = torch.pi * torch.ones_like(xl) # The right end guarantees the margin > 0 because it is basically a IBP lower bound (-1). right = (1.5 * torch.pi) * torch.ones_like(xl) last_right = right.clone() for _ in range(steps): mid = (left + right) / 2. margin = BoundSin.check_bound(mid, xl) pos_mask = (margin > 0).to(dtype) # We want to margin > 0 but at small as possible. neg_mask = 1.0 - pos_mask right = mid * pos_mask + right * neg_mask # We have positive margin, reduce right hand side. last_right = mid * pos_mask + last_right * neg_mask # Always sound, since the margin is positive. left = mid * neg_mask + left * pos_mask d = xl * use_tangent_line + last_right * (1. - use_tangent_line) # Return slope and bias. return [d, cycles] @staticmethod @torch.no_grad() def get_upper_left_bound(xl, steps=20): """Get a global upper bound given lower bound on x. Return slope and intercept.""" dtype = xl.dtype # Constrain xl into the -0.5 pi to 1.5 pi region. cycles = torch.floor((xl - 0.5 * torch.pi) / (2 * torch.pi)) * (2 * torch.pi) xl = xl - cycles use_tangent_line = (xl >= 2.0 * torch.pi).to(dtype) # Case 1: xl > pi, Lower tangent line is the only possible lower bound. # Case 2: Binary search needed. Testing from another tangent endpoint in [pi, 1.5*pi]. It must be in this region. left = (2.0 * torch.pi) * torch.ones_like(xl) # The right end guarantees the margin > 0 because it is basically a IBP lower bound (-1). right = (2.5 * torch.pi) * torch.ones_like(xl) last_right = right.clone() for _ in range(steps): mid = (left + right) / 2. margin = BoundSin.check_bound(mid, xl) pos_mask = (margin > 0).to(dtype) # We want to margin < 0 but at small as possible. neg_mask = 1.0 - pos_mask right = mid * neg_mask + right * pos_mask # We have positive margin, reduce right hand side. last_right = mid * neg_mask + last_right * pos_mask # Always sound, since the margin is positive. left = mid * pos_mask + left * neg_mask d = xl * use_tangent_line + last_right * (1. - use_tangent_line) # Return slope and bias. return [d, cycles] @staticmethod @torch.no_grad() def get_lower_right_bound(xu, steps=20): """Get a global lower bound given upper bound on x. Return slope and intercept.""" # Constrain xu into the -0.5 pi to 1.5 pi region. cycles = torch.floor((xu + 0.5 * torch.pi) / (2 * torch.pi)) * (2 * torch.pi) xu = xu - cycles d, _ = BoundSin.get_lower_left_bound(torch.pi - xu, steps) return [3 * torch.pi - d, cycles - 2 * torch.pi] @staticmethod @torch.no_grad() def get_upper_right_bound(xu, steps=20): """Get a global upper bound given upper bound on x. Return slope and intercept.""" # Constrain xu into the 0.5 pi to 2.5 pi region. cycles = torch.floor((xu - 0.5 * torch.pi) / (2 * torch.pi)) * (2 * torch.pi) xu = xu - cycles d, _ = BoundSin.get_upper_left_bound(3 * torch.pi - xu, steps) return [5 * torch.pi - d, cycles - 2 * torch.pi] def get_bound_tb(self, lb, ub): """Find lower or upper bounds from lookup table.""" lower, upper = lb, ub step = 2 * torch.pi / (BoundSin.n_table_entries - 1) # Move to 0 to 2 pi region. lb_cycles = torch.floor(lb / (2 * torch.pi)) * (2 * torch.pi) lb = torch.clamp(lb - lb_cycles, min=0, max=2 * torch.pi) ub_cycles = torch.floor(ub / (2 * torch.pi)) * (2 * torch.pi) ub = torch.clamp(ub - ub_cycles, min=0, max=2 * torch.pi) # Find the indice within the lookup table from 0 - 2pi. 
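The binary search above can be re-derived in a few lines. A simplified sketch (not the library code, and skipping the cycle-shift bookkeeping): for a lower limit xl below pi, search for the smallest tangent point t in [pi, 1.5*pi] whose tangent line still passes below (xl, sin(xl)); by convexity of sin on [pi, 2*pi], that line is then a sound lower bound for every x >= xl.

import torch

xl = torch.tensor(2.0)                 # lower limit on x, already inside [-0.5*pi, 1.5*pi]
left = torch.tensor(torch.pi)          # search interval for the tangent point
right = torch.tensor(1.5 * torch.pi)   # tangent at 1.5*pi is the flat line y = -1, always sound
t = right.clone()
for _ in range(30):
    mid = (left + right) / 2
    # Margin of the tangent line at `mid`, evaluated at xl; >= 0 means sound at xl.
    margin = torch.sin(xl) - (torch.sin(mid) + torch.cos(mid) * (xl - mid))
    if margin > 0:
        t, right = mid, mid            # sound: remember it and try a tighter tangent point
    else:
        left = mid                     # unsound: move away from pi
xs = torch.linspace(float(xl), float(xl) + 4 * torch.pi, 2001)
tangent_line = torch.sin(t) + torch.cos(t) * (xs - t)
assert (torch.sin(xs) - tangent_line >= -1e-5).all()   # sound on the sampled grid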
indices_lb = lb.div(step).long() indices_ub = ub.div(step).long() tangent_left_lower = BoundSin.xl_lower_tb[0][indices_lb] tangent_left_upper = BoundSin.xl_upper_tb[0][indices_lb] tangent_right_lower = BoundSin.xu_lower_tb[0][indices_ub] tangent_right_upper = BoundSin.xu_upper_tb[0][indices_ub] if self.opt_stage in ['opt', 'reuse']: if not hasattr(self, 'alpha'): # Raise an error if alpha is not created. self._no_bound_parameters() ns = self._start self.alpha[ns].data[8:10, :] = torch.min( torch.max(self.alpha[ns][8:10, :], tangent_left_lower), tangent_right_lower) self.alpha[ns].data[10:12, :] = torch.min( torch.max(self.alpha[ns][10:12, :], tangent_left_upper), tangent_right_upper) tangent_lower = self.alpha[ns][8:10, :] tangent_upper = self.alpha[ns][10:12, :] else: # add cycles to optimizable tangent region unfolded_left_lower = (tangent_left_lower + BoundSin.xl_lower_tb[1][indices_lb] + lb_cycles) left_lower_ends = 1.5*torch.pi + BoundSin.xl_lower_tb[1][indices_lb] + lb_cycles unfolded_right_lower = (tangent_right_lower + BoundSin.xu_lower_tb[1][indices_ub] + ub_cycles) right_lower_ends = 1.5*torch.pi + BoundSin.xu_lower_tb[1][indices_ub] + ub_cycles mid = (lower + upper) / 2 leftmost_mask = torch.logical_and(mid < unfolded_left_lower, unfolded_left_lower <= upper) left_range_mask = torch.logical_and(mid >= unfolded_left_lower, mid < left_lower_ends) inbetween_mask = torch.logical_and(mid >= left_lower_ends, mid < right_lower_ends) rightmost_mask = torch.logical_and(mid >= unfolded_right_lower, unfolded_right_lower >= lower) right_range_mask = torch.logical_and(~left_range_mask, torch.logical_and(mid >= right_lower_ends, mid < unfolded_right_lower)) tangent_lower = (leftmost_mask * tangent_left_lower + left_range_mask * (mid - BoundSin.xl_lower_tb[1][indices_lb] - lb_cycles) + inbetween_mask * 1.5*torch.pi + rightmost_mask * tangent_right_lower + right_range_mask * (mid - BoundSin.xu_lower_tb[1][indices_ub] - ub_cycles)) unfolded_left_upper = (tangent_left_upper + BoundSin.xl_upper_tb[1][indices_lb] + lb_cycles) left_upper_ends = 2.5*torch.pi + BoundSin.xl_upper_tb[1][indices_lb] + lb_cycles unfolded_right_upper = (tangent_right_upper + BoundSin.xu_upper_tb[1][indices_ub] + ub_cycles) right_upper_ends = 2.5*torch.pi + BoundSin.xu_upper_tb[1][indices_ub] + ub_cycles mid = (lower + upper) / 2 leftmost_mask = torch.logical_and(mid < unfolded_left_upper, unfolded_left_upper <= upper) left_range_mask = torch.logical_and(mid >= unfolded_left_upper, mid < left_upper_ends) inbetween_mask = torch.logical_and(mid >= left_upper_ends, mid < right_upper_ends) rightmost_mask = torch.logical_and(mid >= unfolded_right_upper, unfolded_right_upper >= lower) right_range_mask = torch.logical_and(~left_range_mask, torch.logical_and(mid >= right_upper_ends, mid < unfolded_right_upper)) tangent_upper = (leftmost_mask * tangent_left_upper + left_range_mask * (mid - BoundSin.xl_upper_tb[1][indices_lb] - lb_cycles) + inbetween_mask * 2.5*torch.pi + rightmost_mask * tangent_right_upper + right_range_mask * (mid - BoundSin.xu_upper_tb[1][indices_ub] - ub_cycles)) if self.opt_stage == 'init': ns = self._start self.tp_lower_init[ns] = tangent_lower.detach() self.tp_upper_init[ns] = tangent_upper.detach() d_lower = BoundSin.d_func(tangent_lower) b_lower = BoundSin.func(tangent_lower) - d_lower * (tangent_lower + torch.where(tangent_lower <= 1.5*torch.pi, BoundSin.xl_lower_tb[1][indices_lb] + lb_cycles, BoundSin.xu_lower_tb[1][indices_ub] + ub_cycles)) d_upper = BoundSin.d_func(tangent_upper) b_upper = 
BoundSin.func(tangent_upper) - d_upper * (tangent_upper + torch.where(tangent_upper <= 2.5*torch.pi, BoundSin.xl_upper_tb[1][indices_lb] + lb_cycles, BoundSin.xu_upper_tb[1][indices_ub] + ub_cycles)) return d_lower, b_lower, d_upper, b_upper def forward(self, x): return torch.sin(x) def interval_propagate(self, *v): # Check if a point is in [l, u], considering the 2pi period def check_crossing(ll, uu, point): return ((((uu - point) / (2 * torch.pi)).floor() - ((ll - point) / (2 * torch.pi)).floor()) > 0).to(h_Ls.dtype) h_L, h_U = v[0][0], v[0][1] h_Ls, h_Us = self.forward(h_L), self.forward(h_U) # If crossing pi/2, then max is fixed 1.0 max_mask = check_crossing(h_L, h_U, self.ibp_max_point) # If crossing pi*3/2, then min is fixed -1.0 min_mask = check_crossing(h_L, h_U, self.ibp_min_point) ub = torch.max(h_Ls, h_Us) ub = max_mask + (1 - max_mask) * ub lb = torch.min(h_Ls, h_Us) lb = - min_mask + (1 - min_mask) * lb return lb, ub def bound_relax_branch(self, lb, ub): dtype = lb.dtype ub = torch.max(ub, lb + 1e-8) # Case 1: Connect the two points as a line sub = self.func(ub) slb = self.func(lb) mid = (sub + slb) / 2. smid = self.func((ub + lb) / 2) gap = smid - mid case1_line_slope = (sub - slb) / (ub - lb).clamp(min=1e-10) case1_line_bias = slb - case1_line_slope * lb # Check if there are crossings between the line and the sin function. grad_crossings = self.get_intersection(lb, ub, case1_line_slope, theta=0.5 * torch.pi) # If there is no crossing, then we can connect the two points together as a lower/upper bound. use_line = grad_crossings == 1 # Connected line is the upper bound. upper_use_line = torch.logical_and(gap < 0, use_line) # Connected line is the lower bound. lower_use_line = torch.logical_and(gap >= 0, use_line) # Case 2: we will try the global lower/upper bounds at lb and ub. # For the points and lb and ub, we can construct both lower and upper bounds. (case_2_lower_slope, case_2_lower_bias, case_2_upper_slope, case_2_upper_bias) = self.get_bound_tb(lb, ub) # Finally, choose between case 1 and case 2. lower_use_line = lower_use_line.to(dtype) not_lower_use_line = 1. - lower_use_line upper_use_line = upper_use_line.to(dtype) not_upper_use_line = 1. - upper_use_line lower_slope = lower_use_line * case1_line_slope + not_lower_use_line * case_2_lower_slope lower_bias = lower_use_line * case1_line_bias + not_lower_use_line * case_2_lower_bias upper_slope = upper_use_line * case1_line_slope + not_upper_use_line * case_2_upper_slope upper_bias = upper_use_line * case1_line_bias + not_upper_use_line * case_2_upper_bias return lower_slope, lower_bias, upper_slope, upper_bias class BoundCos(BoundSin): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.ibp_max_point = 0.0 self.ibp_min_point = torch.pi def forward(self, x): return torch.cos(x) def bound_relax(self, x, init=False, dim_opt=None): # Shift the input by half_pi, and shifting the linear bounds back. half_pi = 0.5 * torch.pi x_shifted = SimpleNamespace() x_shifted.lower = x.lower + half_pi x_shifted.upper = x.upper + half_pi super().bound_relax(x_shifted, init=init, dim_opt=dim_opt) self.lb = self.lb + self.lw * half_pi self.ub = self.ub + self.uw * half_pi class BoundSec(BoundActivation): def __init__(self, attr=None, inputs=None, output_index=0, options=None): super().__init__(attr, inputs, output_index, options) self.ibp_intermediate = True def forward(self, x): return 1. 
/ torch.cos(x) def bound_relax(self, x, init=False): assert x.lower.min() > -torch.pi / 2 assert x.upper.max() < torch.pi / 2 x_L = x.lower x_U = x.upper y_L = self.forward(x_L) y_U = self.forward(x_U) mask_close = x_U - x_L < 1e-8 upper_k = torch.where( mask_close, y_L * torch.tan(x_L), (y_U - y_L) / (x_U - x_L).clamp(min=1e-8) ) self.uw = upper_k self.ub = -upper_k * x_L + y_L mid = (x_L + x_U) / 2 y_mid = self.forward(mid) lower_k = y_mid * torch.tan(mid) self.lw = lower_k self.lb = -lower_k * mid + y_mid def interval_propagate(self, *v): h_L, h_U = v[0][0], v[0][1] assert h_L.min() > -torch.pi / 2 assert h_U.max() < torch.pi / 2 y_L = self.forward(h_L) y_U = self.forward(h_U) lower = (h_U < 0) * (y_U - 1) + (h_L > 0) * (y_L - 1) + 1 upper = torch.max(y_L, y_U) return lower, upper class SinGradOp(Function): @staticmethod def symbolic(_, x): return _.op('grad::Sin', x) @staticmethod def forward(ctx, input): return torch.cos(input) class CosGradOp(Function): @staticmethod def symbolic(_, x): return _.op('grad::Cos', x) @staticmethod def forward(ctx, input): return -torch.sin(input) class TanhGradOp(Function): @staticmethod def symbolic(_, x): return _.op('grad::Tanh', x) @staticmethod def forward(ctx, input): return 1 - torch.tanh(input)**2 ================================================ FILE: auto_LiRPA/opt_pruner.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """Pruning during the optimization.""" import time import torch class OptPruner: def __init__(self, x, threshold, multi_spec_keep_func, loss_reduction_func, decision_thresh, fix_interm_bounds, epsilon_over_decision_thresh): self.x = x self.threshold = threshold self.multi_spec_keep_func = multi_spec_keep_func self.loss_reduction_func = loss_reduction_func self.decision_thresh = decision_thresh self.fix_interm_bounds = fix_interm_bounds self.epsilon_over_decision_thresh = epsilon_over_decision_thresh # For computing the positive domain ratio self.original_size = x[0].shape[0] self.pruning_in_iteration = False self.preserve_mask = None self.preserve_mask_next = None self.time = 0 # For holding full-sized alphas self.cached_alphas = {} def prune(self, x, C, ret_l, ret_u, ret, full_l, full_ret_l, full_ret_u, full_ret, interm_bounds, aux_reference_bounds, reference_bounds, stop_criterion_func, bound_lower): # positive domains may already be filtered out, so we use all domains - # negative domains to compute # FIXME Only using ret_l but not ret_u. 
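sec(x) = 1/cos(x) is convex on (-pi/2, pi/2), which is why the relaxation above uses the chord through the endpoints as the upper bound and the tangent at the midpoint (with slope sec(m)*tan(m)) as the lower bound. A small numeric sanity check of that reasoning (values are illustrative):

import torch

def sec(t):
    return 1.0 / torch.cos(t)

x_L, x_U = torch.tensor(-0.8), torch.tensor(1.2)
xs = torch.linspace(float(x_L), float(x_U), 1001)

# Upper bound: the chord through the two endpoints.
upper_k = (sec(x_U) - sec(x_L)) / (x_U - x_L)
upper = upper_k * (xs - x_L) + sec(x_L)

# Lower bound: the tangent at the midpoint; d/dx sec(x) = sec(x) * tan(x).
mid = (x_L + x_U) / 2
lower_k = sec(mid) * torch.tan(mid)
lower = lower_k * (xs - mid) + sec(mid)

assert (upper - sec(xs) >= -1e-5).all() and (sec(xs) - lower >= -1e-5).all()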
if self.decision_thresh is not None and ret_l is not None: if (isinstance(self.decision_thresh, torch.Tensor) and self.decision_thresh.numel() > 1 and self.preserve_mask is not None): if self.decision_thresh.shape[-1] == 1: # single spec with pruned domains negative_domain = ( ret_l.view(-1) <= self.decision_thresh[self.preserve_mask].view(-1) ).sum() else: # multiple spec with pruned domains negative_domain = self.multi_spec_keep_func( ret_l <= self.decision_thresh[self.preserve_mask]).sum() else: if ret_l.shape[-1] == 1: # single spec negative_domain = ( ret_l.view(-1) <= self.decision_thresh.view(-1)).sum() else: # multiple spec negative_domain = self.multi_spec_keep_func( ret_l <= self.decision_thresh).sum() positive_domain_num = self.original_size - negative_domain else: positive_domain_num = -1 positive_domain_ratio = float( positive_domain_num) / float(self.original_size) # threshold is 10% by default self.next_iter_pruning_in_iteration = ( self.decision_thresh is not None and positive_domain_ratio > self.threshold) if self.pruning_in_iteration: stime = time.time() self.get_preserve_mask(ret_l) # prune C if C is not None and C.shape[0] == x[0].shape[0]: C = C[self.now_preserve_mask] # means C is also batch specific # prune x x, pre_prune_size = self._prune_x(x) # prune bounds ret_prune = self._prune_bounds_by_mask( ret_l, ret_u, ret, interm_bounds, aux_reference_bounds, reference_bounds, pre_prune_size) full_l, full_ret_l, full_ret_u, full_ret = ret_prune self.time += time.time() - stime stop_criterion = stop_criterion_func( full_ret_l) if bound_lower else stop_criterion_func(-full_ret_u) if (type(stop_criterion) != bool and stop_criterion.numel() > 1 and self.pruning_in_iteration): stop_criterion = stop_criterion[self.preserve_mask] return (x, C, full_l, full_ret_l, full_ret_u, full_ret, stop_criterion) def prune_idx(self, idx_mask, idx, x): if self.pruning_in_iteration: # local sparse index of preserved samples where # idx == true local_idx = idx_mask[self.preserve_mask].nonzero().view(-1) # idx is global sparse index of preserved samples where # idx == true new_idx = torch.zeros_like( idx_mask, dtype=torch.bool, device=x[0].device) new_idx[self.preserve_mask] = idx_mask[self.preserve_mask] idx = new_idx.nonzero().view(-1) reference_idx = local_idx else: reference_idx = idx return reference_idx, idx def next_iter(self): if self.pruning_in_iteration: self.preserve_mask = self.preserve_mask_next if (not self.pruning_in_iteration and self.next_iter_pruning_in_iteration): # init preserve_mask etc self.preserve_mask = torch.arange( 0, self.x[0].shape[0], device=self.x[0].device, dtype=torch.long) self.pruning_in_iteration = True def update_best(self, full_ret_l, full_ret_u, best_ret): if self.pruning_in_iteration: # overwrite pruned cells in best_ret by threshold + eps fin_l, fin_u = best_ret if fin_l is not None: new_fin_l = full_ret_l new_fin_l[self.preserve_mask] = fin_l[self.preserve_mask] fin_l = new_fin_l if fin_u is not None: new_fin_u = full_ret_u new_fin_u[self.preserve_mask] = fin_u[self.preserve_mask] fin_u = new_fin_u best_ret = (fin_l, fin_u) return best_ret def update_ratio(self, full_l, full_ret_l): if self.decision_thresh is not None and full_l.numel() > 0: stime = time.time() with torch.no_grad(): if isinstance(self.decision_thresh, torch.Tensor): if self.decision_thresh.shape[-1] == 1: neg_domain_num = torch.sum( full_ret_l.view(-1) <= self.decision_thresh.view(-1) ).item() else: neg_domain_num = torch.sum(self.multi_spec_keep_func( full_ret_l <= 
self.decision_thresh)).item() else: if full_l.shape[-1] == 1: neg_domain_num = torch.sum( full_ret_l.view(-1) <= self.decision_thresh).item() else: neg_domain_num = torch.sum(self.multi_spec_keep_func( full_ret_l <= self.decision_thresh)).item() now_pruning_ratio = ( 1.0 - float(neg_domain_num) / float(full_l.shape[0])) print('pruning_in_iteration open status:', self.pruning_in_iteration) print('ratio of positive domain =', full_l.shape[0] - neg_domain_num, '/', full_l.numel(), '=', now_pruning_ratio) self.time += time.time() - stime print('pruning-in-iteration extra time:', self.time) @torch.no_grad() def _prune_x(self, x): """ Prune x by given now_preserve_mask. """ x = list(x) pre_prune_size = x[0].shape[0] x[0].data = x[0][self.now_preserve_mask].data if hasattr(x[0], 'ptb'): if x[0].ptb.x_L is not None: x[0].ptb.x_L = x[0].ptb.x_L[self.now_preserve_mask] if x[0].ptb.x_U is not None: x[0].ptb.x_U = x[0].ptb.x_U[self.now_preserve_mask] x = tuple(x) return x, pre_prune_size def _prune_dict_of_lists(self, dict_of_lists, pre_prune_size): if dict_of_lists is not None: for k, v in dict_of_lists.items(): v_l, v_r = v[0], v[1] if v_l.shape[0] == pre_prune_size: # the first dim is batch size and matches the preserve mask v_l = v_l[self.now_preserve_mask] if v_r.shape[0] == pre_prune_size: # the first dim is batch size and matches the preserve mask v_r = v_r[self.now_preserve_mask] dict_of_lists[k] = [v_l, v_r] @torch.no_grad() def _prune_bounds_by_mask(self, ret_l, ret_u, ret, interm_bounds, aux_reference_bounds, reference_bounds, pre_prune_size): """ Prune bounds by given now_preserve_mask. """ full_ret_l, full_l = self._recover_bounds_to_full_batch(ret_l) full_ret_u, full_u = self._recover_bounds_to_full_batch(ret_u) full_ret = (full_ret_l, full_ret_u) + ret[2:] if self.fix_interm_bounds: interval_to_prune = interm_bounds else: interval_to_prune = None self._prune_dict_of_lists(interval_to_prune, pre_prune_size) self._prune_dict_of_lists(aux_reference_bounds, pre_prune_size) self._prune_dict_of_lists(reference_bounds, pre_prune_size) # update the global mask here for possible next iteration self.preserve_mask_next = self.preserve_mask[self.now_preserve_mask] return full_l, full_ret_l, full_ret_u, full_ret @torch.no_grad() def get_preserve_mask(self, ret_l): """ Get preserve mask by decision_thresh to filter out the satisfied bounds. """ if (isinstance(self.decision_thresh, torch.Tensor) and self.decision_thresh.numel() > 1): if self.decision_thresh.shape[-1] == 1: self.now_preserve_mask = ( ret_l <= self.decision_thresh[self.preserve_mask] ).view(-1).nonzero().view(-1) else: self.now_preserve_mask = self.multi_spec_keep_func( ret_l <= self.decision_thresh[self.preserve_mask] ).nonzero().view(-1) else: if self.decision_thresh.shape[-1] == 1: self.now_preserve_mask = ( ret_l <= self.decision_thresh).view(-1).nonzero().view(-1) else: self.now_preserve_mask = self.multi_spec_keep_func( ret_l <= self.decision_thresh).nonzero().view(-1) def _recover_bounds_to_full_batch(self, ret): """ Recover lower and upper bounds to full batch size so that later we can directly update using the full batch size of l and u. 
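A toy illustration of the preserve-mask logic above, with made-up numbers: sub-problems whose lower bound already exceeds the decision threshold are considered verified and dropped, and only the remaining ("negative") ones are kept for further optimization.

import torch

ret_l = torch.tensor([[-0.3], [0.2], [-0.1], [0.5]])   # lower bounds, one spec per sub-problem
decision_thresh = 0.0
# Keep only sub-problems that are not yet verified (lower bound <= threshold).
now_preserve_mask = (ret_l <= decision_thresh).view(-1).nonzero().view(-1)
print(now_preserve_mask)                # tensor([0, 2])
ret_l_pruned = ret_l[now_preserve_mask]
print(ret_l_pruned.shape)               # torch.Size([2, 1])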
""" if ret is not None: if (isinstance(self.decision_thresh, torch.Tensor) and self.decision_thresh.numel() > 1): full_ret = ( self.decision_thresh.clone().to(ret.device).type(ret.dtype) + self.epsilon_over_decision_thresh) else: num_decision_thresh = self.decision_thresh if isinstance(num_decision_thresh, torch.Tensor): num_decision_thresh = num_decision_thresh.item() full_ret = torch.full( (self.original_size,) + tuple(ret.shape[1:]), fill_value=(num_decision_thresh + self.epsilon_over_decision_thresh), device=ret.device, dtype=ret.dtype) full_ret[self.preserve_mask] = ret if full_ret.shape[1] > 1: full_reduced_ret = self.loss_reduction_func(full_ret) else: full_reduced_ret = full_ret else: full_ret = full_reduced_ret = None return full_ret, full_reduced_ret def cache_full_sized_alpha(self, optimizable_activations: list): """ When preserve mask is in use, cache the full-sized alphas in self.cached_alphas, and rewrite the alphas in nodes according to the preserve mask. The full-sized alphas will be recovered back to nodes after compute_bounds, via the function named recover_full_sized_alphas() :param optimizable_activations: list of nodes that may have slope alphas as optimizable variables :return: None """ if self.pruning_in_iteration: for act in optimizable_activations: if act.name in self.cached_alphas: self.cached_alphas[act.name].clear() self.cached_alphas[act.name] = {} if act.alpha is not None: for start_node in act.alpha: # cached alphas and alphas stored in nodes should share the same memory space self.cached_alphas[act.name][start_node] = act.alpha[start_node] act.alpha[start_node] = act.alpha[start_node][:, :, self.preserve_mask] def recover_full_sized_alpha(self, optimizable_activations: list): """ After bound computation, recover the full-sized alphas back to nodes. :param optimizable_activations: ist of nodes that may have slope alphas as optimizable variables :return: None """ if self.pruning_in_iteration: for act in optimizable_activations: for start_node in self.cached_alphas[act.name]: act.alpha[start_node] = self.cached_alphas[act.name][start_node] def clean_full_sized_alpha_cache(self): for act_node in self.cached_alphas: self.cached_alphas[act_node].clear() self.cached_alphas.clear() ================================================ FILE: auto_LiRPA/optimize_graph.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### """Optimize the graph to merge nodes and remove unnecessary ones. Initial and experimental code only. 
""" from auto_LiRPA.bound_ops import * from auto_LiRPA.utils import logger import torch from typing import TYPE_CHECKING if TYPE_CHECKING: from .bound_general import BoundedModule def _optimize_graph(self: 'BoundedModule'): """Optimize the graph to remove some unnecessary nodes.""" merge_identical_act(self) convert_sqr(self) div_to_mul(self) merge_sec(self) minmax_to_relu(self) optimize_relu_relation(self) if self.bound_opts['optimize_graph']['optimizer'] is not None: # Use the custom graph optimizer self.bound_opts['optimize_graph']['optimizer'](self) for node in list(self.nodes()): if (not node.output_name and node.name != self.final_name and node.name not in self.root_names): self.delete_node(node) def _copy_node_properties(new, ref): new.output_shape = ref.output_shape new.device = ref.device new.attr['device'] = ref.attr['device'] new.batch_dim = ref.batch_dim new.from_complex_node = ref.from_complex_node def merge_sec(model: 'BoundedModule'): nodes = list(model.nodes()) for node in nodes: if type(node) == BoundReciprocal and type(node.inputs[0]) == BoundCos: node_new = BoundSec(inputs=[node.inputs[0].inputs[0]]) node_new.name = f'{node.inputs[0].name}/sec' _copy_node_properties(node_new, node) if node_new.name in model._modules: node_existing = model._modules[node_new.name] assert isinstance(node_existing, BoundSec) assert node_existing.inputs[0] == node.inputs[0].inputs[0] model.replace_node(node, node_existing) else: model.add_nodes([node_new]) model.replace_node(node, node_new) def div_to_mul(model: 'BoundedModule'): nodes = list(model.nodes()) for node in nodes: if type(node) == BoundDiv: logger.debug('Replacing BoundDiv node: %s', node) node_reciprocal = BoundReciprocal(inputs=[node.inputs[1]]) node_reciprocal.name = f'{node.name}/reciprocal' # Properties of the reciprocal node only depend on inputs[1], i.e. # the node of denominator. They can be different from those of # the original BoundDiv node, due to possible broadcasting and # perturbed/unperturbed switching in multiplication. _copy_node_properties(node_reciprocal, node.inputs[1]) model.add_nodes([node_reciprocal]) node_mul = BoundMul(inputs=[node.inputs[0], node_reciprocal], options=model.bound_opts) node_mul.name = f'{node.name}/mul' _copy_node_properties(node_mul, node) model.add_nodes([node_mul]) model.replace_node(node, node_mul) def convert_sqr(model: 'BoundedModule'): """Replace BoundMul or Bound Pow with BoundSqr if applicable. 1. If the two inputs nodes of a BoundMul node are the same, use BoundSqr. 2. Pow(x, 2) can be replaced with BoundSqr. 
""" nodes = list(model.nodes()) for node in nodes: replace = False if type(node) == BoundMul and node.inputs[0] == node.inputs[1]: replace = True elif type(node) == BoundPow: if ((isinstance(node.inputs[1], BoundBuffers) and node.inputs[1].buffer == 2) or (isinstance(node.inputs[1], BoundConstant) and node.inputs[1].value == 2)): replace = True if replace: node_new = BoundSqr(inputs=[node.inputs[0]]) node_new.name = f'{node.name}/sqr' _copy_node_properties(node_new, node) model.add_nodes([node_new]) logger.debug('Replaceing %s with %s', node, node_new) model.replace_node(node, node_new) def merge_identical_act(model: 'BoundedModule'): """Merge identical BoundActivation""" nodes = list(model.nodes()) merged = [False] * len(nodes) for i in range(len(nodes)): if (not merged[i] and isinstance(nodes[i], BoundActivation) and len(nodes[i].inputs) == 1): for j in range(i + 1, len(nodes)): if (not merged[j] and type(nodes[j]) == type(nodes[i]) and len(nodes[i].inputs) == 1): if nodes[i].inputs[0] == nodes[j].inputs[0]: logger.debug('Merging node %s to %s', nodes[j], nodes[i]) model.replace_node(nodes[j], nodes[i]) merged[j] = True def minmax_to_relu(model: 'BoundedModule'): """Replace BoundMinMax with BoundRelu if one of its inputs is constant""" nodes = list(model.nodes()) for node in nodes: if type(node) == BoundMax: for i, input_node in enumerate(node.inputs): if not input_node.perturbed: logger.debug('Replacing BoundMax node %s', node) # max(x, c) = ReLU(x - c) + c node_sub = BoundSub(inputs=[node.inputs[1-i], input_node], options=model.bound_opts) node_sub.name = f'{node.name}/sub' _copy_node_properties(node_sub, node) node_relu = BoundRelu(inputs=[node_sub], options=model.bound_opts) node_relu.name = f'{node.name}/relu' _copy_node_properties(node_relu, node) node_add = BoundAdd(inputs=[node_relu, input_node], options=model.bound_opts) node_add.name = f'{node.name}/add' _copy_node_properties(node_add, node) model.add_nodes([node_sub, node_relu, node_add]) model.replace_node(node, node_add) break elif type(node) == BoundMin: for i, input_node in enumerate(node.inputs): if not input_node.perturbed: logger.debug('Replacing BoundMin node %s', node) # min(x, c) = -ReLU(c - x) + c node_sub_1 = BoundSub(inputs=[input_node, node.inputs[1-i]], options=model.bound_opts) node_sub_1.name = f'{node.name}/sub/1' _copy_node_properties(node_sub_1, node) node_relu = BoundRelu(inputs=[node_sub_1], options=model.bound_opts) node_relu.name = f'{node.name}/relu' _copy_node_properties(node_relu, node) node_sub_2 = BoundSub(inputs=[input_node, node_relu], options=model.bound_opts) node_sub_2.name = f'{node.name}/sub/2' _copy_node_properties(node_sub_2, node) model.add_nodes([node_sub_1, node_relu, node_sub_2]) model.replace_node(node, node_sub_2) break def _pair_row(Ws, bs, Wm, j, atol=1e-8): """ Checks the relation ReLU(x) - ReLU(-x) = x. Return the index at the merge weight if the relation exists, otherwise return None. """ # Check whether this fits the pattern in docstring. if not (torch.allclose(Ws[j+1], -Ws[j], atol=atol) and abs(float(bs[j] + bs[j+1])) < atol): return None # Make merge weight 4D so Gemm and Conv share same indexing if Wm.dim() == 2: # Gemm path Wm4 = Wm.unsqueeze(-1).unsqueeze(-1) else: # Conv path Wm4 = Wm # Find corresponding columns of the merge weight # We check 1) The two nonzero element are in the same row # 2) The two entries are +1 and -1 # If the check pass, we return the row index, otherwise it # is not a valid pattern match and we return None. 
rows = torch.nonzero(Wm4[:, [j, j+1], 0, 0], as_tuple=False) if rows.size(0) != 2 or rows[0, 0] != rows[1, 0]: return None r = int(rows[0, 0]) ok = (abs(float(Wm4[r, j, 0, 0] - 1)) < atol and abs(float(Wm4[r, j+1, 0, 0] + 1)) < atol and torch.count_nonzero(Wm4[r]) == 2) return r if ok else None def optimize_relu_relation(model: 'BoundedModule'): """ This graph optimization detects the optimizable path with the identity ReLU(ReLU(x + b) - ReLU(-x - b)) = ReLU(x + b) for both linear and convolution layers. Replace the sequence of nodes with pattern Gemm -> ReLU -> Gemm -> ReLU or Conv -> ReLU -> Conv -> ReLU with a single Gemm -> ReLU or Conv -> ReLU. """ nodes = list(model.nodes()) i = 0 while i + 3 < len(nodes): A, B, C, D = nodes[i:i+4] # In Conv layers, we detect whether the optimization can be done # for pairs of channels. If so, the optimization eliminates one # Conv layer and recovers the original results with the identity # in the docstring. if (isinstance(A, BoundConv) and isinstance(B, BoundRelu) and isinstance(C, BoundConv) and isinstance(D, BoundRelu) and tuple(C.attr['kernel_shape'])==(1,1)): # Here we use forward() to extract weights, so that BoundParams/BoundConstant, or any other node # that could represent weights, is handled through a unified interface. Ws = C.inputs[1].forward() Wc = A.inputs[1].forward() # We only care about 2D conv if Ws.ndim != 4 or Wc.ndim != 4: i += 1 continue bs = C.inputs[2].forward() if C.has_bias else torch.zeros_like(Ws[:, 0, 0, 0]) bc = A.inputs[2].forward() if A.has_bias else torch.zeros_like(Wc[:, 0, 0, 0]) # Detect whether and where the identity is present in the weight matrix. pairs, skip = {}, set() for j in range(0, Wc.size(0) - 1): r = _pair_row(Wc, bc, Ws, j) if r is not None: pairs[j] = r skip.add(j + 1) if pairs: Cout, Cin, kH, kW = Ws.size(0), Wc.size(1), *Wc.shape[2:] W_new = torch.empty((Cout, Cin, kH, kW), dtype=Wc.dtype, device=Wc.device) b_new = torch.empty((Cout,), dtype=bc.dtype, device=bc.device) # Build fused weight and bias dst = 0 for src in range(Wc.size(0)): if src in skip: continue b_new[dst] = bs[pairs[src]] + bc[src] if src in pairs else bc[src] W_new[dst] = Wc[src] dst += 1 # Modify the graph using the newly built weights and bias weight_node = BoundParams('fused_weight', torch.nn.Parameter(W_new)) bias_node = BoundParams('fused_bias', torch.nn.Parameter(b_new)) weight_node.name = f'{A.name}/optimized/weight' bias_node.name = f'{A.name}/optimized/bias' fused = BoundConv( attr=A.attr.copy(), inputs=[A.inputs[0], weight_node, bias_node], output_index=A.output_index, options=model.bound_opts ) fused.name = f'{A.name}/optimized' _copy_node_properties(fused, A) relu = BoundRelu(inputs=[fused], options=model.bound_opts) relu.name = f'{A.name}/optimized/relu' _copy_node_properties(relu, D) model.add_nodes([weight_node, bias_node, fused, relu]) model.replace_node(D, relu) model.replace_node(A, fused) model.delete_node(B) model.delete_node(C) # Skip the full sequence once the pattern is detected i += 4 continue # In Linear layers, we detect whether the optimization can be # done for pairs of rows. The code structure is similar to the # one in the Conv branch.
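The fusion relies on the identity ReLU(ReLU(z) - ReLU(-z)) = ReLU(z), which holds because ReLU(z) - ReLU(-z) = z, so a 1x1 middle layer that merely recombines a (+1, -1) pair of channels can be folded away. A minimal numeric check (illustrative shapes):

import torch
import torch.nn.functional as F

z = torch.randn(4, 16)
assert torch.allclose(F.relu(z) - F.relu(-z), z)
assert torch.allclose(F.relu(F.relu(z) - F.relu(-z)), F.relu(z))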
elif (isinstance(A, BoundLinear) and isinstance(B, BoundRelu) and isinstance(C, BoundLinear) and isinstance(D, BoundRelu)): Ws = A.inputs[1].forward() Wm = C.inputs[1].forward() bs = A.inputs[2].forward() if len(A.inputs) == 3 else torch.zeros_like(Ws[:, 0]) bm = C.inputs[2].forward() if len(C.inputs) == 3 else torch.zeros_like(Wm[:, 0]) pairs, skip = {}, set() for j in range(0, Ws.size(0) - 1): r = _pair_row(Ws, bs, Wm, j) if r is not None: pairs[j] = r skip.add(j + 1) if pairs: n_out = Wm.shape[0] W_new = torch.empty((n_out, Ws.shape[1]), dtype=Ws.dtype, device=A.attr['device']) b_new = torch.empty((n_out,), dtype=bs.dtype, device=A.attr['device']) dst = 0 for src in range(Ws.size(0)): if src in skip: continue b_new[dst] = bm[pairs[src]] + bs[src] if src in pairs else bs[src] W_new[dst] = Ws[src] dst += 1 weight_node = BoundParams('fused_weight', torch.nn.Parameter(W_new), attr=dict(device=A.attr['device'])) bias_node = BoundParams('fused_bias', torch.nn.Parameter(b_new), attr=dict(device=A.attr['device'])) weight_node.name = f'{A.name}/optimized/weight' bias_node.name = f'{A.name}/optimized/bias' fused = BoundLinear( attr=A.attr.copy(), inputs=[A.inputs[0], weight_node, bias_node], output_index=A.output_index, options=model.bound_opts ) fused.name = f'{A.name}/optimized' _copy_node_properties(fused, A) relu = BoundRelu(inputs=[fused], options=model.bound_opts) relu.name = f'{A.name}/optimized/relu' _copy_node_properties(relu, D) model.add_nodes([weight_node, bias_node, fused, relu]) model.replace_node(D, relu) model.delete_node(A) model.delete_node(B) model.delete_node(C) i += 4 continue i += 1 ================================================ FILE: auto_LiRPA/optimized_bounds.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### import time import os from collections import OrderedDict from contextlib import ExitStack import torch from torch import optim, Tensor from .beta_crown import print_optimized_beta from .cuda_utils import double2float from .utils import reduction_sum, multi_spec_keep_func_all, clone_sub_A_dict from .opt_pruner import OptPruner from .perturbations import PerturbationLpNorm from typing import TYPE_CHECKING, Union, Tuple, Optional, Dict if TYPE_CHECKING: from .bound_general import BoundedModule default_optimize_bound_args = { 'enable_alpha_crown': True, # Enable optimization of alpha. 'enable_beta_crown': False, # Enable beta split constraint. 'apply_output_constraints_to': [], # Enable optimization w.r.t. output constraints. 'tighten_input_bounds': False, # Don't tighten input bounds # If output constraints are activated, use only bounds computed with them. 'best_of_oc_and_no_oc': False, 'directly_optimize': [], # No layer should be directly optimized 'oc_lr': 0.1, # learning rate for dualized output constraints 'share_gammas': False, 'iteration': 20, # Number of alpha/beta optimization iterations. 
# Share some alpha variables to save memory at the cost of slightly # looser bounds. 'use_shared_alpha': False, # Optimizer used for alpha and beta optimization. 'optimizer': 'adam', # Save best results of alpha/beta/bounds during optimization. 'keep_best': True, # Only optimize bounds of last layer during alpha/beta CROWN. 'fix_interm_bounds': True, # Learning rate for the optimizable parameter alpha in alpha-CROWN. 'lr_alpha': 0.5, # Learning rate for the optimizable parameter beta in beta-CROWN. 'lr_beta': 0.05, 'lr_cut_beta': 5e-3, # Learning rate for optimizing cut betas. # Initial alpha variables by calling CROWN once. 'init_alpha': True, 'lr_coeffs': 0.01, # Learning rate for coeffs for refinement # Layers to be refined, separated by commas. # -1 means preactivation before last activation. 'intermediate_refinement_layers': [-1], # When batch size is not 1, this reduction function is applied to # reduce the bounds into a scalar. 'loss_reduction_func': reduction_sum, # Criteria function of early stop. 'stop_criterion_func': lambda x: False, # Learning rate decay factor during bounds optimization. 'lr_decay': 0.98, # Number of iterations that we will start considering early stop # if tracking no improvement. 'early_stop_patience': 10, # Start to save optimized best bounds # when current_iteration > int(iteration*start_save_best) 'start_save_best': 0.5, # Use double fp (float64) at the last iteration in alpha/beta CROWN. 'use_float64_in_last_iteration': False, # Prune verified domain within iteration. 'pruning_in_iteration': False, # Percentage of the minimum domains that can apply pruning. 'pruning_in_iteration_threshold': 0.2, # For specification that will output multiple bounds for one # property, we use this function to prune them. 'multi_spec_keep_func': multi_spec_keep_func_all, # Use the newly fixed loss function. By default, it is set to False # for compatibility with existing use cases. # Try to ensure that the parameters always match with the optimized bounds. 'deterministic': False, 'max_time': 1e9, } def opt_reuse(self: 'BoundedModule'): for node in self.get_enabled_opt_act(): node.opt_reuse() def opt_no_reuse(self: 'BoundedModule'): for node in self.get_enabled_opt_act(): node.opt_no_reuse() def _set_alpha(optimizable_activations, parameters, alphas, lr): """Set best_alphas, alphas and parameters list.""" for node in optimizable_activations: alphas.extend(list(node.alpha.values())) node.opt_start() # Alpha has shape (2, output_shape, batch_dim, node_shape) parameters.append({'params': alphas, 'lr': lr, 'batch_dim': 2}) # best_alpha is a dictionary of dictionary. Each key is the alpha variable # for one activation layer, and each value is a dictionary contains all # activation layers after that layer as keys. best_alphas = OrderedDict() for m in optimizable_activations: best_alphas[m.name] = {} for alpha_m in m.alpha: best_alphas[m.name][alpha_m] = m.alpha[alpha_m].detach().clone() # We will directly replace the dictionary for each activation layer after # optimization, so the saved alpha might not have require_grad=True. 
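A typical way to exercise these options from user code, sketched after the public API used in examples/vision/simple_verification.py (the toy network, epsilon, and option values below are placeholders):

import torch
from auto_LiRPA import BoundedModule, BoundedTensor, PerturbationLpNorm

model = torch.nn.Sequential(torch.nn.Linear(2, 16), torch.nn.ReLU(), torch.nn.Linear(16, 1))
x = torch.zeros(1, 2)
lirpa_model = BoundedModule(model, torch.empty_like(x), bound_opts={
    # Only the keys being overridden are given; the rest typically fall back to the defaults above.
    'optimize_bound_args': {'iteration': 50, 'lr_alpha': 0.1, 'lr_decay': 0.98},
})
ptb = PerturbationLpNorm(norm=float('inf'), eps=0.1)
lb, ub = lirpa_model.compute_bounds(x=(BoundedTensor(x, ptb),), method='CROWN-Optimized')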
m.alpha[alpha_m].requires_grad_() return best_alphas def _set_gammas(nodes, parameters): """ Adds gammas to parameters list """ gammas = [] gamma_lr = 0.1 for node in nodes: if hasattr(node, 'gammas'): gammas.append(node.gammas_underlying_tensor) # The learning rate is the same for all layers gamma_lr = node.options['optimize_bound_args']['oc_lr'] parameters.append({'params': gammas, 'lr': gamma_lr}) def _save_ret_first_time(bounds, best_ret): """Save results at the first iteration to best_ret.""" if bounds is not None: best_ret.append(bounds.detach().clone()) else: best_ret.append(None) def _to_float64(self: 'BoundedModule', C, x, aux_reference_bounds, interm_bounds): """ Transfer variables to float64 only in the last iteration to help alleviate floating point error. """ self.to(torch.float64) C = C.to(torch.float64) x = self._to(x, torch.float64) # best_intermediate_bounds is linked to aux_reference_bounds! # we only need call .to() for one of them self._to(aux_reference_bounds, torch.float64, inplace=True) interm_bounds = self._to( interm_bounds, torch.float64) return C, x, interm_bounds def _to_default_dtype(self: 'BoundedModule', x, total_loss, full_ret, ret, best_intermediate_bounds, return_A): """ Switch back to default precision from float64 typically to adapt to afterwards operations. """ total_loss = total_loss.to(torch.get_default_dtype()) self.to(torch.get_default_dtype()) x[0].to(torch.get_default_dtype()) full_ret = list(full_ret) if isinstance(ret[0], torch.Tensor): # round down lower bound full_ret[0] = double2float(full_ret[0], 'down') if isinstance(ret[1], torch.Tensor): # round up upper bound full_ret[1] = double2float(full_ret[1], 'up') for _k, _v in best_intermediate_bounds.items(): _v[0] = double2float(_v[0], 'down') _v[1] = double2float(_v[1], 'up') best_intermediate_bounds[_k] = _v if return_A: full_ret[2] = self._to(full_ret[2], torch.get_default_dtype()) return total_loss, x, full_ret def _get_idx_mask(idx: int, full_ret_bound: Tensor, best_ret_bound: Tensor, loss_reduction_func ) -> Tuple[Tensor, Optional[Tensor]]: """ Get index for improved elements. :param idx: 0 := updating the lower bound, 1 := updating the upper bound :param full_ret_bound: Lower/upper bound results for this iteration :param best_ret_bound: The best lower/upper bound results seen thus far :param loss_reduction_func: Loss reduction function that reduces the losses to just the batch dimension. :return: idx_mask: A mask on the batch dimension where the mask is true if a sub-problem has seen loss improvement. improved_idx: A Tensor of the indices in the batch dimension that have seen loss improvement. """ assert idx in (0, 1), 'idx must be 0 (lower bound) or 1 (upper bound)' reduced_full = loss_reduction_func(full_ret_bound) reduced_best = loss_reduction_func(best_ret_bound) idx_mask = (reduced_full > reduced_best) if idx == 0 else (reduced_full < reduced_best) idx_mask = idx_mask.view(-1) improved_idx = idx_mask.nonzero(as_tuple=True)[0] if idx_mask.any() else None return idx_mask, improved_idx def _update_best_ret( full_ret: Dict[str, Dict[str, Dict[str, Union[Tensor, 'Patches', Tuple]]]], best_ret: Dict[str, Dict[str, Dict[str, Union[Tensor, 'Patches', Tuple]]]], loss_reduction_func, idx: int, deterministic: bool = False, best_out_in_A_dict: Optional[Dict[str, Union[Tensor, 'Patches', Tuple]]] = None, out_in_keys: Optional[Tuple[str, str]] = None ): """ Update best_ret_bound and best_ret by comparing with new results. :param full_ret: The full return from the 'compute_bounds' method. 
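The 'down'/'up' arguments matter for soundness: when a float64 lower bound is cast back to float32 it must never increase, and an upper bound must never decrease. A self-contained sketch of that idea using torch.nextafter (double2float itself is provided by cuda_utils and may be implemented differently):

import torch

def round_down_to_float32(x64: torch.Tensor) -> torch.Tensor:
    """Sketch: cast float64 -> float32 without ever increasing the value."""
    x32 = x64.to(torch.float32)
    # Where round-to-nearest went up, step one float32 ULP toward -inf.
    too_big = x32.to(torch.float64) > x64
    stepped_down = torch.nextafter(x32, torch.full_like(x32, -float('inf')))
    return torch.where(too_big, stepped_down, x32)

lb64 = torch.tensor([1.0000000001, -2.3456789012345], dtype=torch.float64)
lb32 = round_down_to_float32(lb64)
assert (lb32.to(torch.float64) <= lb64).all()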
:param best_ret: The best return during optimization in the same format as 'full_ret' :param loss_reduction_func: Loss reduction function that reduces the losses to just the batch dimension. :param idx: 0 := updating the lower bound, 1 := updating the upper bound :param deterministic: If true, problems that have seen loss improvement will have their bounds directly saved as the new best bound. Otherwise, the current bounds will be compared to the current best bounds and the comparison result is saved as the new best bound. In other words, deterministic is true if an improvement in the loss function is a sufficient condition for bound improvement. :param best_out_in_A_dict: If given, this is the A_dict entry corresponding to the output :param out_in_keys: If given, this is a tuple whose first element is the first index into the A_dict and whose second element is the second index into the A_dict. In particular, the first element should be the name of the output layer of the network, and the second element should be the name of the input layer. If these indices are not given correctly, an indexing error will be thrown. If given, it is assumed that we should use these keys to update lA/uA/lbias/ubias depending on if the bounds have improved. Therefore, we must assert that 'full_ret' and 'best_ret' contain an A_dict. :return: best_ret: best_out_in_A_dict: An updated A_dict entry corresponding to the output/input layer need_update: Set to True in this method if at least one sub-problem has seen bound improvement. idx_mask: A mask on the batch dimension where the mask is true if a sub-problem has seen loss improvement. improved_idx: A Tensor of the indices in the batch dimension that have seen loss improvement. """ assert idx in (0, 1), 'idx must be 0 (lower bound) or 1 (upper bound)' idx_mask, improved_idx = _get_idx_mask(idx, full_ret[idx], best_ret[idx], loss_reduction_func) if improved_idx is None: return best_ret, best_out_in_A_dict, False, idx_mask, None compare_fn = torch.max if idx == 0 else torch.min # Update detailed return tensors (if present) if full_ret[idx] is not None: if deterministic: best_ret[idx][improved_idx] = full_ret[idx][improved_idx] if out_in_keys is not None: _update_A_dict( best_out_in_A_dict, full_ret[2][out_in_keys[0]][out_in_keys[1]], improved_idx ) else: if out_in_keys is not None: # Since we must also update the A_dict, we don't want to use the original # 'compare' method as we need to know which specific problems have # seen improvement. cmp_op = (lambda x, y: (x > y)) if idx==0 else (lambda x, y: (x < y)) c_mask = cmp_op(full_ret[idx][improved_idx], best_ret[idx][improved_idx]) best_ret[idx][improved_idx] = torch.where( c_mask, full_ret[idx][improved_idx], best_ret[idx][improved_idx]) # Also update the lA/uA/lbias/ubias matrices/vectors from the output layer to # the input layer if the bounds have improved and if the output and input layer # keys were specified _update_A_dict( best_out_in_A_dict, full_ret[2][out_in_keys[0]][out_in_keys[1]], improved_idx, c_mask ) else: # Simple tensor-wise comparison (no A_dict) best_ret[idx][improved_idx] = compare_fn( full_ret[idx][improved_idx], best_ret[idx][improved_idx]) return best_ret, best_out_in_A_dict, True, idx_mask, improved_idx def _update_A_dict(best_A, full_A, improved_idx, c_mask: Optional[Tensor] = None): """ Update best_A dict by full_A for entries at improved_idx. :param best_A: The A_dict entry to be updated. :param full_A: The A_dict entry containing the new values. 
:param improved_idx: The indices in the batch dimension that have seen bound improvement. :param c_mask: A mask on the batch dimension where the mask is true if a sub-problem has seen bound improvement. If None, then the entire slice at improved_idx will be replaced. """ for key, val in full_A.items(): if val is None: # An entry for lA/uA/lbias/ubias may be None depending on if we are # lower or upper bounding the network continue target = best_A[key][improved_idx] source = val[improved_idx] if c_mask is not None: c_mask_expanded = c_mask.view( *c_mask.shape, *([1] * (val.dim() - c_mask.dim())) ).expand_as(val[improved_idx]) # Selectively update entries based on c_mask best_A[key][improved_idx] = torch.where(c_mask_expanded, source, target) else: # Replace the entire slice if no mask is provided best_A[key][improved_idx] = source def _update_optimizable_activations( optimizable_activations, interm_bounds, fix_interm_bounds, best_intermediate_bounds, reference_idx, idx, alpha, best_alphas, deterministic): """ Update bounds and alpha of optimizable_activations. """ for node in optimizable_activations: # Update best intermediate layer bounds only when they are optimized. # If they are already fixed in interm_bounds, then do # nothing. if node.name not in best_intermediate_bounds: continue if (interm_bounds is None or node.inputs[0].name not in interm_bounds or not fix_interm_bounds): if deterministic: best_intermediate_bounds[node.name][0][idx] = node.inputs[0].lower[reference_idx] best_intermediate_bounds[node.name][1][idx] = node.inputs[0].upper[reference_idx] else: best_intermediate_bounds[node.name][0][idx] = torch.max( best_intermediate_bounds[node.name][0][idx], node.inputs[0].lower[reference_idx]) best_intermediate_bounds[node.name][1][idx] = torch.min( best_intermediate_bounds[node.name][1][idx], node.inputs[0].upper[reference_idx]) if alpha: # Each alpha has shape (2, output_shape, batch, *shape) for act. # For other activation function this can be different. for alpha_m in node.alpha: best_alphas[node.name][alpha_m][:, :, idx] = node.alpha[alpha_m][:, :, idx] def update_best_beta(self: 'BoundedModule', enable_opt_interm_bounds, betas, best_betas, idx): """ Update best beta by given idx. """ if enable_opt_interm_bounds and betas: for node in self.splittable_activations: for node_input in node.inputs: for key in node_input.sparse_betas.keys(): best_betas[node_input.name][key] = ( node_input.sparse_betas[key].val.detach().clone()) if self.cut_used: for gbidx, general_betas in enumerate(self.cut_beta_params): # FIXME need to check if 'cut' is a node name best_betas['cut'][gbidx] = general_betas.detach().clone() else: for node in self.nodes_with_beta: best_betas[node.name][idx] = node.sparse_betas[0].val[idx] if self.cut_used: regular_beta_length = len(betas) - len(self.cut_beta_params) for cut_beta_idx in range(len(self.cut_beta_params)): # general cut beta crown general_betas best_betas['cut'][cut_beta_idx][:, :, idx, :] = betas[regular_beta_length + cut_beta_idx][:, :, idx, :] def _get_optimized_bounds( self: 'BoundedModule', x=None, aux=None, C=None, IBP=False, forward=False, method='backward', bound_side='lower', reuse_ibp=False, return_A=False, average_A=False, final_node_name=None, interm_bounds=None, reference_bounds=None, aux_reference_bounds=None, needed_A_dict=None, cutter=None, decision_thresh=None, epsilon_over_decision_thresh=1e-4): """ Optimize CROWN lower/upper bounds by alpha and/or beta. 
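The torch.max / torch.min merge above (in the non-deterministic branch) makes the stored intermediate bounds monotonically tighter across iterations. In miniature, with made-up tensors:

import torch

best_lb = torch.tensor([-1.0, -0.5, -2.0])
best_ub = torch.tensor([ 1.0,  0.5,  2.0])
new_lb  = torch.tensor([-0.8, -0.7, -1.5])   # some entries improve, some do not
new_ub  = torch.tensor([ 1.2,  0.4,  1.9])
best_lb = torch.max(best_lb, new_lb)         # -> [-0.8, -0.5, -1.5]
best_ub = torch.min(best_ub, new_ub)         # -> [ 1.0,  0.4,  1.9]
assert (best_lb <= best_ub).all()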
""" opts = self.bound_opts['optimize_bound_args'] iteration = opts['iteration'] max_time = opts['max_time'] beta = opts['enable_beta_crown'] alpha = opts['enable_alpha_crown'] apply_output_constraints_to = opts['apply_output_constraints_to'] opt_choice = opts['optimizer'] keep_best = opts['keep_best'] fix_interm_bounds = opts['fix_interm_bounds'] loss_reduction_func = opts['loss_reduction_func'] stop_criterion_func = opts['stop_criterion_func'] use_float64_in_last_iteration = opts['use_float64_in_last_iteration'] early_stop_patience = opts['early_stop_patience'] start_save_best = opts['start_save_best'] multi_spec_keep_func = opts['multi_spec_keep_func'] deterministic = opts['deterministic'] enable_opt_interm_bounds = self.bound_opts.get( 'enable_opt_interm_bounds', False) sparse_intermediate_bounds = self.bound_opts.get( 'sparse_intermediate_bounds', False) verbosity = self.bound_opts['verbosity'] if bound_side not in ['lower', 'upper']: raise ValueError(bound_side) bound_lower = bound_side == 'lower' bound_upper = bound_side == 'upper' assert alpha or beta, ( 'nothing to optimize, use compute bound instead!') if C is not None: self.final_shape = C.size()[:2] self.bound_opts.update({'final_shape': self.final_shape}) if opts['init_alpha']: # TODO: this should set up aux_reference_bounds. self.init_alpha(x, share_alphas=opts['use_shared_alpha'], method=method, c=C, final_node_name=final_node_name) optimizable_activations = self.get_enabled_opt_act() alphas, parameters = [], [] dense_coeffs_mask = [] if alpha: best_alphas = _set_alpha( optimizable_activations, parameters, alphas, opts['lr_alpha']) else: best_alphas = None if beta: ret_set_beta = self.set_beta( enable_opt_interm_bounds, parameters, opts['lr_beta'], opts['lr_cut_beta'], cutter, dense_coeffs_mask) betas, best_betas, coeffs, dense_coeffs_mask = ret_set_beta[:4] if apply_output_constraints_to is not None and len(apply_output_constraints_to) > 0: _set_gammas(self.nodes(), parameters) start = time.time() if isinstance(decision_thresh, torch.Tensor): if decision_thresh.dim() == 1: # add the spec dim to be aligned with compute_bounds return decision_thresh = decision_thresh.unsqueeze(-1) if opts['pruning_in_iteration']: if return_A: raise NotImplementedError( 'Pruning in iteration optimization does not support ' 'return A yet. ' 'Please fix or discard this optimization by setting ' '--disable_pruning_in_iteration ' 'or bab: pruning_in_iteration: false') pruner = OptPruner( x, threshold=opts['pruning_in_iteration_threshold'], multi_spec_keep_func=multi_spec_keep_func, loss_reduction_func=loss_reduction_func, decision_thresh=decision_thresh, epsilon_over_decision_thresh=epsilon_over_decision_thresh, fix_interm_bounds=fix_interm_bounds) else: pruner = None if opt_choice == 'adam-autolr': opt = AdamElementLR(parameters) elif opt_choice == 'adam': opt = optim.Adam(parameters) elif opt_choice == 'sgd': opt = optim.SGD(parameters, momentum=0.9) else: raise NotImplementedError(opt_choice) # Create a weight vector to scale learning rate. loss_weight = torch.ones(size=(x[0].size(0),), device=x[0].device) scheduler = optim.lr_scheduler.ExponentialLR(opt, opts['lr_decay']) # best_intermediate_bounds is linked to aux_reference_bounds! 
best_intermediate_bounds = {} if (sparse_intermediate_bounds and aux_reference_bounds is None and reference_bounds is not None): aux_reference_bounds = {} for name, (lb, ub) in reference_bounds.items(): aux_reference_bounds[name] = [ lb.detach().clone(), ub.detach().clone()] if aux_reference_bounds is None: aux_reference_bounds = {} if len(apply_output_constraints_to) > 0: # INVPROP requires that all layers have cached bounds. This may not be the case # unless we explicitly compute them. self.bound_opts['optimize_bound_args']['apply_output_constraints_to'] = [] with torch.no_grad(): self.compute_bounds( x=x, C=C, method='backward', bound_lower=bound_lower, bound_upper=bound_upper, final_node_name=final_node_name, interm_bounds=interm_bounds) self.bound_opts['optimize_bound_args']['apply_output_constraints_to'] = ( apply_output_constraints_to ) if (return_A and self.output_name[0] in needed_A_dict.keys() and self.input_name[0] in needed_A_dict[self.output_name[0]]): # If the A dict will be returned, and we expect to retrieve the hyperplanes relating the # output layer to the input layer, then we store these keys and pass them to the # '_update_best_ret' method so that these entries may be updated during the optimization # process. Only these output/input layer entries will be updated, and if other entries need # to be updated, '_update_best_ret' is not the correct method to update them. out_in_keys = (self.output_name[0], self.input_name[0]) else: out_in_keys = None need_grad = True patience = 0 ret_0 = None for i in range(iteration): if i == 0: # If we are at the first iteration, we need to # set the constraints_optimized to None self.constraints_optimized = None if cutter: # cuts may be optimized by cutter self.cut_module = cutter.cut_module if self.constraints_optimized is not None: for root in self.roots(): if ( hasattr(root, 'perturbation') and root.perturbation is not None # Currently constraints solving is designed for LpNorm. and isinstance(root.perturbation, PerturbationLpNorm) ): # Reset the constraints for this root. # TODO: Currently, the `reset` function simply overwrites, # should support more sophisticated reset logic. root.perturbation.reset_constraints( self.constraints_optimized, decision_thresh) intermediate_constr = None if not fix_interm_bounds: # If we still optimize all intermediate neurons, we can use # interm_bounds as reference bounds. if reference_bounds is None: reference_bounds = {} if interm_bounds is not None: reference_bounds.update(interm_bounds) interm_bounds = {} if i == iteration - 1: # No grad update needed for the last iteration need_grad = False if (self.device == 'cuda' and torch.get_default_dtype() == torch.float32 and use_float64_in_last_iteration): C, x, interm_bounds = self._to_float64( C, x, aux_reference_bounds, interm_bounds) if pruner: # we will use last update preserve mask in caller functions to recover # lA, l, u, etc to full batch size self.last_update_preserve_mask = pruner.preserve_mask pruner.cache_full_sized_alpha(optimizable_activations) # If input bounds are tightened with output constraints, they depend on the # relaxations of all other layers. The current iteration will recompute them. # This involves concretizing them, so they will depend on themselves. # To avoid a loop of gradients, remove gradients here. 
tighten_input_bounds = ( self.bound_opts['optimize_bound_args']['tighten_input_bounds'] ) if tighten_input_bounds: for root in self.roots(): if hasattr(root, 'perturbation') and root.perturbation is not None: root.perturbation.x_L = root.perturbation.x_L.detach() root.perturbation.x_U = root.perturbation.x_U.detach() with torch.no_grad() if not need_grad else ExitStack(): # ret is lb, ub or lb, ub, A_dict (if return_A is set to true) ret = self.compute_bounds( x, aux, C, method=method, IBP=IBP, forward=forward, bound_lower=bound_lower, bound_upper=bound_upper, reuse_ibp=reuse_ibp, return_A=return_A, final_node_name=final_node_name, average_A=average_A, # When intermediate bounds are recomputed, we must set it # to None interm_bounds=interm_bounds if fix_interm_bounds else None, # This is the currently tightest interval, which will be used to # pass split constraints when intermediate betas are used. reference_bounds=reference_bounds, # This is the interval used for checking for unstable neurons. aux_reference_bounds=aux_reference_bounds if sparse_intermediate_bounds else None, # These are intermediate layer beta variables and their # corresponding A matrices and biases. intermediate_constr=intermediate_constr, needed_A_dict=needed_A_dict, update_mask=pruner.preserve_mask if pruner else None, cache_bounds=len(apply_output_constraints_to) > 0, ) # If output constraints are used, it's possible that no inputs satisfy them. # If one of the layer that uses output constraints realizes this, it sets # self.infeasible_bounds = True for this element in the batch. if self.infeasible_bounds is not None and torch.any(self.infeasible_bounds): if ret[0] is not None: ret = ( torch.where( self.infeasible_bounds.unsqueeze(1), torch.full_like(ret[0], float('inf')), ret[0], ), ret[1], ) if ret[1] is not None: ret = ( ret[0], torch.where( self.infeasible_bounds.unsqueeze(1), torch.full_like(ret[1], float('-inf')), ret[1], ), ) ret_l, ret_u = ret[0], ret[1] if pruner: pruner.recover_full_sized_alpha(optimizable_activations) if (self.cut_used and i % cutter.log_interval == 0 and len(self.cut_beta_params) > 0): # betas[-1]: (2(0 lower, 1 upper), spec, batch, num_constrs) if ret_l is not None: print(i, 'lb beta sum:', f'{self.cut_beta_params[-1][0].sum() / ret_l.size(0)},', f'worst {ret_l.min()}') if ret_u is not None: print(i, 'lb beta sum:', f'{self.cut_beta_params[-1][1].sum() / ret_u.size(0)},', f'worst {ret_u.min()}') if i == 0: # save results at the first iteration best_ret = [ret.detach().clone() if ret is not None else None for ret in ret[:2]] ret_0 = ret[0].detach().clone() if bound_lower else ret[1].detach().clone() for node in optimizable_activations: if node.inputs[0].lower is None and node.inputs[0].upper is None: continue new_intermediate = [node.inputs[0].lower.detach().clone(), node.inputs[0].upper.detach().clone()] best_intermediate_bounds[node.name] = new_intermediate if sparse_intermediate_bounds: # Always using the best bounds so far as the reference # bounds. aux_reference_bounds[node.inputs[0].name] = new_intermediate if out_in_keys is not None: best_out_in_A_dict = clone_sub_A_dict(ret[2], out_in_keys) else: best_out_in_A_dict = None l = ret_l # Reduction over the spec dimension. 
if ret_l is not None and ret_l.shape[1] != 1: l = loss_reduction_func(ret_l) u = ret_u if ret_u is not None and ret_u.shape[1] != 1: u = loss_reduction_func(ret_u) # full_l, full_ret_l and full_u, full_ret_u is used for update the best full_ret_l, full_ret_u = ret_l, ret_u full_l = l full_ret = ret if pruner: (x, C, full_l, full_ret_l, full_ret_u, full_ret, stop_criterion) = pruner.prune( x, C, ret_l, ret_u, ret, full_l, full_ret_l, full_ret_u, full_ret, interm_bounds, aux_reference_bounds, reference_bounds, stop_criterion_func, bound_lower) else: stop_criterion = (stop_criterion_func(full_ret_l) if bound_lower else stop_criterion_func(-full_ret_u)) loss_ = l if bound_lower else -u total_loss = -1 * loss_ directly_optimize_layers = self.bound_opts['optimize_bound_args']['directly_optimize'] for directly_optimize_layer_name in directly_optimize_layers: total_loss += ( self[directly_optimize_layer_name].upper.sum() - self[directly_optimize_layer_name].lower.sum() ) if type(stop_criterion) == bool: loss = total_loss.sum() * (not stop_criterion) else: assert total_loss.shape == stop_criterion.shape loss = (total_loss * stop_criterion.logical_not()).sum() stop_criterion_final = isinstance( stop_criterion, torch.Tensor) and stop_criterion.all() if i == iteration - 1: best_ret = list(best_ret) if best_ret[0] is not None: best_ret[0] = best_ret[0].to(torch.get_default_dtype()) if best_ret[1] is not None: best_ret[1] = best_ret[1].to(torch.get_default_dtype()) if (i == iteration - 1 and self.device == 'cuda' and torch.get_default_dtype() == torch.float32 and use_float64_in_last_iteration): total_loss, x, full_ret = self._to_default_dtype( x, total_loss, full_ret, ret, best_intermediate_bounds, return_A) with torch.no_grad(): # for lb and ub, we update them in every iteration since updating them is cheap need_update = False improved_idx = None if keep_best: if best_ret[0] is not None: ( best_ret, best_out_in_A_dict, need_update, idx_mask, improved_idx, ) = _update_best_ret( full_ret, best_ret, loss_reduction_func, idx=0, deterministic=deterministic, best_out_in_A_dict=best_out_in_A_dict, out_in_keys=out_in_keys, ) if best_ret[1] is not None: ( best_ret, best_out_in_A_dict, need_update, idx_mask, improved_idx, ) = _update_best_ret( full_ret, best_ret, loss_reduction_func, idx=1, deterministic=deterministic, best_out_in_A_dict=best_out_in_A_dict, out_in_keys=out_in_keys, ) else: # Not saving the best, just keep the last iteration. if full_ret[0] is not None: best_ret[0] = full_ret[0] if full_ret[1] is not None: best_ret[1] = full_ret[1] if return_A: best_ret = [best_ret[0], best_ret[1], full_ret[2]] if out_in_keys is not None: # Update A_dict entry for output/input layer # This entry corresponds to the best bounds. # Other A_dict entries may not, as they are copied from the last iteration. best_ret[2][out_in_keys[0]][out_in_keys[1]] = best_out_in_A_dict patience = 0 if need_update else patience + 1 time_spent = time.time() - start # Save variables if this is the best iteration. 
# To save computational cost, we only check keep_best at the first # (in case divergence) and second half iterations # or before early stop by either stop_criterion or # early_stop_patience reached if ( i < 1 or i > int(iteration * start_save_best) or deterministic or stop_criterion_final or patience == early_stop_patience or time_spent > max_time ): # compare with the first iteration results and get improved indexes if bound_lower: if deterministic: idx_mask, idx = improved_idx, None else: idx_mask, idx = _get_idx_mask(0, full_ret_l, ret_0, loss_reduction_func) ret_0[idx] = full_ret_l[idx] else: if deterministic: idx_mask, idx = improved_idx, None else: idx_mask, idx = _get_idx_mask(1, full_ret_u, ret_0, loss_reduction_func) ret_0[idx] = full_ret_u[idx] if idx is not None: # for update propose, we condition the idx to update only # on domains preserved if pruner: reference_idx, idx = pruner.prune_idx(idx_mask, idx, x) else: reference_idx = idx _update_optimizable_activations( optimizable_activations, interm_bounds, fix_interm_bounds, best_intermediate_bounds, reference_idx, idx, alpha, best_alphas, deterministic) if beta: self.update_best_beta(enable_opt_interm_bounds, betas, best_betas, idx) if os.environ.get('AUTOLIRPA_DEBUG_OPT', False): print(f'****** iter [{i}]', f'loss: {loss.item()}, lr: {opt.param_groups[0]["lr"]}', (' pruning_in_iteration open status: ' f'{pruner.pruning_in_iteration}') if pruner else '') if stop_criterion_final: print(f'\nall verified at {i}th iter') break if patience > early_stop_patience: print(f'Early stop at {i}th iter due to {early_stop_patience}' ' iterations no improvement!') break if time_spent > max_time: print(f'Early stop at {i}th iter due to exceeding the time limit ' f'for the optimization (time spent: {time_spent})') break if i != iteration - 1 and not loss.requires_grad: assert i == 0, (i, iteration) print('[WARNING] No optimizable parameters found. Will skip optimiziation. ' 'This happens e.g. if all optimizable layers are freezed or the ' 'network has no optimizable layers.') break opt.zero_grad(set_to_none=True) if verbosity > 2: current_lr = [param_group['lr'] for param_group in opt.param_groups] print(f'*** iter [{i}]\n', f'loss: {loss.item()}', total_loss.squeeze().detach().cpu().numpy(), 'lr: ', current_lr) if beta: print_optimized_beta(optimizable_activations) if beta and i == 0 and verbosity > 2: breakpoint() if i != iteration - 1: # we do not need to update parameters in the last step since the # best result already obtained loss.backward() # All intermediate variables are not needed at this point. self._clear_and_set_new( None, cache_bounds=len(apply_output_constraints_to) > 0, ) if opt_choice == 'adam-autolr': opt.step(lr_scale=[loss_weight, loss_weight]) else: opt.step() if beta: for b in betas: b.data = (b >= 0) * b.data for dmi in range(len(dense_coeffs_mask)): # apply dense mask to the dense split coeffs matrix coeffs[dmi].data = ( dense_coeffs_mask[dmi].float() * coeffs[dmi].data) if alpha: for m in optimizable_activations: m.clip_alpha() if apply_output_constraints_to is not None and len(apply_output_constraints_to) > 0: for m in self.nodes(): m.clip_gammas() scheduler.step() if pruner: pruner.next_iter() if pruner: best_ret = pruner.update_best(full_ret_l, full_ret_u, best_ret) if verbosity > 3: breakpoint() if keep_best: # Set all variables to their saved best values. 
with torch.no_grad(): for idx, node in enumerate(optimizable_activations): if node.name not in best_intermediate_bounds: continue if alpha: # Assigns a new dictionary. node.alpha = best_alphas[node.name] # Update best intermediate layer bounds only when they are # optimized. If they are already fixed in # interm_bounds, then do nothing. best_intermediate = best_intermediate_bounds[node.name] node.inputs[0].lower.data = best_intermediate[0].data node.inputs[0].upper.data = best_intermediate[1].data if beta: for node in self.nodes_with_beta: assert getattr(node, 'sparse_betas', None) is not None if enable_opt_interm_bounds: for key in node.sparse_betas.keys(): node.sparse_betas[key].val.copy_( best_betas[node.name][key]) else: node.sparse_betas[0].val.copy_(best_betas[node.name]) if self.cut_used: for ii in range(len(self.cut_beta_params)): self.cut_beta_params[ii].data = best_betas['cut'][ii].data if interm_bounds is not None and not fix_interm_bounds: for l in self._modules.values(): if (l.name in interm_bounds.keys() and l.is_lower_bound_current()): l.lower = torch.max(l.lower, interm_bounds[l.name][0]) l.upper = torch.min(l.upper, interm_bounds[l.name][1]) infeasible_neurons = l.lower > l.upper if infeasible_neurons.any(): print(f'Infeasibility detected in layer {l.name}.', infeasible_neurons.sum().item(), infeasible_neurons.nonzero()[:, 0]) if verbosity > 0: if best_ret[0] is not None: # FIXME: unify the handling of l and u. print('best_l after optimization:', best_ret[0].sum().item()) if beta: print('beta sum per layer:', [p.sum().item() for p in betas]) print('alpha/beta optimization time:', time.time() - start) for node in optimizable_activations: node.opt_end() if pruner: pruner.update_ratio(full_l, full_ret_l) pruner.clean_full_sized_alpha_cache() if os.environ.get('AUTOLIRPA_DEBUG_OPT', False): print() return best_ret def init_alpha(self: 'BoundedModule', x, share_alphas=False, method='backward', c=None, bound_lower=True, bound_upper=True, final_node_name=None, interm_bounds=None, reference_alphas=None, skip_bound_compute=False): self(*x) # Do a forward pass to set perturbed nodes final = (self.final_node() if final_node_name is None else self[final_node_name]) self._set_used_nodes(final) optimizable_activations = self.get_enabled_opt_act() for node in optimizable_activations: # TODO(7/6/2023) In the future, we may need to enable alpha sharing # automatically by consider the size of all the optimizable nodes in the # graph. For now, only an adhoc check in MatMul is added. 
node._all_optimizable_activations = optimizable_activations # initialize the parameters node.opt_init() apply_output_constraints_to = ( self.bound_opts['optimize_bound_args']['apply_output_constraints_to'] ) if (not skip_bound_compute or interm_bounds is None or reference_alphas is None or not all( [act.name in reference_alphas for act in optimizable_activations])): skipped = False # if new interval is None, then CROWN interval is not present # in this case, we still need to redo a CROWN pass to initialize # lower/upper with torch.no_grad(): # We temporarilly deactivate output constraints self.bound_opts['optimize_bound_args']['apply_output_constraints_to'] = [] l, u = self.compute_bounds( x=x, C=c, method=method, bound_lower=bound_lower, bound_upper=bound_upper, final_node_name=final_node_name, interm_bounds=interm_bounds) self.bound_opts['optimize_bound_args']['apply_output_constraints_to'] = ( apply_output_constraints_to ) if len(apply_output_constraints_to) > 0: # Some layers, such as the BoundTanh layer, do some of their initialization # in the forward pass. We need to call the forward pass again to ensure # that they are initialized for the output constraints, too. l, u = self.compute_bounds( x=x, C=c, method=method, bound_lower=bound_lower, bound_upper=bound_upper, final_node_name=final_node_name, interm_bounds=interm_bounds, cache_bounds=True) else: # we skip, but we still would like to figure out the "used", # "perturbed", "backward_from" of each note in the graph skipped = True # this set the "perturbed" property self.set_input(*x, interm_bounds=interm_bounds) self.backward_from = {node: [final] for node in self._modules} l = u = None final_node_name = final_node_name or self.final_name init_intermediate_bounds = {} for node in optimizable_activations: start_nodes = [] if method in ['forward', 'forward+backward']: start_nodes.append(('_forward', 1, None, False)) if method in ['backward', 'forward+backward']: start_nodes += self.get_alpha_crown_start_nodes( node, c=c, share_alphas=share_alphas, final_node_name=final_node_name, ) if not start_nodes: continue if skipped: node.restore_alpha(reference_alphas[node.name], device=x[0].device, dtype=x[0].dtype) else: node.init_opt_parameters(start_nodes) if node in self.splittable_activations: for i in node.requires_input_bounds: input_node = node.inputs[i] if (not input_node.perturbed or node.inputs[i].lower is None and node.inputs[i].upper is None): continue init_intermediate_bounds[node.inputs[i].name] = ( [node.inputs[i].lower.detach(), node.inputs[i].upper.detach()]) if ( apply_output_constraints_to is not None and len(apply_output_constraints_to) > 0 and hasattr(self, 'constraints') ): # self.constraints.shape = (batch_size, num_constraints, num_output_neurons) # For abCROWN we know that: # If the output constraints are a conjunction, the shape is (1, num_constraints, *) # If the output constraints are a disjunction, the shape is (num_constraints, 1, *) # Checking which entry is 1 allows to discern both cases. # If auto_LiRPA is used directly, we could have batches of inputs with more than one # constraint. This is currently not supported. 
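# ---------------------------------------------------------------------------
# A small sketch (toy numbers) of the constraint layout convention described
# above. Each row along dim 1 of self.constraints is one linear constraint on
# the num_output_neurons outputs (the sign convention is spelled out where H
# and d are built in output_constraints.py): a conjunction keeps all rows in a
# single batch entry, while a disjunction is transposed into one batch entry
# per constraint, so the entry that equals 1 tells the two cases apart.
import torch

# Conjunction of two constraints over 3 outputs -> shape (1, 2, 3).
conjunction = torch.tensor([[[1., -1., 0.],
                             [1., 0., -1.]]])
# The same two constraints as a disjunction -> shape (2, 1, 3).
disjunction = conjunction.transpose(0, 1)
for constraints in (conjunction, disjunction):
    if constraints.size(0) == 1:
        num_gammas = constraints.size(1)  # conjunction: one gamma per constraint
    else:
        num_gammas = constraints.size(0)  # disjunction: one gamma per batch entry
# ---------------------------------------------------------------------------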
if self.constraints.size(0) == 1: num_gammas = self.constraints.size(1) elif self.constraints.size(1) == 1: num_gammas = self.constraints.size(0) else: raise NotImplementedError( 'To use output constraints, either have a batch size of 1 or use only one ' 'output constraint' ) for node in self.nodes(): node.init_gammas(num_gammas) if self.bound_opts['verbosity'] >= 1: print('Optimizable variables initialized.') if skip_bound_compute: return init_intermediate_bounds else: return l, u, init_intermediate_bounds ================================================ FILE: auto_LiRPA/output_constraints.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### from .utils import * from .bound_ops import * from typing import TYPE_CHECKING if TYPE_CHECKING: from .bound_general import BoundedModule def invprop_enabled(self: 'BoundedModule'): return self.bound_opts['optimize_bound_args']['apply_output_constraints_to'] def invprop_init_infeasible_bounds(self: 'BoundedModule', bound_node, C): # Infeasible bounds can result from unsatisfiable output constraints. # We track them to set the corresponding lower bounds to inf and upper bounds to # -inf. if self.infeasible_bounds is None: device = bound_node.attr['device'] if isinstance(C, Patches): self.infeasible_bounds = torch.full((C.shape[1],), False, device=device) else: assert isinstance(C, (torch.Tensor, eyeC, OneHotC)), type(C) self.infeasible_bounds = torch.full((C.shape[0],), False, device=device) def invprop_check_infeasible_bounds(self: 'BoundedModule', lb, ub): if torch.any(self.infeasible_bounds): if lb is not None: assert lb.size(0) == self.infeasible_bounds.size(0) lb = torch.where(self.infeasible_bounds.unsqueeze(1), torch.tensor(float('inf'), device=lb.device), lb) if ub is not None: assert ub.size(0) == self.infeasible_bounds.size(0) ub = torch.where(self.infeasible_bounds.unsqueeze(1), torch.tensor(float('-inf'), device=ub.device), ub) return lb, ub def backward_general_invprop( self: 'BoundedModule', initial_As, initial_lb, initial_ub, bound_node, C, start_backpropagation_at_node = None, bound_lower=True, bound_upper=True, average_A=False, need_A_only=False, unstable_idx=None, update_mask=None, ): use_beta_crown = self.bound_opts['optimize_bound_args']['enable_beta_crown'] # Sometimes, not using output constraints can give better results. # When this flag is set, the bounds are computed both with and without # output constraints, and the best of the two is returned. best_of_oc_and_no_oc = ( self.bound_opts['optimize_bound_args']['best_of_oc_and_no_oc'] ) assert not use_beta_crown assert not self.cut_used assert initial_As is None assert initial_lb is None assert initial_ub is None if best_of_oc_and_no_oc: # Important: If input bounds are tightened, then this call must be done # *before* the use of output constraints. # At the end of backward_general, the bounds are concretized. 
For the input # bounds, those concrete bounds are used to overwrite the bounds in the # input perturbations, so they'll then be used by all other layers during # their concretization. These input bounds *must* have their gradients # w.r.t. the relaxations set up. The call to backward_general without # output constraints will overwrite these bounds with values that do not # have gradients. So it must come first. with torch.no_grad(): o_res = self.backward_general( bound_node=bound_node, C=C, start_backpropagation_at_node=start_backpropagation_at_node, bound_lower=bound_lower, bound_upper=bound_upper, average_A=average_A, need_A_only=need_A_only, unstable_idx=unstable_idx, update_mask=update_mask, apply_output_constraints_to=[], ) res = self.backward_general_with_output_constraint( bound_node=bound_node, C=C, start_backporpagation_at_node=start_backpropagation_at_node, bound_lower=bound_lower, bound_upper=bound_upper, average_A=average_A, need_A_only=need_A_only, unstable_idx=unstable_idx, update_mask=update_mask, ) if best_of_oc_and_no_oc: # We use the best of both results. This would convert Infs to NaNs # (because inf - inf = nan), so those entries get masked. res0_inf_mask = torch.isinf(res[0]) r0 = res[0] - res[0].detach() + torch.max(res[0].detach(), o_res[0].detach()) r0 = torch.where(res0_inf_mask, res[0], r0) res1_inf_mask = torch.isinf(res[1]) r1 = res[1] - res[1].detach() + torch.min(res[1].detach(), o_res[1].detach()) r1 = torch.where(res1_inf_mask, res[1], r1) if self.return_A: if res[2] != {}: raise NotImplementedError( "Merging of A not implemented yet. If set, try disabling --best_of_oc_and_no_oc" ) res = (r0, r1, {}) else: res = (r0, r1) batch_size = res[0].size(0) infeasible_bounds = torch.any(res[0].reshape((batch_size, -1)) > res[1].reshape((batch_size, -1)), dim=1) if torch.any(infeasible_bounds): self.infeasible_bounds = torch.logical_or(self.infeasible_bounds, infeasible_bounds) return res def backward_general_with_output_constraint( self: 'BoundedModule', bound_node, C, start_backporpagation_at_node = None, bound_lower=True, bound_upper=True, average_A=False, need_A_only=False, unstable_idx=None, update_mask=None, ): assert start_backporpagation_at_node is None assert not isinstance(C, str) neurons_in_layer = 1 for d in bound_node.output_shape[1:]: neurons_in_layer *= d # backward_general uses C to compute batch_size, output_dim and output_shape, just like below. # When output constraints are applied, it will perform a different backpropagation, # but those variables need to be computed regardless. So we need to retain the original C # and pass it on to backward_general. If initial_As is set (which it is, if this code here # is executed), it will not use C for anything else. 
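# ---------------------------------------------------------------------------
# A minimal 1-D sketch (toy numbers, not the library's API) of the Lagrangian
# argument behind the gamma construction below: for any gamma >= 0 and any x
# whose output satisfies H f(x) + d <= 0,
#     c^T f(x) >= c^T f(x) + gamma^T (H f(x) + d),
# so minimizing the augmented objective over the *unconstrained* input region
# still lower-bounds the constrained minimum, and gamma can be optimized to
# tighten it. Here f(x) = x on [-2, 2] with the output constraint x >= 1,
# i.e. H = -1 and d = 1.
import torch

xs = torch.linspace(-2.0, 2.0, steps=401)
constrained_min = xs[(-xs + 1.0) <= 0].min()     # true constrained minimum: 1.0
for gamma in (0.0, 0.5, 1.0):
    bound = (xs + gamma * (-xs + 1.0)).min()     # valid lower bound for any gamma >= 0
    assert bound <= constrained_min + 1e-6
    # gamma = 0 gives -2.0 (plain bound); gamma = 1 recovers the tight value 1.0.
# ---------------------------------------------------------------------------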
orig_C = C C, batch_size, output_dim, output_shape = self._preprocess_C(C, bound_node) device = bound_node.device if device is None and hasattr(C, 'device'): device = C.device # self.constraints.shape == (batch_size, num_constraints, output_neurons) batch_size = self.constraints.size(0) num_constraints = self.constraints.size(1) # 1) Linear: Hx + d # Result is a tensor, <= 0 for all entries if output constraint is satisfied H = self.constraints.transpose(1,2) # (batch_size, output_neurons, num_constraints) d = -self.thresholds # (batch) assert H.ndim == 3 assert H.size(0) == batch_size assert H.size(2) == num_constraints assert d.ndim == 1 if batch_size > 1: assert num_constraints == 1 assert d.size(0) == batch_size else: assert d.size(0) == num_constraints if hasattr(bound_node, 'gammas'): gammas = bound_node.gammas else: if hasattr(bound_node, 'opt_stage'): assert bound_node.opt_stage not in ['opt', 'reuse'] if batch_size == 1: gammas = torch.zeros((2, num_constraints, neurons_in_layer), device=device) else: gammas = torch.zeros((2, batch_size, neurons_in_layer), device=device) # H.shape = (batch_size, output_neurons, num_constraints==1) # We need used_weight.shape = (batch_size, this_layer_neurons, prev_layer_neurons) # This is satisfied by H, because it will be transposed before being accessed and # output_neurons == prev_layer_neurons linear_Hxd_layer_weight_value = nn.Parameter(H.to(gammas)) linear_Hxd_layer_weight = BoundParams( ori_name="/linear_Hxd_layer_weight", value=None, perturbation=None, ) linear_Hxd_layer_weight.name = "linear_Hxd_layer_weight" linear_Hxd_layer_weight.lower = linear_Hxd_layer_weight_value linear_Hxd_layer_weight.upper = linear_Hxd_layer_weight_value if batch_size == 1: linear_Hxd_layer_bias_value = nn.Parameter(d.float().to(device)) else: linear_Hxd_layer_bias_value = nn.Parameter(d.float().to(device).unsqueeze(1)) linear_Hxd_layer_bias = BoundParams( ori_name="/linear_Hxd_layer_bias", value=None, perturbation=None, ) linear_Hxd_layer_bias.name = "linear_Hxd_layer_bias" linear_Hxd_layer_bias.lower = linear_Hxd_layer_bias_value linear_Hxd_layer_bias.upper = linear_Hxd_layer_bias_value linear_Hxd_layer = BoundLinear( attr=None, inputs=[ self.final_node(), linear_Hxd_layer_weight, linear_Hxd_layer_bias, ], output_index=0, options=self.bound_opts, ) linear_Hxd_layer.name = "/linear_Hxd_layer" linear_Hxd_layer.device = device linear_Hxd_layer.perturbed = True linear_Hxd_layer.output_shape = torch.Size([1, num_constraints]) linear_Hxd_layer.batch_dim = bound_node.batch_dim linear_Hxd_layer.batched_weight_and_bias = (batch_size > 1) # 2) Gamma # A seperate gamma per output constraint. All gammas are always positive. # Depending on the configuration, gammas are shared across neurons in the # optimized layer. 
gamma_layer_weight = BoundParams( ori_name="/gamma_layer_weight", value=None, perturbation=None, ) gamma_layer_weight.name = "gamma_layer_weight" assert gammas.ndim == 3 assert gammas.size(0) == 2 if batch_size == 1: # gammas.shape = (2, num_constraints, this_layer_neurons) assert gammas.ndim == 3 assert gammas.size(0) == 2 assert gammas.size(1) == num_constraints this_layer_neurons = gammas.size(2) # In linear.py, these weights will be used to compute next_A based on last_A: # last_A.shape = (unstable_neurons, batch_size==1, this_layer_neurons) # next_A.shape = (unstable_neurons, batch_size==1, prev_layer_neurons) # prev_layer_neurons == num_constraints # So we set the weights as # (num_constraints, this_layer_neurons) # This will be transposed and accessed by linear.py as # (this_layer_neurons, num_constraints) # Note that the shape will be further modified in linear.py gamma_layer_weight.lower = gammas[0].unsqueeze(0) gamma_layer_weight.upper = -gammas[1].unsqueeze(0) else: # ABCrown optimized the computation by transposing the query. # Instead of one batch entry with N constraints, we have N batch entries # with one contraint each. We do not support multiple batch entries # each with multiple constraints. # gammas.shape = (2, batch_size, this_layer_neurons) # Here, we can only check that the batch size is correct. assert gammas.size(1) == batch_size assert num_constraints == 1 this_layer_neurons = gammas.size(2) # In linear.py, these weights will be used to compute next_A based on last_A: # last_A.shape = (unstable_neurons, batch_size, this_layer_neurons) # next_A.shape = (unstable_neurons, batch_size, prev_layer_neurons==1) # prev_layer_neurons == 1 because it's num_constraints # So we set the weights as # (batch_size, 1, this_layer_neurons) # This will be transposed and accessed by linear.py as # (batch_size, this_layer_neurons, 1) # Note that the shape will be further modified in linear.py gamma_layer_weight.lower = gammas[0].unsqueeze(1) gamma_layer_weight.upper = -gammas[1].unsqueeze(1) gamma_layer = BoundLinear( attr=None, inputs=[linear_Hxd_layer, gamma_layer_weight], output_index=0, options=self.bound_opts, ) gamma_layer.name = "/gamma_layer" gamma_layer.device = device gamma_layer.perturbed = True gamma_layer.input_shape = linear_Hxd_layer.output_shape gamma_layer.output_shape = torch.Size([1, this_layer_neurons]) gamma_layer.batch_dim = bound_node.batch_dim gamma_layer.use_seperate_weights_for_lower_and_upper_bounds = True gamma_layer.batched_weight_and_bias = (batch_size > 1) # 3) Reshape # To the same shape as the layer that's optimized. reshape_layer_output_shape = BoundBuffers( ori_name="/reshape_layer_output_shape", value = torch.tensor(bound_node.output_shape[1:]), perturbation=None, options=self.bound_opts, ) reshape_layer_output_shape.name = "reshape_layer_output_shape" reshape_layer = BoundReshape( attr=None, inputs = [gamma_layer, reshape_layer_output_shape], output_index=0, options=self.bound_opts, ) reshape_layer.name = "/reshape_layer" reshape_layer.device = device reshape_layer.perturbed = True reshape_layer.input_shape = gamma_layer.output_shape reshape_layer.output_shape = bound_node.output_shape reshape_layer.batch_dim = bound_node.batch_dim # The residual connection that connects the optimized layer and the reshape # layer from above is not explicitly coded, it's handled implicitly: # Here, we propagate backwards through 5->4->3->2->1->regular output layer and let # CROWN handle the propagation from there on backwards to the input layer. 
# The other half of the residual connection is implemented by explicitly setting # the .lA and .uA values of the optimized layer to C. # This is done via initial_As, initial_lb, initial_ub. if isinstance(C, (OneHotC, eyeC)): batch_size = C.shape[1] assert C.shape[0] <= C.shape[2] assert len(C.shape) == 3 # This is expensive, but Reshape doesn't support OneHotC objects if isinstance(C, OneHotC): C = torch.eye(C.shape[2], device=C.device)[C.index].unsqueeze(1).expand(-1, batch_size, -1) else: C = torch.eye(C.shape[2], device=C.device).unsqueeze(1).expand(-1, batch_size, -1) start_shape = None lA = C if bound_lower else None uA = C if bound_upper else None # 3) Reshape A, lower_b, upper_b = reshape_layer.bound_backward( lA, uA, *reshape_layer.inputs, start_node=bound_node, unstable_idx=unstable_idx, start_shape=start_shape) assert lower_b == 0 assert upper_b == 0 lA = A[0][0] uA = A[0][1] # 2) Gamma A, lower_b, upper_b = gamma_layer.bound_backward( lA, uA, *gamma_layer.inputs, start_node=bound_node, unstable_idx=unstable_idx, start_shape=start_shape) assert lower_b == 0 assert upper_b == 0 lA = A[0][0] uA = A[0][1] # 1) Hx + d A, lower_b, upper_b = linear_Hxd_layer.bound_backward( lA, uA, *linear_Hxd_layer.inputs, start_node=bound_node, unstable_idx=unstable_idx, start_shape=start_shape) # lower_b and upper_b are no longer 0, because d wasn't 0. lA = A[0][0] uA = A[0][1] # This encodes the residual connection. initial_As = { self.final_node().name: (lA, uA), bound_node.name: (C, C), } assert lower_b.ndim == 2 assert upper_b.ndim == 2 return self.backward_general( bound_node = bound_node, start_backpropagation_at_node = self.final_node(), C = orig_C, # only used for batch_size, output_dim, output_shape computation bound_lower = bound_lower, bound_upper = bound_upper, average_A = average_A, need_A_only = need_A_only, unstable_idx = unstable_idx, update_mask = update_mask, apply_output_constraints_to = [], # no nested application initial_As = initial_As, initial_lb = lower_b, initial_ub = upper_b, ) ================================================ FILE: auto_LiRPA/parse_graph.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### import torch from torch.onnx.utils import _optimize_graph from collections import OrderedDict from collections import namedtuple from packaging import version import re import os import traceback from .bounded_tensor import BoundedTensor, BoundedParameter from .utils import logger, unpack_inputs Node = namedtuple('Node', ( 'name', 'ori_name', 'inputs', 'attr', 'op', 'param', 'input_index', 'bound_node', 'output_index', 'perturbation'), defaults=(None,) * 10) def get_node_name(node): return node.debugName() def get_node_attribute(node, attribute_name): if hasattr(torch.onnx.symbolic_helper, '_node_get'): # Pytorch >= 1.13. return torch.onnx.symbolic_helper._node_get(node, attribute_name) else: # Pytorch <= 1.12. 
This will call _node_getitem in torch.onnx.utils. return node[attribute_name] def parse_graph(graph, inputs, params): input_all = [] input_used = [] scope = {} for n in graph.inputs(): input_all.append(n.debugName()) for n in graph.nodes(): n_inputs = [get_node_name(i) for i in n.inputs()] for inp in n.inputs(): input_used.append(inp.debugName()) for out in n.outputs(): scope[get_node_name(out)] = n.scopeName() for node in graph.inputs(): name = get_node_name(node) scope[name] = '' for n in graph.outputs(): name = get_node_name(n) if name in input_all: # This output node directly comes from an input node with an Op input_used.append(n.debugName()) def name_with_scope(node): name = get_node_name(node) name = '/'.join([scope[name], name]) if '.' in name: # "." should not be used as it could issues in state_dict loading # where PyTorch would treat it as having submodules name = name.replace('.', '-') return name nodesOP = [] for n in graph.nodes(): attrs = {k: get_node_attribute(n, k) for k in n.attributeNames()} n_inputs = [name_with_scope(i) for i in n.inputs()] for i, out in enumerate(list(n.outputs())): nodesOP.append(Node(**{ 'name': name_with_scope(out), 'op': n.kind(), 'inputs': n_inputs, 'attr': attrs, 'output_index': i, })) # filter out input nodes in `graph.inputs()` that are actually used nodesIn = [] used_by_index = [] for i, n in enumerate(graph.inputs()): name = get_node_name(n) used = name in input_used used_by_index.append(used) if used: nodesIn.append(n) # filter out input nodes in `inputs` that are actually used inputs_unpacked = unpack_inputs(inputs) assert len(list(graph.inputs())) == len(inputs_unpacked) + len(params) inputs = [inputs_unpacked[i] for i in range(len(inputs_unpacked)) if used_by_index[i]] # index of the used inputs among all the inputs input_index = [i for i in range(len(inputs_unpacked)) if used_by_index[i]] # Add a name to all inputs inputs = list(zip(["input_{}".format(input_index[i]) for i in range(len(inputs))], inputs)) # filter out params that are actually used params = [params[i] for i in range(len(params)) if used_by_index[i + len(inputs_unpacked)]] inputs_and_params = inputs + params assert len(nodesIn) == len(inputs_and_params) # output nodes of the module nodesOut = [] for n in graph.outputs(): # we only record names nodesOut.append(name_with_scope(n)) for i, n in enumerate(nodesIn): if (isinstance(inputs_and_params[i][1], BoundedTensor) or isinstance(inputs_and_params[i][1], BoundedParameter)): perturbation = inputs_and_params[i][1].ptb else: perturbation = None if i > 0 and n.type().sizes() != list(inputs_and_params[i][1].size()): raise RuntimeError("Input tensor shapes do not much: {} != {}".format( n.type().sizes(), list(inputs_and_params[i][1].size()))) name = name_with_scope(n) nodesIn[i] = Node(**{ 'name': name, 'ori_name': inputs_and_params[i][0], 'op': 'Parameter', 'inputs': [], 'attr': str(n.type()), 'param': inputs_and_params[i][1] if i >= len(inputs) else None, # index among all the inputs including unused ones 'input_index': input_index[i] if i < len(inputs) else None, # Input nodes may have perturbation, if they are wrapped in BoundedTensor or BoundedParameters 'perturbation': perturbation, }) return nodesOP, nodesIn, nodesOut def _get_jit_params(module, param_exclude, param_include): state_dict = torch.jit._unique_state_dict(module, keep_vars=True) if param_exclude is not None: param_exclude = re.compile(param_exclude) if param_include is not None: param_include = re.compile(param_include) new_state_dict = OrderedDict() for k, v 
in state_dict.items(): if param_exclude is not None and param_exclude.match(k) is not None: print(f'\nremove input element {k} from nodesIn\n') continue if param_include is not None and param_include.match(k) is None: continue new_state_dict[k] = v params = zip(new_state_dict.keys(), new_state_dict.values()) return params def get_output_template(out): """Construct a template for the module output with `None` representing places to be filled with tensor results""" if isinstance(out, torch.Tensor): return None elif isinstance(out, list): return list([get_output_template(o) for o in out]) elif isinstance(out, tuple): return tuple([get_output_template(o) for o in out]) elif isinstance(out, dict): template = {} for key in out: template[key] = get_output_template(out[key]) return template else: raise NotImplementedError def parse_source(node): kind = node.kind() if hasattr(node, 'sourceRange'): source_range_str = node.sourceRange() # divide source_range_str by '\n' and drop any lines containing 'torch.nn' source_range_str = '\n'.join([line for line in source_range_str.split('\n') if 'torch/nn' not in line]) match = re.match(r'([^ ]+\.py)\((\d+)\)', source_range_str) if match: # match.group(1) is the file name # match.group(2) is the line number return f"{kind}_{os.path.basename(match.group(1)).split('.')[0]}_{match.group(2)}" return kind def update_debug_names(trace_graph): visited = [] for n in trace_graph.nodes(): for input in n.inputs(): if input.debugName() not in visited: input.setDebugName(f"{input.debugName()}_{parse_source(n)}") visited.append(input.debugName()) for output in n.outputs(): if output.debugName() not in visited: output.setDebugName(f"{output.debugName()}_{parse_source(n)}") visited.append(output.debugName()) def parse_module(module, inputs, param_exclude=".*AuxLogits.*", param_include=None): params = _get_jit_params(module, param_exclude=param_exclude, param_include=param_include) try: trace, out = torch.jit._get_trace_graph(module, inputs) except: print(traceback.format_exc()) raise RuntimeError( 'Failed to get the trace. ' 'Please check that the model and inputs are compatible with torch.jit.') if version.parse(torch.__version__) < version.parse("2.0.0"): from torch.onnx.symbolic_helper import _set_opset_version _set_opset_version(12) if version.parse(torch.__version__) >= version.parse("2.1.0"): # This is needed for BoundConcatGrad to work with torch 2.1.0 and later if version.parse(torch.__version__) < version.parse("2.9.0"): from torch.onnx._globals import GLOBALS else: from torch.onnx._internal.torchscript_exporter._globals import GLOBALS GLOBALS.autograd_inlining = False logger.debug("Graph before ONNX convertion:") logger.debug(trace) # Assuming that the first node in the graph is the primary input node. # It must have a batch dimension. 
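# ---------------------------------------------------------------------------
# A hedged usage sketch of parse_module() (the model and input below are made
# up): trace a small network and inspect the parsed node lists. Whether the
# underlying torch.jit tracing succeeds depends on the installed PyTorch
# version, as handled above.
import torch
import torch.nn as nn
from auto_LiRPA.parse_graph import parse_module

toy_net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
nodesOP, nodesIn, nodesOut, template = parse_module(toy_net, (torch.randn(1, 4),))
print([n.op for n in nodesOP])    # ONNX ops, e.g. onnx::Gemm, onnx::Relu
print([n.name for n in nodesIn])  # used graph inputs and parameters
print(nodesOut)                   # names of the graph output nodes
# ---------------------------------------------------------------------------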
primary_input = get_node_name(next(iter(trace.inputs()))) trace_graph = _optimize_graph( trace, torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, params_dict={}, input_names=[primary_input], dynamic_axes={primary_input: {0: 'batch'}}) logger.debug('trace_graph: %s', trace_graph) if os.environ.get('AUTOLIRPA_DEBUG_NAMES', 0): update_debug_names(trace_graph) logger.debug("ONNX graph:") logger.debug(trace_graph) if not isinstance(inputs, tuple): inputs = (inputs, ) nodesOP, nodesIn, nodesOut = parse_graph(trace_graph, tuple(inputs), tuple(params)) for i in range(len(nodesOP)): param_in = OrderedDict() for inp in nodesOP[i].inputs: for n in nodesIn: if inp == n.name: param_in.update({inp:n.param}) nodesOP[i] = nodesOP[i]._replace(param=param_in) template = get_output_template(out) return nodesOP, nodesIn, nodesOut, template ================================================ FILE: auto_LiRPA/patches.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### import torch import torch.nn.functional as F from torch import Tensor def insert_zeros(image, s): """ Insert s columns and rows 0 between every pixel in the image. For example: image = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] s = 2 output = [[1, 0, 0, 2, 0, 0, 3], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [4, 0, 0, 5, 0, 0, 6], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [7, 0, 0, 8, 0, 0, 9]] """ if s <= 0: return image matrix = torch.zeros(size=(image.size(0), image.size(1), image.size(2) * (s+1) - s, image.size(3) * (s+1) - s), dtype=image.dtype, device=image.device) matrix_stride = matrix.stride() selected_matrix = torch.as_strided(matrix, [ # Shape of the output matrix. matrix.size(0), # Batch size. matrix.size(1), # Channel. image.size(2), # H (without zeros) image.size(3), # W (without zeros) ], [ # Stride of the output matrix. matrix_stride[0], # Batch size dimension, keep using the old stride. matrix_stride[1], # Channel dimension. matrix_stride[2] * (s + 1), # Move s+1 rows. s+1, # Move s+1 pixels. ]) # Move a pixel (on the width direction). selected_matrix[:] = image return matrix def remove_zeros(image, s, remove_zero_start_idx=(0,0)): if s <= 0: return image matrix_stride = image.stride() storage_offset = image.storage_offset() return torch.as_strided(image, [ # Shape of the output matrix. *image.shape[:-2], (image.size(-2) - remove_zero_start_idx[-2] + (s + 1) - 1) // (s + 1), # H (without zeros) (image.size(-1) - remove_zero_start_idx[-1] + (s + 1) - 1) // (s + 1), # W (without zeros) ], [ # Stride of the output matrix. *matrix_stride[:-2], matrix_stride[-2] * (s + 1), # Move s+1 rows. matrix_stride[-1] * (s + 1), # Move s+1 pixels. ], storage_offset + matrix_stride[-2] * remove_zero_start_idx[-2] + matrix_stride[-1] * remove_zero_start_idx[-1] ) def unify_shape(shape): """ Convert shapes to 4-tuple: (left, right, top, bottom). 
""" if shape is not None: if isinstance(shape, int): # Same on all four directions. shape = (shape, shape, shape, shape) if len(shape) == 2: # (height direction, width direction). shape = (shape[1], shape[1], shape[0], shape[0]) assert len(shape) == 4 # Returned: (left, right, top, bottom). return shape def simplify_shape(shape): """ Convert shapes to 2-tuple or a single number. Used to avoid extra padding operation because the padding operation in F.conv2d is not general enough. """ if len(shape) == 4: # 4-tuple: (left, right, top, bottom). if shape[0] == shape[1] and shape[2] == shape[3]: shape = (shape[2], shape[0]) if len(shape) == 2: # 2-tuple: (height direction, width direction). if shape[0] == shape[1]: shape = shape[0] return shape def is_shape_used(shape, expected=0): if isinstance(shape, int): return shape != expected else: return sum(shape) != expected class Patches: """ A special class which denotes a convoluntional operator as a group of patches the shape of Patches.patches is [batch_size, num_of_patches, out_channel, in_channel, M, M] M is the size of a single patch Assume that we have a conv2D layer with w.weight(out_channel, in_channel, M, M), stride and padding applied on an image (N * N) num_of_patches = ((N + padding * 2 - M)//stride + 1) ** 2 Here we only consider kernels with the same H and W """ def __init__( self, patches=None, stride=1, padding=0, shape=None, identity=0, unstable_idx=None, output_shape=None, inserted_zeros=0, output_padding=0, input_shape=None): # Shape: [batch_size, num_of_patches, out_channel, in_channel, M, M] # M is the size of a single patch # Assume that we have a conv2D layer with w.weight(out_channel, in_channel, M, M), stride and padding applied on an image (N * N) # num_of_patches = ((N + padding * 2 - M)//stride + 1) ** 2 # Here we only consider kernels with the same H and W self.patches = patches self.stride = stride self.padding = padding self.shape = shape self.identity = identity self.unstable_idx = unstable_idx self.output_shape = output_shape self.input_shape = input_shape self.inserted_zeros = inserted_zeros self.output_padding = output_padding self.simplify() def __add__(self, other): if isinstance(other, Patches): # Insert images with zero to make stride the same, if necessary. assert self.stride == other.stride if self.unstable_idx is not None or other.unstable_idx is not None: if self.unstable_idx is not other.unstable_idx: # Same tuple object. raise ValueError('Please set bound option "sparse_conv_intermediate_bounds" to False to run this model.') assert self.output_shape == other.output_shape A1 = self.patches A2 = other.patches # change paddings to merge the two patches sp = torch.tensor(unify_shape(self.padding)) op = torch.tensor(unify_shape(other.padding)) if (sp - op).abs().sum().item() > 0: if (sp - op >= 0).all(): A2 = F.pad(A2, (sp - op).tolist()) pass elif (sp - op <= 0).all(): A1 = F.pad(A1, (op - sp).tolist()) else: raise ValueError("Unsupported padding size") ret = A1 + A2 return Patches(ret, other.stride, torch.max(sp, op).tolist(), ret.shape, unstable_idx=self.unstable_idx, output_shape=self.output_shape, inserted_zeros=self.inserted_zeros, output_padding=self.output_padding) else: assert self.inserted_zeros == 0 assert not is_shape_used(self.output_padding) # Patches has shape (out_c, batch, out_h, out_w, in_c, h, w). 
input_shape = other.shape[3:] matrix = other pieces = self.patches if pieces.ndim == 9: pieces = pieces.transpose(0, 1) pieces = pieces.view(pieces.shape[0], -1, pieces.shape[3], pieces.shape[4], pieces.shape[5]*pieces.shape[6], pieces.shape[7], pieces.shape[8]).transpose(0,1) if pieces.ndim == 8: pieces = pieces.transpose(0, 1) pieces = pieces.view(pieces.shape[0], -1, pieces.shape[3], pieces.shape[4], pieces.shape[5], pieces.shape[6], pieces.shape[7]).transpose(0,1) A1_matrix = patches_to_matrix( pieces, input_shape, self.stride, self.padding, output_shape=self.output_shape, unstable_idx=self.unstable_idx) return A1_matrix.transpose(0, 1) + matrix def __str__(self): return ( f"Patches(stride={self.stride}, padding={self.padding}, " f"output_padding={self.output_padding}, inserted_zeros={self.inserted_zeros}, " f"kernel_shape={list(self.patches.shape)}, input_shape={self.input_shape}, " f"output_shape={self.output_shape}, unstable_idx={type(self.unstable_idx)})" ) @property def device(self): if self.patches is not None: return self.patches.device if self.unstable_idx is not None: if isinstance(self.unstable_idx, tuple): return self.unstable_idx[0].device else: return self.unstable_idx.device raise RuntimeError("Patches object is unintialized and cannot determine its device.") def create_similar(self, patches=None, stride=None, padding=None, identity=None, unstable_idx=None, output_shape=None, inserted_zeros=None, output_padding=None, input_shape=None): """ Create a new Patches object with new patches weights, and keep other properties the same. """ new_patches = self.patches.clone() if patches is None else patches new_identity = self.identity if identity is None else identity if new_identity and (new_patches is not None): raise ValueError("Identity Patches should have .patches property set to 0.") return Patches( new_patches, stride=self.stride if stride is None else stride, padding=self.padding if padding is None else padding, shape=new_patches.shape, identity=new_identity, unstable_idx=self.unstable_idx if unstable_idx is None else unstable_idx, output_shape=self.output_shape if output_shape is None else output_shape, inserted_zeros=self.inserted_zeros if inserted_zeros is None else inserted_zeros, output_padding=self.output_padding if output_padding is None else output_padding, input_shape=self.input_shape if input_shape is None else input_shape, ) def clone(self): return self.create_similar() def detach(self): new_obj = Patches( patches=self.patches.detach() if self.patches is not None else None, stride=self.stride, padding=self.padding, shape=self.shape, identity=self.identity, unstable_idx=( tuple(idx.detach() for idx in self.unstable_idx) if isinstance(self.unstable_idx, tuple) else self.unstable_idx.detach() ) if self.unstable_idx is not None else None, output_shape=self.output_shape, inserted_zeros=self.inserted_zeros, output_padding=self.output_padding, input_shape=self.input_shape, ) return new_obj def to_matrix(self, input_shape): assert not is_shape_used(self.output_padding) return patches_to_matrix( self.patches, input_shape, self.stride, self.padding, self.output_shape, self.unstable_idx, self.inserted_zeros ) def simplify(self): """Merge stride and inserted_zeros; if they are the same they can cancel out.""" stride = [self.stride, self.stride] if isinstance(self.stride, int) else self.stride if (self.inserted_zeros > 0 and self.inserted_zeros + 1 == stride[0] and stride[0] == stride[1] and (self.patches.size(-1) % stride[1]) == 0 and (self.patches.size(-2) % stride[0]) == 
0): # print(f'before simplify: patches={self.patches.size()} padding={self.padding}, stride={self.stride}, output_padding={self.output_padding}, inserted_zeros={self.inserted_zeros}') full_stride = [stride[1], stride[1], stride[0], stride[0]] # output_padding = tuple(p // s for p, s in zip(output_padding, full_stride)) padding = unify_shape(self.padding) # since inserted_zero will not put zeros to both end, like [x 0 0 x 0 0 x] instead of [x 0 0 x 0 0 x 0 0] # when computing the simplified padding, we should view (inserted_zeros-1) padding entries from one end side # as part of the inserted_zero matrices (i.e., "consumed") consumed_padding = (padding[0], padding[1] - (stride[1] - 1), padding[2], padding[3] - (stride[0] - 1)) tentative_padding = tuple(p // s - o for p, s, o in zip(consumed_padding, full_stride, unify_shape(self.output_padding))) # negative padding is inconvenient if all([p >= 0 for p in tentative_padding]): remove_zero_start_idx = (padding[2] % stride[0], padding[0] % stride[1]) self.padding = tentative_padding self.patches = remove_zeros(self.patches, self.inserted_zeros, remove_zero_start_idx=remove_zero_start_idx) self.stride = 1 self.inserted_zeros = 0 self.output_padding = 0 # print(f'after simplify: patches={self.patches.size()} padding={self.padding}, stride={self.stride}, output_padding={self.output_padding}, inserted_zeros={self.inserted_zeros}') def matmul(self, input, patch_abs=False, input_shape=None): """ Broadcast multiplication for patches and a matrix. Input shape: (batch_size, in_c, in_h, in_w). If the dim of in_c, in_h, in_w = 1, the the input will be expand by given input_shape to support broadcast Output shape: [batch_size, unstable_size] when unstable_idx is not None, [batch_size, out_c, out_h, out_w] when unstable_idx is None, """ patches = self.patches if patch_abs: patches = patches.abs() if input_shape is not None: # For cases that input only has fewer dimensions like (1, in_c, 1, 1) input = input.expand(input_shape) # Expand to (batch_size, in_c, in_h, in_w) # unfold the input as [batch_size, out_h, out_w, in_c, H, W] unfold_input = inplace_unfold( input, kernel_size=patches.shape[-2:], padding=self.padding, stride=self.stride, inserted_zeros=self.inserted_zeros, output_padding=self.output_padding) if self.unstable_idx is not None: # We need to add a out_c dimension and select from it. unfold_input = unfold_input.unsqueeze(0).expand(self.output_shape[1], -1, -1, -1, -1, -1, -1) # Shape: [unstable_size, batch_size, in_c, H, W]. # Here unfold_input will match this shape. unfold_input = unfold_input[self.unstable_idx[0], :, self.unstable_idx[1], self.unstable_idx[2]] # shape: [batch_size, unstable_size]. return torch.einsum('sbchw,sbchw->bs', unfold_input, patches) else: # shape: [batch_size, out_c, out_h, out_w]. return torch.einsum('bijchw,sbijchw->bsij', unfold_input, patches) def create_padding(self, output_shape): # patches was not padded, so we need to pad them here. # If this layer is followed by a ReLU layer, then the padding was already handled there and there is no need to pad again. 
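# ---------------------------------------------------------------------------
# A minimal sketch of the unfold-then-einsum pattern used by Patches.matmul()
# above, written with the standard F.unfold as a simplified stand-in for
# inplace_unfold() (shapes and numbers are made up): contracting the unfolded
# sliding windows with the kernel reproduces an ordinary convolution.
import torch
import torch.nn.functional as F

x = torch.randn(2, 3, 8, 8)
w = torch.randn(5, 3, 3, 3)
unf = F.unfold(x, kernel_size=3, padding=1)               # (batch, C*3*3, 8*8)
out = torch.einsum('oc,bcl->bol', w.reshape(5, -1), unf)  # contract kernel with windows
assert torch.allclose(out.reshape(2, 5, 8, 8), F.conv2d(x, w, padding=1), atol=1e-4)
# ---------------------------------------------------------------------------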
one_d_unfolded_r = create_valid_mask( output_shape, self.patches.device, self.patches.dtype, self.patches.shape[-2:], self.stride, self.inserted_zeros, self.padding, self.output_padding, self.unstable_idx if self.unstable_idx else None) patches = self.patches * one_d_unfolded_r return patches def compute_patches_stride_padding(input_shape, patches_padding, patches_stride, op_padding, op_stride, inserted_zeros=0, output_padding=0, simplify=True): """ Compute stride and padding after a conv layer with patches mode. """ for p in (patches_padding, patches_stride, op_padding, op_stride): assert isinstance(p, int) or (isinstance(p, (list, tuple)) and (len(p) == 2 or len(p) == 4)) # If p is int, then same padding on all 4 sides. # If p is 2-tuple, then it is padding p[0] on both sides of H, p[1] on both sides of W # If p is 4-tuple, then it is padding p[2], p[3] on top and bottom sides of H, p[0] and p[1] on left and right sides of W # If any of the inputs are not tuple/list, we convert them to tuple. full_patch_padding, full_op_padding, full_patch_stride, full_op_stride = [ (p, p) if isinstance(p, int) else p for p in [patches_padding, op_padding, patches_stride, op_stride]] full_patch_padding, full_op_padding, full_patch_stride, full_op_stride = [ (p[1], p[1], p[0], p[0]) if len(p) == 2 else p for p in [full_patch_padding, full_op_padding, full_patch_stride, full_op_stride]] # Compute the new padding and stride after this layer. new_padding = tuple(pp * os + op * (inserted_zeros + 1) for pp, op, os in zip(full_patch_padding, full_op_padding, full_op_stride)) new_stride = tuple(ps * os for ps, os in zip(full_patch_stride, full_op_stride)) output_padding = unify_shape(output_padding) new_output_padding = (output_padding[0], # Left output_padding[1] + inserted_zeros * input_shape[3] % full_op_stride[2], # Right output_padding[2], # Top output_padding[3] + inserted_zeros * input_shape[2] % full_op_stride[0]) # Bottom # Merge into a single number if all numbers are identical. if simplify: if new_padding.count(new_padding[0]) == len(new_padding): new_padding = new_padding[0] if new_stride.count(new_stride[0]) == len(new_stride): new_stride = new_stride[0] return new_padding, new_stride, new_output_padding def patches_to_matrix(pieces, input_shape, stride, padding, output_shape=None, unstable_idx=None, inserted_zeros=0): """Converting a Patches piece into a full dense matrix.""" # torch.as_strided may cause unpredictable error under deterministic mode, # so we temporarily disable it. deterministic = torch.are_deterministic_algorithms_enabled() torch.use_deterministic_algorithms(False) if type(padding) == int: padding = (padding, padding, padding, padding) if pieces.ndim == 9: # Squeeze two additional dimensions for output and input respectively assert pieces.shape[1] == 1 and pieces.shape[5] == 1 pieces = pieces.reshape( pieces.shape[0], *pieces.shape[2:5], *pieces.shape[6:] ) if unstable_idx is None: assert pieces.ndim == 7 # Non-sparse pieces, with shape (out_c, batch, out_h, out_w, c, h, w). output_channel, batch_size, output_x, output_y = pieces.shape[:4] else: batch_size = pieces.shape[1] output_channel, output_x, output_y = output_shape[1:] input_channel, kernel_x, kernel_y = pieces.shape[-3:] input_x, input_y = input_shape[-2:] if inserted_zeros > 0: input_x, input_y = (input_x - 1) * (inserted_zeros + 1) + 1, (input_y - 1) * (inserted_zeros + 1) + 1 if unstable_idx is None: # Fix all patches in a full A matrix. 
A_matrix = torch.zeros(batch_size, output_channel, output_x, output_y, input_channel, (input_x + padding[2] + padding[3]) * (input_y + padding[0] + padding[1]), device=pieces.device, dtype=pieces.dtype) # Save its orignal stride. orig_stride = A_matrix.stride() # This is the main trick - we create a *view* of the original matrix, and it contains all sliding windows for the convolution. # Since we only created a view (in fact, only metadata of the matrix changed), it should be very efficient. matrix_strided = torch.as_strided(A_matrix, [batch_size, output_channel, output_x, output_y, output_x, output_y, input_channel, kernel_x, kernel_y], [orig_stride[0], orig_stride[1], orig_stride[2], orig_stride[3], (input_x + padding[2] + padding[3]) * stride, stride, orig_stride[4], input_y + padding[0] + padding[1], 1]) # Now we need to fill the conv kernel parameters into the last three dimensions of matrix_strided. first_indices = torch.arange(output_x * output_y, device=pieces.device) second_indices = torch.div(first_indices, output_y, rounding_mode="trunc") third_indices = torch.fmod(first_indices, output_y) # pieces have shape (out_c, batch, out_h, out_w, c, h, w). pieces = pieces.transpose(0, 1) # pieces has the out_c dimension at the front, need to move it to the second. matrix_strided[:,:,second_indices,third_indices,second_indices,third_indices,:,:,:] = pieces.reshape(*pieces.shape[:2], -1, *pieces.shape[4:]) A_matrix = A_matrix.view(batch_size, output_channel * output_x * output_y, input_channel, input_x + padding[2] + padding[3], input_y + padding[0] + padding[1]) else: # Fill only a selection of patches. # Create only a partial A matrix. unstable_size = unstable_idx[0].numel() A_matrix = torch.zeros(batch_size, unstable_size, input_channel, (input_x + padding[2] + padding[3]) * (input_y + padding[0] + padding[1]), device=pieces.device, dtype=pieces.dtype) # Save its orignal stride. orig_stride = A_matrix.stride() # This is the main trick - we create a *view* of the original matrix, and it contains all sliding windows for the convolution. # Since we only created a view (in fact, only metadata of the matrix changed), it should be very efficient. matrix_strided = torch.as_strided(A_matrix, [batch_size, unstable_size, output_x, output_y, input_channel, kernel_x, kernel_y], [orig_stride[0], orig_stride[1], (input_x + padding[2] + padding[3]) * stride, stride, orig_stride[2], input_y + padding[0] + padding[1], 1]) # pieces have shape (unstable_size, batch, c, h, w). first_indices = torch.arange(unstable_size, device=pieces.device) matrix_strided[:,first_indices,unstable_idx[1],unstable_idx[2],:,:,:] = pieces.transpose(0, 1).to(matrix_strided) A_matrix = A_matrix.view(batch_size, unstable_size, input_channel, input_x + padding[2] + padding[3], input_y + padding[0] + padding[1]) A_matrix = A_matrix[:,:,:,padding[2]:input_x + padding[2],padding[0]:input_y + padding[0]] if inserted_zeros > 0: A_matrix = A_matrix[:,:,:, ::(inserted_zeros+1), ::(inserted_zeros+1)] # Re-enable deterministic if needed. 
torch.use_deterministic_algorithms(deterministic) return A_matrix def check_patch_biases(lb, ub, lower_b, upper_b): # When we use patches mode, it's possible that we need to add two bias # one is from the Tensor mode and one is from the patches mode # And we need to detect this case and reshape the bias if lower_b.ndim < lb.ndim: lb = lb.transpose(0,1).reshape(lb.size(1), lb.size(0), -1) lb = lb.expand(lb.size(0), lb.size(1), lower_b.size(0)//lb.size(1)) lb = lb.reshape(lb.size(0), -1).t() ub = ub.transpose(0,1).reshape(ub.size(1), ub.size(0), -1) ub = ub.expand(ub.size(0), ub.size(1), upper_b.size(0)//ub.size(1)) ub = ub.reshape(ub.size(0), -1).t() elif lower_b.ndim > lb.ndim: lower_b = lower_b.transpose(0, 1).reshape(lower_b.size(1), -1).t() upper_b = upper_b.transpose(0, 1).reshape(upper_b.size(1), -1).t() return lb, ub, lower_b, upper_b def inplace_unfold(image, kernel_size, stride=1, padding=0, inserted_zeros=0, output_padding=0): # Image has size (batch_size, channel, height, width). assert image.ndim == 4 if isinstance(kernel_size, int): kernel_size = (kernel_size, kernel_size) if isinstance(padding, int): padding = (padding, padding, padding, padding) # (left, right, top, bottom). if len(padding) == 2: # (height direction, width direction). padding = (padding[1], padding[1], padding[0], padding[0]) if isinstance(output_padding, int): output_padding = (output_padding, output_padding, output_padding, output_padding) # (left, right, top, bottom). if len(output_padding) == 2: # (height direction, width direction). output_padding = (output_padding[1], output_padding[1], output_padding[0], output_padding[0]) if isinstance(stride, int): stride = (stride, stride) # (height direction, width direction). assert len(kernel_size) == 2 and len(padding) == 4 and len(stride) == 2 # Make sure the image is large enough for the kernel. assert image.size(2) + padding[2] + padding[3] >= kernel_size[0] and image.size(3) + padding[0] + padding[1] >= kernel_size[1] if inserted_zeros > 0: # We first need to insert zeros in the image before unfolding. image = insert_zeros(image, inserted_zeros) # padding = (padding[0], padding[1] + 1, padding[2], padding[3] + 1) # Compute the number of patches. # Formulation: https://pytorch.org/docs/stable/generated/torch.nn.Unfold.html#torch.nn.Unfold patches_h = int((image.size(2) + padding[2] + padding[3] - (kernel_size[0] - 1) - 1) / stride[0] + 1) patches_w = int((image.size(3) + padding[0] + padding[1] - (kernel_size[1] - 1) - 1) / stride[1] + 1) # Pad image. if sum(padding) != 0: image = torch.nn.functional.pad(image, padding) # Save its orignal stride. image_stride = image.stride() matrix_strided = torch.as_strided(image, [ # Shape of the output matrix. image.size(0), # Batch size. patches_h, # indices for each patch. patches_w, image.size(1), # Channel. kernel_size[0], # indices for each pixel on a patch. kernel_size[1]], [ # Stride of the output matrix. image_stride[0], # Batch size dimension, keep using the old stride. image_stride[2] * stride[0], # Move patch in the height dimension. image_stride[3] * stride[1], # Move patch in the width dimension. image_stride[1], # Move to the next channel. image_stride[2], # Move to the next row. image_stride[3]]) # Move a pixel (on the width direction). 
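# --------------------------------------------------------------------------
# Aside (hedged sketch, not part of the library API): in the plain case
# (inserted_zeros=0, output_padding=0, symmetric padding), inplace_unfold()
# should match torch.nn.functional.unfold() up to a reshape; the names and
# default arguments below are illustrative only.
def _check_inplace_unfold(image, kernel_size=3, stride=2, padding=1):
    import torch.nn.functional as F
    # a: (batch, out_h, out_w, in_c, k, k).
    a = inplace_unfold(image, kernel_size=kernel_size, stride=stride,
                       padding=padding)
    # F.unfold gives (batch, in_c * k * k, out_h * out_w); reorder to match.
    b = F.unfold(image, kernel_size=kernel_size, stride=stride, padding=padding)
    b = b.transpose(1, 2).reshape(a.shape)
    return torch.allclose(a, b)
# --------------------------------------------------------------------------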
# Output shape is (batch_size, patches_h, patches_w, channel, kernel_height, kernel_width) if sum(output_padding) > 0: output_padding = tuple(p if p > 0 else None for p in output_padding) matrix_strided = matrix_strided[:, output_padding[2]:-output_padding[3] if output_padding[3] is not None else None, output_padding[0]:-output_padding[1] if output_padding[1] is not None else None, :, :, :] return matrix_strided def maybe_unfold_patches(d_tensor, last_A, alpha_lookup_idx=None): """ Utility function to handle patch mode bound propagation in activation functions. In patches mode, we need to unfold lower and upper slopes (as input "d_tensor"). In matrix mode we simply return. """ if d_tensor is None or last_A is None or isinstance(last_A, Tensor): return d_tensor # Shape for d_tensor: # sparse: [spec, batch, in_c, in_h, in_w] # non-sparse (partially shared): [out_c, batch, in_c, in_h, in_w] # non-sparse (not shared): [out_c*out_h*out_w, batch, in_c, in_h, in_w] # shared (independent of output spec): [1, batch, in_c, in_h, in_w] # The in_h, in_w dimensions must be unfolded as patches. origin_d_shape = d_tensor.shape if d_tensor.ndim == 6: # Merge the (out_h, out_w) dimensions. d_tensor = d_tensor.view(*origin_d_shape[:2], -1, *origin_d_shape[-2:]) d_shape = d_tensor.size() # Reshape to 4-D tensor to unfold. d_tensor = d_tensor.view(-1, *d_tensor.shape[-3:]) # unfold the slope matrix as patches. Patch shape is [spec * batch, out_h, out_w, in_c, H, W). d_unfolded = inplace_unfold( d_tensor, kernel_size=last_A.patches.shape[-2:], stride=last_A.stride, padding=last_A.padding, inserted_zeros=last_A.inserted_zeros, output_padding=last_A.output_padding) # Reshape to the original shape of d, e.g., for non-sparse it is (out_c, batch, out_h, out_w, in_c, H, W). d_unfolded_r = d_unfolded.view(*d_shape[:-3], *d_unfolded.shape[1:]) if last_A.unstable_idx is not None: # Here we have d for all output neurons, but we only need to select unstable ones. if d_unfolded_r.size(0) == 1 and alpha_lookup_idx is None: # Shared alpha, spasre alpha should not be used. # Note: only d_unfolded_r.size(0) == 1 cannot judge that it is a shared alpha, # since the activation may have no unstable neuron at all so # the first dim = 1 + # unstable neuron still equals to 1 if len(last_A.unstable_idx) == 3: # Broadcast the spec shape, so only need to select the rest dimensions. # Change shape to (out_h, out_w, batch, in_c, H, W) or (out_h, out_w, in_c, H, W). d_unfolded_r = d_unfolded_r.squeeze(0).permute(1, 2, 0, 3, 4, 5) d_unfolded_r = d_unfolded_r[last_A.unstable_idx[1], last_A.unstable_idx[2]] elif len(last_A.unstable_idx) == 4: # [spec, batch, output_h, output_w, input_c, H, W] # to [output_h, output_w, batch, in_c, H, W] d_unfolded_r = d_unfolded_r.squeeze(0).permute(1, 2, 0, 3, 4, 5) d_unfolded_r = d_unfolded_r[last_A.unstable_idx[2], last_A.unstable_idx[3]] else: raise NotImplementedError() # output shape: (unstable_size, batch, in_c, H, W). else: # The spec dimension may be sparse and contains unstable neurons for the spec layer only. if alpha_lookup_idx is None: # alpha is spec-dense. Possible because the number of unstable neurons may decrease. if last_A.output_shape[1] == d_unfolded_r.size(0): # Non spec-sparse, partially shared alpha among output channel dimension. # Shape after unfolding is (out_c, batch, out_h, out_w, in_c, patch_h, patch_w). d_unfolded_r = d_unfolded_r[last_A.unstable_idx[0], :, last_A.unstable_idx[1], last_A.unstable_idx[2]] else: # Non spec-sparse, non-shared alpha. 
# Shape after unfolding is (out_c*out_h*out_w, batch, out_h, out_w, in_c, patch_h, patch_w). # Reshaped to (out_c, out_h, out_w, batch, out_h, out_w, in_c, patch_h, patch_w). d_unfolded_r = d_unfolded_r.view(last_A.shape[0], last_A.shape[2], last_A.shape[3], -1, *d_unfolded_r.shape[2:]) # Select on all out_c, out_h, out_w dimensions. d_unfolded_r = d_unfolded_r[last_A.unstable_idx[0], last_A.unstable_idx[1], last_A.unstable_idx[2], :, last_A.unstable_idx[1], last_A.unstable_idx[2]] elif alpha_lookup_idx.ndim == 1: # sparse alpha: [spec, batch, in_c, in_h, in_w] # Partially shared alpha on the spec dimension - all output neurons on the same channel use the same alpha. # If alpha_lookup_idx is not None, we need to convert the sparse indices using alpha_lookup_idx. _unstable_idx = alpha_lookup_idx[last_A.unstable_idx[0]] # The selection is only used on the channel dimension. d_unfolded_r = d_unfolded_r[_unstable_idx, :, last_A.unstable_idx[1], last_A.unstable_idx[2]] elif alpha_lookup_idx is not None and alpha_lookup_idx.ndim == 3: # sparse alpha: [spec, batch, in_c, in_h, in_w] # We created alpha as full output shape; alpha not shared among channel dimension. # Shape of alpha is (out_c*out_h*out_w, batch, in_c, in_h, in_w), note that the first 3 dimensions # is merged into one to allow simpler selection. _unstable_idx = alpha_lookup_idx[ last_A.unstable_idx[0], last_A.unstable_idx[1], last_A.unstable_idx[2]] # d_unfolded_r shape from (out_c, batch, out_h, out_w, in_c, in_h, in_w) # to (out_c * out_h * out_w(sparse), batch, in_c, in_h, in_w) # Note that the dimensions out_h, out_w come from unfolding, not specs in alpha, so they will be selected # directly without translating using the lookup table. d_unfolded_r = d_unfolded_r[_unstable_idx, :, last_A.unstable_idx[1], last_A.unstable_idx[2]] # after selection we return (unstable_size, batch_size, in_c, H, W) return d_unfolded_r else: raise ValueError else: # A is not sparse. Alpha shouldn't be sparse as well. assert alpha_lookup_idx is None if last_A.patches.size(0) != d_unfolded_r.size(0) and d_unfolded_r.size(0) != 1: # Non-shared alpha, shape after unfolding is (out_c*out_h*out_w, batch, out_h, out_w, in_c, patch_h, patch_w). # Reshaped to (out_c, out_h*out_w, batch, out_h*out_w, in_c, patch_h, patch_w). d_unfolded_r = d_unfolded_r.reshape(last_A.shape[0], last_A.shape[2] * last_A.shape[3], -1, d_unfolded_r.shape[2] * d_unfolded_r.shape[3], *d_unfolded_r.shape[4:]) # Select the "diagonal" elements in the out_h*out_w dimension. # New shape is (out_c, batch, in_c, patch_h, patch_w, out_h*out_w) d_unfolded_r = d_unfolded_r.diagonal(offset=0, dim1=1, dim2=3) # New shape is (out_c, batch, in_c, patch_h, patch_w, out_h, out_w) d_unfolded_r = d_unfolded_r.view(*d_unfolded_r.shape[:-1], last_A.shape[2], last_A.shape[3]) # New shape is (out_c, batch, out_h, out_w, in_c, patch_h, patch_w) d_unfolded_r = d_unfolded_r.permute(0, 1, 5, 6, 2, 3, 4) # For sparse patches, the shape after unfold is (unstable_size, batch_size, in_c, H, W). # For regular patches, the shape after unfold is (out_c, batch, out_h, out_w, in_c, H, W). if d_unfolded_r.ndim != last_A.patches.ndim: # For the situation of d independent of output neuron (e.g., vanilla crown bound), which does not have # the out_h, out_w dimension and out_c = 1 (sepc). We added 1s for the out_h, out_w dimensions. 
d_unfolded_r = d_unfolded_r.unsqueeze(2).unsqueeze(-4) return d_unfolded_r def create_valid_mask(output_shape, device, dtype, kernel_size, stride, inserted_zeros, padding, output_padding, unstable_idx=None): """ Create a 0-1 mask of patch pieces shape (except batch dim), where 1 indicates the cells corresponding to valid image pixels Can be used to mask out unused A cells :return: tensor of batch pieces shape, containing the binary mask """ one_d = torch.ones( tuple(1 for i in output_shape[1:]), device=device, dtype=dtype ).expand(output_shape[1:]) # Add batch dimension. one_d = one_d.unsqueeze(0) # After unfolding, the shape is (1, out_h, out_w, in_c, h, w) one_d_unfolded = inplace_unfold( one_d, kernel_size=kernel_size, stride=stride, padding=padding, inserted_zeros=inserted_zeros, output_padding=output_padding) if unstable_idx is not None: # Move out_h, out_w dimension to the front for easier selection. ans = one_d_unfolded.permute(1, 2, 0, 3, 4, 5) # for sparse patches the shape is (unstable_size, batch, in_c, h, w). # Batch size is 1 so no need to select here. ans = ans[unstable_idx[1], unstable_idx[2]] else: # Append the spec dimension. ans = one_d_unfolded.unsqueeze(0) return ans ================================================ FILE: auto_LiRPA/perturbations.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### import json import math import os import numpy as np import torch from .utils import logger, eyeC from .patches import Patches, patches_to_matrix from .linear_bound import LinearBound from .concretize_func import constraints_solving, sort_out_constr_batches, construct_constraints class Perturbation: r""" Base class for a perturbation specification. Please see examples at `auto_LiRPA/perturbations.py`. Examples: * `PerturbationLpNorm`: Lp-norm (p>=1) perturbation. * `PerturbationL0Norm`: L0-norm perturbation. * `PerturbationSynonym`: Synonym substitution perturbation for NLP. """ def __init__(self): pass def set_eps(self, eps): self.eps = eps def concretize(self, x, A, sign=-1, aux=None): r""" Concretize bounds according to the perturbation specification. Args: x (Tensor): Input before perturbation. A (Tensor) : A matrix from LiRPA computation. sign (-1 or +1): If -1, concretize for lower bound; if +1, concretize for upper bound. aux (object, optional): Auxilary information for concretization. Returns: bound (Tensor): concretized bound with the shape equal to the clean output. """ raise NotImplementedError def init(self, x, aux=None, forward=False): r""" Initialize bounds before LiRPA computation. Args: x (Tensor): Input before perturbation. aux (object, optional): Auxilary information. forward (bool): It indicates whether forward mode LiRPA is involved. Returns: bound (LinearBound): Initialized bounds. center (Tensor): Center of perturbation. It can simply be `x`, or some other value. 
            aux (object, optional): Auxiliary information.
                Bound initialization may modify or add auxiliary information.
        """
        raise NotImplementedError


class PerturbationL0Norm(Perturbation):
    """Perturbation constrained by the L_0 norm.

    Assuming input data is in the range of 0-1.
    """

    def __init__(self, eps, x_L=None, x_U=None, ratio=1.0):
        self.eps = eps
        self.x_U = x_U
        self.x_L = x_L
        self.ratio = ratio

    def concretize(self, x, A, sign=-1, aux=None):
        if A is None:
            return None
        eps = math.ceil(self.eps)
        x = x.reshape(x.shape[0], -1, 1)
        center = A.matmul(x)
        x = x.reshape(x.shape[0], 1, -1)
        original = A * x.expand(x.shape[0], A.shape[-2], x.shape[2])
        neg_mask = A < 0
        pos_mask = A >= 0
        if sign == 1:
            A_diff = torch.zeros_like(A)
            # Changes that one weight can contribute to the value.
            A_diff[pos_mask] = A[pos_mask] - original[pos_mask]
            A_diff[neg_mask] = -original[neg_mask]
        else:
            A_diff = torch.zeros_like(A)
            A_diff[pos_mask] = original[pos_mask]
            A_diff[neg_mask] = original[neg_mask] - A[neg_mask]
        # FIXME: this assumes the input pixel range is between 0 and 1!
        A_diff, _ = torch.sort(A_diff, dim=2, descending=True)
        bound = center + sign * A_diff[:, :, :eps].sum(dim=2).unsqueeze(2) * self.ratio
        return bound.squeeze(2)

    def init(self, x, aux=None, forward=False):
        # For other norms, we pass in the BoundedTensor objects directly.
        x_L = x
        x_U = x
        if not forward:
            return LinearBound(None, None, None, None, x_L, x_U), x, None
        batch_size = x.shape[0]
        dim = x.reshape(batch_size, -1).shape[-1]
        eye = torch.eye(dim).to(x.device).unsqueeze(0).repeat(batch_size, 1, 1)
        lw = eye.reshape(batch_size, dim, *x.shape[1:])
        lb = torch.zeros_like(x).to(x.device)
        uw, ub = lw.clone(), lb.clone()
        return LinearBound(lw, lb, uw, ub, x_L, x_U), x, None

    def __repr__(self):
        return 'PerturbationLpNorm(norm=0, eps={})'.format(self.eps)


class PerturbationLpNorm(Perturbation):
    """Perturbation constrained by the L_p norm."""

    def __init__(self, eps=0, norm=np.inf, x_L=None, x_U=None, eps_min=0,
                 constraints=None, rearrange_constraints=False,
                 no_return_inf=False, timer=None):
        r"""
        Initialize a p-norm perturbation instance. There are two ways to initialize it:
        -- x_L, x_U: (Higher priority)
        -- eps : (Lower priority)
        If eps is used to initialize it, the centroid x (or x0 as in the member
        attribute) will be passed into the `init` and `concretize` functions.
        For the shape notations such as 'B' or 'X', please check the shape
        declaration at the beginning of concretize_func.py

        Args:
            eps (Tensor): The epsilon tensor; it represents the perturbation added to a BoundedTensor.
            norm (int or torch.inf): The p in p-norm perturbation.
            x_L (Tensor): Lower bound of input box, shape (B, *input_shape[1:]).
            x_U (Tensor): Upper bound of input box, shape (B, *input_shape[1:]).
            eps_min ()
            constraints (Tuple[Tensor, Tensor] or None): A tuple `(A, b)` representing per-batch linear constraints.
                - `A`: shape (B, N_constr, X)
                - `b`: shape (B, N_constr)
            rearrange_constraints (bool): Whether to rearrange constraints for better solver performance. Default: False.
            no_return_inf (bool): If True, infeasible batches will be excluded from `active_indices`.
                Otherwise, infeasible batches are still marked active. Default: False.
                Please check `constraints_solving` and `sort_out_constr_batches` for more details.
            timer (Timer): A timer recording the concretization time.
        """
        self.eps = eps
        self.x0 = None
        # For p = inf, pre-computing x0 and eps accelerates the concretize function.
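# --------------------------------------------------------------------------
# Aside (hedged usage sketch following the standard auto_LiRPA workflow;
# `model_ori` and the epsilon value are placeholders): a PerturbationLpNorm is
# normally attached to an input via BoundedTensor before compute_bounds().
def _example_linf_perturbation(model_ori, x, eps=0.03):
    from auto_LiRPA import BoundedModule, BoundedTensor
    lirpa_model = BoundedModule(model_ori, torch.empty_like(x))
    ptb = PerturbationLpNorm(norm=np.inf, eps=eps)  # or pass x_L=..., x_U=...
    bounded_x = BoundedTensor(x, ptb)
    lb, ub = lirpa_model.compute_bounds(x=(bounded_x,), method='CROWN')
    return lb, ub
# --------------------------------------------------------------------------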
if norm == np.inf and x_L is not None and x_U is not None: self.eps = (x_U - x_L) / 2 self.x0 = (x_U + x_L) / 2 # x0_act and eps_act stands for x0 and eps matrix for batches with active constraints self.x0_act = None # shape (batchsize, *X_shape) self.eps_act = None # shape (batchsize, *X_shape) # x0_sparse_act and eps_sparse_act are the active sparse x0 and eps matrix when sparse perturbation is enabled. # Check init_sparse_linf to see how sparse x0, eps, x0_act, eps_act are created. self.x0_sparse_act = None # shape (batchsize, *X_sparse_shape) self.eps_sparse_act = None # shape (batchsize, *X_sparse_shape) self.eps_min = eps_min self.norm = norm self.dual_norm = 1 if (norm == np.inf) else (np.float64(1.0) / (1 - 1.0 / self.norm)) self.x_L = x_L self.x_U = x_U self.sparse = False self.timer = timer self.aux_lb = None self.aux_ub = None self.rearrange_constraints = rearrange_constraints # constraints is a tuple containing both the coefficient matrix and bias term # of the constraints. The constraints would appear in the form of: # A_c * x + b_c <= 0 # Coefficient matrix will be reshaped into (batchsize, # of constraints, # input_dim). Bias term will be reshaped into (batchsize, # of constraints) # also see in `constraints_solving` in constraints_solver.py # Pre-process the constraints. self.constraints, self.sorted_out_batches = sort_out_constr_batches(x_L, x_U, constraints, rearrange_constraints=rearrange_constraints, no_return_inf=no_return_inf) # The indices of hidden neurons to apply constraints. self.objective_indices = None # shape: (batchsize, num_of_neurons) if self.constraints is None or self.constraints[0].numel() == 0: self._constraints_enable = False else: self._constraints_enable = True self.no_return_inf = no_return_inf self._use_grad = False def get_input_bounds(self, x, A): if self.sparse: if self.x_L_sparse.shape[-1] == A.shape[-1]: x_L, x_U = self.x_L_sparse, self.x_U_sparse act_x0, act_eps = self.x0_sparse_act, self.eps_sparse_act else: # In backward mode, A is not sparse. x_L, x_U = self.x_L, self.x_U act_x0, act_eps = self.x0_act, self.eps_act else: x_L = x - self.eps if self.x_L is None else self.x_L x_U = x + self.eps if self.x_U is None else self.x_U act_x0, act_eps = self.x0_act, self.eps_act return x_L, x_U, act_x0, act_eps def get_constraints(self, A): if self.constraints is None: return None if self.sparse and self.x_L_sparse.shape[-1] == A.shape[-1]: return self.constraints_sparse else: return self.constraints def concretize_matrix(self, x, A, sign, constraints=None): # If A is an identity matrix, we will handle specially. if not isinstance(A, eyeC): # A has (Batch, spec, *input_size). For intermediate neurons, spec is *neuron_size. A = A.reshape(A.shape[0], A.shape[1], -1) if self.norm == np.inf: x_L, x_U, act_x0, act_eps = self.get_input_bounds(x, A) if constraints is None: constraints = self.get_constraints(A) # The original code for matrix concretize has been merged into `constraints_solving`. # Pick out auxiliary bound based on the sign. 
aux_bounds = self.aux_lb if sign == -1.0 else self.aux_ub results = constraints_solving(x_L, x_U, A, constraints, sign, sorted_out_batches=self.sorted_out_batches, objective_indices=self.objective_indices, constraints_enable=self._constraints_enable, no_return_inf=self.no_return_inf, timer=self.timer, aux_bounds=aux_bounds, act_x0=act_x0, act_eps=act_eps, use_grad=self._use_grad) if self.no_return_inf: # return: (bound, infeasible_bounds) bound = results[0] infeasible_bounds = results[1] self.add_infeasible_batches(infeasible_bounds) else: # return: bound bound = results else: x = x.reshape(x.shape[0], -1, 1) if not isinstance(A, eyeC): # Find the upper and lower bounds via dual norm. deviation = A.norm(self.dual_norm, -1) * self.eps bound = A.matmul(x) + sign * deviation.unsqueeze(-1) else: # A is an identity matrix. Its norm is all 1. bound = x + sign * self.eps bound = bound.squeeze(-1) return bound def concretize_patches(self, x, A, sign): if self.norm == np.inf: x_L, x_U, _, _, = self.get_input_bounds(x, A) # Here we should not reshape # Find the uppwer and lower bound similarly to IBP. center = (x_U + x_L) / 2.0 diff = (x_U - x_L) / 2.0 if not A.identity == 1: bound = A.matmul(center) bound_diff = A.matmul(diff, patch_abs=True) if sign == 1: bound += bound_diff elif sign == -1: bound -= bound_diff else: raise ValueError("Unsupported Sign") else: # A is an identity matrix. No need to do this matmul. bound = center + sign * diff return bound else: # Lp norm input_shape = x.shape if not A.identity: # Find the upper and lower bounds via dual norm. # matrix has shape # (batch_size, out_c * out_h * out_w, input_c, input_h, input_w) # or (batch_size, unstable_size, input_c, input_h, input_w) matrix = patches_to_matrix( A.patches, input_shape, A.stride, A.padding, A.output_shape, A.unstable_idx) # Note that we should avoid reshape the matrix. # Due to padding, matrix cannot be reshaped without copying. deviation = matrix.norm(p=self.dual_norm, dim=(-3,-2,-1)) * self.eps # Bound has shape (batch, out_c * out_h * out_w) or (batch, unstable_size). bound = torch.einsum('bschw,bchw->bs', matrix, x) + sign * deviation if A.unstable_idx is None: # Reshape to (batch, out_c, out_h, out_w). bound = bound.view(matrix.size(0), A.patches.size(0), A.patches.size(2), A.patches.size(3)) else: # A is an identity matrix. Its norm is all 1. 
bound = x + sign * self.eps return bound def concretize(self, x, A, sign=-1, constraints=None, aux=None): """Given an variable x and its bound matrix A, compute worst case bound according to Lp norm.""" if A is None: return None if isinstance(A, eyeC) or isinstance(A, torch.Tensor): ret = self.concretize_matrix(x, A, sign, constraints) elif isinstance(A, Patches): ret = self.concretize_patches(x, A, sign) else: raise NotImplementedError() if ret.ndim > 2: ret = ret.reshape(A.shape[1], -1) return ret def init_sparse_linf(self, x, x_L, x_U): """ Sparse Linf perturbation where only a few dimensions are actually perturbed""" self.sparse = True batch_size = x_L.shape[0] perturbed = (x_U > x_L).int() logger.debug(f'Perturbed: {perturbed.sum()}') lb = ub = x_L * (1 - perturbed) # x_L=x_U holds when perturbed=0 perturbed = perturbed.view(batch_size, -1) index = torch.cumsum(perturbed, dim=-1) dim = max(perturbed.view(batch_size, -1).sum(dim=-1).max(), 1) self.x_L_sparse = torch.zeros(batch_size, dim + 1).to(x_L) self.x_L_sparse.scatter_(dim=-1, index=index, src=(x_L - lb).view(batch_size, -1), reduce='add') self.x_U_sparse = torch.zeros(batch_size, dim + 1).to(x_U) self.x_U_sparse.scatter_(dim=-1, index=index, src=(x_U - ub).view(batch_size, -1), reduce='add') self.x_L_sparse, self.x_U_sparse = self.x_L_sparse[:, 1:], self.x_U_sparse[:, 1:] # --- create x0 and eps for Lp Norm self.x0_sparse = (self.x_L_sparse + self.x_U_sparse) / 2 self.eps_sparse = (self.x_U_sparse - self.x_L_sparse) / 2 if self.sorted_out_batches is not None: active_indices = self.sorted_out_batches["active_indices"] self.x0_sparse_act = self.x0_sparse[active_indices].unsqueeze(-1) self.eps_sparse_act = self.eps_sparse[active_indices].unsqueeze(-1) lw = torch.zeros(batch_size, dim + 1, perturbed.shape[-1], device=x.device) perturbed = perturbed.to(torch.get_default_dtype()) lw.scatter_(dim=1, index=index.unsqueeze(1), src=perturbed.unsqueeze(1)) lw = uw = lw[:, 1:, :].view(batch_size, dim, *x.shape[1:]) print(f'Using Linf sparse perturbation. Perturbed dimensions: {dim}.') print(f'Avg perturbation: {(self.x_U_sparse - self.x_L_sparse).mean()}') # When sparse linf is enabled, the input x perturbation would change its shape # Hence, the shape of constraints_A should change accordingly. # But for the final layer, we still need the dense linf, and use the original (dense) constraints if self.constraints is not None: # constraints_A: (batchsize, n_constraints, x_dim) constraints_A, constraints_b = self.constraints # reversed_lw: (batchsize, x_dim, sparse_dim) reversed_lw = lw.reshape((batch_size, dim, -1)).transpose(1, 2) lb_act = lb # When pre-processing the constraints, we only kept the active ones. # Hence, reversed_lw and lb_act should also be re-collected. 
active_indices = self.sorted_out_batches["active_indices"] reversed_lw = reversed_lw[active_indices] lb_act = lb_act[active_indices] # reversed lw will sort out the sparse dimensions out of all x dimension new_constraints_A = constraints_A.bmm(reversed_lw) # Besides original constraint_b, should also include the a*x terms where x is not perturbed # new_constraints_b = constraints_b + torch.einsum("bcx, bx -> bc", constraints_A, lb_act.flatten(1)) new_constraints_b = constraints_b self.constraints_sparse = (new_constraints_A, new_constraints_b) return LinearBound( lw, lb, uw, ub, x_L, x_U), x, None def init(self, x, aux=None, forward=False): self.sparse = False if self.norm == np.inf: x_L = x - self.eps if self.x_L is None else self.x_L x_U = x + self.eps if self.x_U is None else self.x_U else: if int(os.environ.get('AUTOLIRPA_L2_DEBUG', 0)) == 1: # FIXME Experimental code. Need to change the IBP code also. x_L = x - self.eps if self.x_L is None else self.x_L x_U = x + self.eps if self.x_U is None else self.x_U else: # FIXME This causes confusing lower bound and upper bound # For other norms, we pass in the BoundedTensor objects directly. x_L = x_U = x if self.x_L is not None and self.x_U is not None: self.x0 = (self.x_L + self.x_U) / 2 else: self.x0 = x.data if self.sorted_out_batches is not None and self.sorted_out_batches.get("active_indices") is not None: active_indices = self.sorted_out_batches["active_indices"] self.x0_act = self.x0[active_indices].flatten(1).unsqueeze(-1) self.eps_act = self.eps[active_indices].flatten(1).unsqueeze(-1) if not forward: return LinearBound( None, None, None, None, x_L, x_U), x, None if (self.norm == np.inf and x_L.numel() > 1 and (x_L == x_U).sum() > 0.5 * x_L.numel()): return self.init_sparse_linf(x, x_L, x_U) batch_size = x.shape[0] dim = x.reshape(batch_size, -1).shape[-1] lb = ub = torch.zeros_like(x) eye = torch.eye(dim).to(x).expand(batch_size, dim, dim) lw = uw = eye.reshape(batch_size, dim, *x.shape[1:]) return LinearBound( lw, lb, uw, ub, x_L, x_U), x, None def add_infeasible_batches(self, infeasible_batches): r""" Synchronize the `infeasible_batches` tensor between the global graph and the local perturbation node. If the computation graph includes multiple perturbed inputs, the BoundedModule (entire network) maintains a global `infeasible_batches` tensor, while each perturbed input (root) keeps its own local copy. - Before concretization: copy the global tensor to the local one. - After concretization: propagate updates from the local tensor back to the global tensor. Args: infeasible_batches: A boolean vector with shape (batchsize, ). A True value indicates that a batch is infeasible given its constraints. 
""" if self.constraints is not None and infeasible_batches is not None and infeasible_batches.any(): if self.sorted_out_batches["infeasible_batches"] is None: self.sorted_out_batches["infeasible_batches"] = infeasible_batches else: infeasible_batches = infeasible_batches | self.sorted_out_batches["infeasible_batches"] self.sorted_out_batches["infeasible_batches"] = infeasible_batches active_indices = self.sorted_out_batches["active_indices"] B_act = active_indices.numel() active_feasible_mask = (~infeasible_batches)[active_indices] if active_feasible_mask.sum() < B_act: self.sorted_out_batches["active_indices"] = active_indices[active_feasible_mask] self.x0_act = self.x0_act[active_feasible_mask] self.eps_act = self.eps_act[active_feasible_mask] constraints_A, constraints_b = self.constraints constraints_A = constraints_A[active_feasible_mask] constraints_b = constraints_b[active_feasible_mask] self.constraints = (constraints_A, constraints_b) def add_objective_indices(self, objective_indices): if self.constraints is not None: self.objective_indices = objective_indices @property def constraints_enable(self): ''' Enable / Disable the constrained concretize mode, regardless whether constraints is None or not. ''' return self._constraints_enable @constraints_enable.setter def constraints_enable(self, enable: bool): self._constraints_enable = enable @constraints_enable.deleter def constraints_enable(self): del self._constraints_enable @property def use_grad(self): ''' Enable / Disable the constrained concretize with gradient. ''' return self._use_grad @use_grad.setter def use_grad(self, use_grad: bool): self._use_grad = use_grad @use_grad.deleter def use_grad(self): del self._use_grad def add_aux_bounds(self, aux_lb, aux_ub): self.aux_lb = aux_lb self.aux_ub = aux_ub def clear_aux_bounds(self): self.aux_lb = None self.aux_ub = None def reset_constraints(self, constraints, decision_thresh): r""" Reset the constraints of this perturbation. Also will call `sort_out_constr_batches` to preprocess the constraints. Be sure not to reset with the same constraints input repeatedly. """ # We have to enable the gradient computation for the constraints # when using constraints_solving within alpha crown. self.use_grad = True constraints = construct_constraints(constraints[0], constraints[1], decision_thresh, self.x_L.shape[0], self.x_L.flatten(1).shape[1]) self.constraints, self.sorted_out_batches = sort_out_constr_batches(self.x_L, self.x_U, constraints, rearrange_constraints=self.rearrange_constraints, no_return_inf=self.no_return_inf) def __repr__(self): if self.norm == np.inf: if self.x_L is None and self.x_U is None: return f'PerturbationLpNorm(norm=inf, eps={self.eps})' else: return f'PerturbationLpNorm(norm=inf, eps={self.eps}, x_L={self.x_L}, x_U={self.x_U})' else: return f'PerturbationLpNorm(norm={self.norm}, eps={self.eps})' class PerturbationLinear(Perturbation): """ Perturbation defined by a Linear transformation. args: lower_A: Lower bound matrix of shape (B, output_dim, input_dim) upper_A: Upper bound matrix of shape (B, output_dim, input_dim) lower_b: Lower bound bias of shape (B, output_dim) upper_b: Upper bound bias of shape (B, output_dim) input_lb: Input lower bound of shape (B, input_dim) input_ub: Input upper bound of shape (B, input_dim) x_L: Output lower bound of shape (B, output_dim) x_U: Output upper bound of shape (B, output_dim) x_L and x_U can be None, in which case they will be computed from the other parameters. 
""" def __init__(self, lower_A, upper_A, lower_b, upper_b, input_lb, input_ub, x_L=None, x_U=None): super(PerturbationLinear, self).__init__() self.lower_A = lower_A self.upper_A = upper_A self.lower_b = lower_b.unsqueeze(-1) if lower_b is not None else None self.upper_b = upper_b.unsqueeze(-1) if upper_b is not None else None self.input_lb = input_lb.unsqueeze(-1) if input_lb is not None else None self.input_ub = input_ub.unsqueeze(-1) if input_ub is not None else None if x_L is None or x_U is None: mid = (self.input_lb + self.input_ub) / 2 diff = (self.input_ub - self.input_lb) / 2 self.x_U = (self.upper_A @ mid + torch.abs(self.upper_A) @ diff + self.upper_b).squeeze(-1) self.x_L = (self.lower_A @ mid - torch.abs(self.lower_A) @ diff + self.lower_b).squeeze(-1) else: self.x_L = x_L self.x_U = x_U def concretize(self, x, A, sign=-1, aux=None): if A is None: return None else: A_pos = torch.clamp(A, min=0) A_neg = torch.clamp(A, max=0) center = (self.input_lb + self.input_ub) / 2 diff = (self.input_ub - self.input_lb) / 2 if sign == 1: composite_A = A_pos @ self.upper_A + A_neg @ self.lower_A composite_b = A_pos @ self.upper_b + A_neg @ self.lower_b bound = composite_A @ center + torch.abs(composite_A) @ diff + composite_b else: composite_A = A_pos @ self.lower_A + A_neg @ self.upper_A composite_b = A_pos @ self.lower_b + A_neg @ self.upper_b bound = composite_A @ center - torch.abs(composite_A) @ diff + composite_b return bound.squeeze(-1) def init(self, x, aux=None, forward=False): if not forward: return LinearBound(None, None, None, None, self.x_L, self.x_U), x, None else: raise NotImplementedError("Linear perturbation does not support forward mode.") class PerturbationSynonym(Perturbation): def __init__(self, budget, eps=1.0, use_simple=False): super(PerturbationSynonym, self).__init__() self._load_synonyms() self.budget = budget self.eps = eps self.use_simple = use_simple self.model = None self.train = False def __repr__(self): return (f'perturbation(Synonym-based word substitution ' f'budget={self.budget}, eps={self.eps})') def _load_synonyms(self, path='data/synonyms.json'): with open(path) as file: self.synonym = json.loads(file.read()) logger.info('Synonym list loaded for {} words'.format(len(self.synonym))) def set_train(self, train): self.train = train def concretize(self, x, A, sign, aux): assert(self.model is not None) x_rep, mask, can_be_replaced = aux batch_size, length, dim_word = x.shape[0], x.shape[1], x.shape[2] dim_out = A.shape[1] max_num_cand = x_rep.shape[2] mask_rep = torch.tensor(can_be_replaced, dtype=torch.get_default_dtype(), device=A.device) num_pos = int(np.max(np.sum(can_be_replaced, axis=-1))) update_A = A.shape[-1] > num_pos * dim_word if update_A: bias = torch.bmm(A, (x * (1 - mask_rep).unsqueeze(-1)).reshape(batch_size, -1, 1)).squeeze(-1) else: bias = 0. 
A = A.reshape(batch_size, dim_out, -1, dim_word) A_new, x_new, x_rep_new, mask_new = [], [], [], [] zeros_A = torch.zeros(dim_out, dim_word, device=A.device) zeros_w = torch.zeros(dim_word, device=A.device) zeros_rep = torch.zeros(max_num_cand, dim_word, device=A.device) zeros_mask = torch.zeros(max_num_cand, device=A.device) for t in range(batch_size): cnt = 0 for i in range(0, length): if can_be_replaced[t][i]: if update_A: A_new.append(A[t, :, i, :]) x_new.append(x[t][i]) x_rep_new.append(x_rep[t][i]) mask_new.append(mask[t][i]) cnt += 1 if update_A: A_new += [zeros_A] * (num_pos - cnt) x_new += [zeros_w] * (num_pos - cnt) x_rep_new += [zeros_rep] * (num_pos - cnt) mask_new += [zeros_mask] * (num_pos - cnt) if update_A: A = torch.cat(A_new).reshape(batch_size, num_pos, dim_out, dim_word).transpose(1, 2) x = torch.cat(x_new).reshape(batch_size, num_pos, dim_word) x_rep = torch.cat(x_rep_new).reshape(batch_size, num_pos, max_num_cand, dim_word) mask = torch.cat(mask_new).reshape(batch_size, num_pos, max_num_cand) length = num_pos A = A.reshape(batch_size, A.shape[1], length, -1).transpose(1, 2) x = x.reshape(batch_size, length, -1, 1) if sign == 1: cmp, init = torch.max, -1e30 else: cmp, init = torch.min, 1e30 init_tensor = torch.ones(batch_size, dim_out).to(x.device) * init dp = [[init_tensor] * (self.budget + 1) for i in range(0, length + 1)] dp[0][0] = torch.zeros(batch_size, dim_out).to(x.device) A = A.reshape(batch_size * length, A.shape[2], A.shape[3]) Ax = torch.bmm( A, x.reshape(batch_size * length, x.shape[2], x.shape[3]) ).reshape(batch_size, length, A.shape[1]) Ax_rep = torch.bmm( A, x_rep.reshape(batch_size * length, max_num_cand, x.shape[2]).transpose(-1, -2) ).reshape(batch_size, length, A.shape[1], max_num_cand) Ax_rep = Ax_rep * mask.unsqueeze(2) + init * (1 - mask).unsqueeze(2) Ax_rep_bound = cmp(Ax_rep, dim=-1).values if self.use_simple and self.train: return torch.sum(cmp(Ax, Ax_rep_bound), dim=1) + bias for i in range(1, length + 1): dp[i][0] = dp[i - 1][0] + Ax[:, i - 1] for j in range(1, self.budget + 1): dp[i][j] = cmp( dp[i - 1][j] + Ax[:, i - 1], dp[i - 1][j - 1] + Ax_rep_bound[:, i - 1] ) dp = torch.cat(dp[length], dim=0).reshape(self.budget + 1, batch_size, dim_out) return cmp(dp, dim=0).values + bias def init(self, x, aux=None, forward=False): tokens, batch = aux self.tokens = tokens # DEBUG assert(len(x.shape) == 3) batch_size, length, dim_word = x.shape[0], x.shape[1], x.shape[2] max_pos = 1 can_be_replaced = np.zeros((batch_size, length), dtype=bool) self._build_substitution(batch) for t in range(batch_size): cnt = 0 candidates = batch[t]['candidates'] # for transformers if tokens[t][0] == '[CLS]': candidates = [[]] + candidates + [[]] for i in range(len(tokens[t])): if tokens[t][i] == '[UNK]' or \ len(candidates[i]) == 0 or tokens[t][i] != candidates[i][0]: continue for w in candidates[i][1:]: if w in self.model.vocab: can_be_replaced[t][i] = True cnt += 1 break max_pos = max(max_pos, cnt) dim = max_pos * dim_word if forward: eye = torch.eye(dim_word).to(x.device) lw = torch.zeros(batch_size, dim, length, dim_word).to(x.device) lb = torch.zeros_like(x).to(x.device) word_embeddings = self.model.word_embeddings.weight vocab = self.model.vocab x_rep = [[[] for i in range(length)] for t in range(batch_size)] max_num_cand = 1 for t in range(batch_size): candidates = batch[t]['candidates'] # for transformers if tokens[t][0] == '[CLS]': candidates = [[]] + candidates + [[]] cnt = 0 for i in range(length): if can_be_replaced[t][i]: word_embed = 
word_embeddings[vocab[tokens[t][i]]] # positional embedding and token type embedding other_embed = x[t, i] - word_embed if forward: lw[t, (cnt * dim_word):((cnt + 1) * dim_word), i, :] = eye lb[t, i, :] = torch.zeros_like(word_embed) for w in candidates[i][1:]: if w in self.model.vocab: x_rep[t][i].append( word_embeddings[self.model.vocab[w]] + other_embed) max_num_cand = max(max_num_cand, len(x_rep[t][i])) cnt += 1 else: if forward: lb[t, i, :] = x[t, i, :] if forward: uw, ub = lw, lb else: lw = lb = uw = ub = None zeros = torch.zeros(dim_word, device=x.device) x_rep_, mask = [], [] for t in range(batch_size): for i in range(length): x_rep_ += x_rep[t][i] + [zeros] * (max_num_cand - len(x_rep[t][i])) mask += [1] * len(x_rep[t][i]) + [0] * (max_num_cand - len(x_rep[t][i])) x_rep_ = torch.cat(x_rep_).reshape(batch_size, length, max_num_cand, dim_word) mask = torch.tensor(mask, dtype=torch.get_default_dtype(), device=x.device)\ .reshape(batch_size, length, max_num_cand) x_rep_ = x_rep_ * self.eps + x.unsqueeze(2) * (1 - self.eps) inf = 1e20 lower = torch.min(mask.unsqueeze(-1) * x_rep_ + (1 - mask).unsqueeze(-1) * inf, dim=2).values upper = torch.max(mask.unsqueeze(-1) * x_rep_ + (1 - mask).unsqueeze(-1) * (-inf), dim=2).values lower = torch.min(lower, x) upper = torch.max(upper, x) return LinearBound(lw, lb, uw, ub, lower, upper), x, (x_rep_, mask, can_be_replaced) def _build_substitution(self, batch): for example in batch: if not 'candidates' in example or example['candidates'] is None: candidates = [] tokens = example['sentence'].strip().lower().split(' ') for i in range(len(tokens)): _cand = [] if tokens[i] in self.synonym: for w in self.synonym[tokens[i]]: if w in self.model.vocab: _cand.append(w) if len(_cand) > 0: _cand = [tokens[i]] + _cand candidates.append(_cand) example['candidates'] = candidates ================================================ FILE: auto_LiRPA/solver_module.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### from .bound_ops import * from typing import TYPE_CHECKING if TYPE_CHECKING: from .bound_general import BoundedModule def build_solver_module(self: 'BoundedModule', x=None, C=None, interm_bounds=None, final_node_name=None, model_type="mip", solver_pkg="gurobi", set_input=True): r"""build lp/mip solvers in general graph. Args: x: inputs, a list of BoundedTensor. If set to None, we reuse exisint bounds that were previously computed in compute_bounds(). C (Tensor): The specification matrix that can map the output of the model with an additional linear layer. This is usually used for maping the logits output of the model to classification margins. interm_bounds: if specified, will replace existing intermediate layer bounds. Otherwise we reuse exising intermediate bounds. 
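        Example (hedged usage sketch; assumes gurobipy is installed and imported
        as `grb`, and that compute_bounds() has already been called so that
        intermediate layer bounds exist and can be reused):

            out_vars = lirpa_model.build_solver_module(model_type='mip')
            lirpa_model.solver_model.setObjective(out_vars[0], grb.GRB.MINIMIZE)
            lirpa_model.solver_model.optimize()
            print(lirpa_model.solver_model.objVal)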
final_node_name (String): the name for the target layer to optimize solver_pkg (String): the backbone of the solver, default gurobi, also support scipy Returns: output vars (list): a list of final nodes to optimize """ # self.root_names: list of root node name # self.final_name: list of output node name # self.final_node: output module # .input: a list of input modules of this layer module # .solver_vars: a list of gurobi vars of every layer module # list with conv shape if conv layers, otherwise flattened # if last layer we need to be careful with: # C: specification matrix # .is_input_perturbed(1) if x is not None: assert interm_bounds is not None # Set the model to use new intermediate layer bounds, ignore the original ones. self.set_input(x, interm_bounds=interm_bounds) roots = [self[name] for name in self.root_names] # create interval ranges for input and other weight parameters for i in range(len(roots)): # if isinstance(root[i], BoundInput) and not isinstance(root[i], BoundParams): if type(roots[i]) is BoundInput: # create input vars for gurobi self.model if set_input: inp_gurobi_vars = self._build_solver_input(roots[i]) else: value = roots[i].forward() # regular weights roots[i].solver_vars = value final = self.final_node() if final_node_name is None else self[final_node_name] # backward propagate every layer including last layer self._build_solver_general(node=final, C=C, model_type=model_type, solver_pkg=solver_pkg) # a list of output solver vars return final.solver_vars def _build_solver_general(self: 'BoundedModule', node: Bound, C=None, model_type="mip", solver_pkg="gurobi"): if not hasattr(node, 'solver_vars'): if not node.perturbed: # if not perturbed, just forward node.solver_vars = self.get_forward_value(node) return node.solver_vars for n in node.inputs: self._build_solver_general(n, C=C, model_type=model_type) inp = [n_pre.solver_vars for n_pre in node.inputs] if C is not None and isinstance(node, BoundLinear) and\ not node.is_input_perturbed(1) and self.final_name == node.name: # when node is the last layer # merge the last BoundLinear node with the specification, # available when weights of this layer are not perturbed solver_vars = node.build_solver(*inp, model=self.solver_model, C=C, model_type=model_type, solver_pkg=solver_pkg) else: solver_vars = node.build_solver(*inp, model=self.solver_model, C=None, model_type=model_type, solver_pkg=solver_pkg) # just return output node gurobi vars return solver_vars def _reset_solver_vars(self: 'BoundedModule', node: Bound, iteration=True): if hasattr(node, 'solver_vars'): del node.solver_vars if iteration: if hasattr(node, 'inputs'): for n in node.inputs: self._reset_solver_vars(n) def _reset_solver_model(self: 'BoundedModule'): self.solver_model.remove(self.solver_model.getVars()) self.solver_model.remove(self.solver_model.getConstrs()) self.solver_model.update() def _build_solver_input(self: 'BoundedModule', node): ## Do the input layer, which is a special case assert isinstance(node, BoundInput) assert node.perturbation is not None if self.solver_model is None: self.solver_model = grb.Model() # zero var will be shared within the solver model zero_var = self.solver_model.addVar(lb=0, ub=0, obj=0, vtype=grb.GRB.CONTINUOUS, name='zero') one_var = self.solver_model.addVar(lb=1, ub=1, obj=0, vtype=grb.GRB.CONTINUOUS, name='one') neg_one_var = self.solver_model.addVar(lb=-1, ub=-1, obj=0, vtype=grb.GRB.CONTINUOUS, name='neg_one') x_L = node.value - node.perturbation.eps if node.perturbation.x_L is None else node.perturbation.x_L 
x_U = node.value + node.perturbation.eps if node.perturbation.x_U is None else node.perturbation.x_U x_L = x_L.min(dim=0).values x_U = x_U.max(dim=0).values input_shape = x_L.shape name_array = [f'inp_{idx}' for idx in range(prod(input_shape))] inp_gurobi_vars_dict = self.solver_model.addVars(*input_shape, lb=x_L, ub=x_U, obj=0, vtype=grb.GRB.CONTINUOUS, name=name_array) inp_gurobi_vars = np.empty(input_shape, dtype=object) for idx in inp_gurobi_vars_dict: inp_gurobi_vars[idx] = inp_gurobi_vars_dict[idx] inp_gurobi_vars = inp_gurobi_vars.tolist() # Flatten the input solver_vars. def flatten(x): if isinstance(x, list): result = [] for item in x: result.extend(flatten(item)) return result else: return [x] # Add extra constraints for the inputs if the perturbation norm is not L_inf. if node.perturbation.norm != float("inf"): if isinstance(inp_gurobi_vars, (list, tuple)): flat_inp_gurobi_vars = flatten(inp_gurobi_vars) else: flat_inp_gurobi_vars = inp_gurobi_vars if hasattr(node.value[0], "flatten"): flat_node_value = node.value.flatten().tolist() else: flat_node_value = node.value assert len(flat_inp_gurobi_vars) == len(flat_node_value), "The input doesn't match the variables" if node.perturbation.norm == 2: # For L2 norm, we directly add a quadratic constraint for cplex compatibility. # TODO: Compare efficiency with the second method below. If the second method is faster, # we should use it for L2 norm by default (when cplex is not used). print(f'setup L2 constraint for input with radius {node.perturbation.eps}.') quad_expr = grb.QuadExpr() for var, val in zip(flat_inp_gurobi_vars, flat_node_value): quad_expr.add((var - val) * (var - val)) self.solver_model.addQConstr( quad_expr <= node.perturbation.eps ** 2, name="l2_perturbation" ) else: print(f'setup Lp constraint for input with radius {node.perturbation.eps}.') n = len(flat_inp_gurobi_vars) # Create variables to set up the lp constraint. # We set input = x0 + delta where delta is under the Lp norm constraint. senses = ['='] * n delta_vars = self.solver_model.addVars( n, lb=-grb.GRB.INFINITY, ub=grb.GRB.INFINITY, name="delta" ) diff = -np.array(flat_node_value) vars_list = list(delta_vars.values()) + flat_inp_gurobi_vars self.solver_model.update() A = np.hstack([np.eye(n), -np.eye(n)]) # Add constraints input = x0 + delta as delta - input = -x0. # Here x0 is "flat_node_value" and input is "flat_inp_gurobi_vars". self.solver_model.addMConstr(A, vars_list, senses, diff) # Set up the lp constraint here: \| delta \|_p <= eps. lp_norm_var = self.solver_model.addVar( lb=0, vtype=grb.GRB.CONTINUOUS, name="lp_norm" ) self.solver_model.addGenConstrNorm( lp_norm_var, delta_vars, node.perturbation.norm, name="lp_norm_constr" ) self.solver_model.addConstr( lp_norm_var <= node.perturbation.eps, name="lp_perturbation_radius" ) node.solver_vars = inp_gurobi_vars # Save the gurobi input variables so that we can later extract primal values in input space easily. 
self.input_vars = inp_gurobi_vars self.solver_model.update() return inp_gurobi_vars ================================================ FILE: auto_LiRPA/tools.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### import torch from graphviz import Digraph import shutil import re from typing import TYPE_CHECKING, List if TYPE_CHECKING: from .bound_general import BoundedModule def visualize(self: 'BoundedModule', output_path, print_bounds=False): r"""A visualization tool for BoundedModule. If dot engine is available in the system enviornment, it renders the graph and output {output_path}.png. Otherwise, it output a {output_path}.dot. Args: output_path (str): The path to save the graph (without file extension). print_bounds (bool): Whether to display the mean width of the bounds for each node. """ nodes = list(self.nodes()) # Create a directed graph dot = Digraph(format='png', engine='dot') # Add nodes with optional attributes for node in nodes: # we name the Graphviz nodes with the sanitized node name, # while keeping the original name in the label which is displayed in the graph. export_node_name = sanitize_graphviz_name(node.name) label = f"""<
{node.name}
{node.__class__.__name__}
{ tuple(node.output_shape) if node.output_shape is not None else None}
>""" if print_bounds: # Display the mean width of the bounds) # (Both the empirical bound from forward value and the computed bound if available) label = f"""<
{node.name}
{node.__class__.__name__}
{ tuple(node.output_shape) if node.output_shape is not None else None}
{ (node.forward_value.max(dim=0)[0] - node.forward_value.min(dim=0)[0]).to(dtype=torch.float).mean().item() if ( node.perturbed and hasattr(node, "forward_value") and isinstance(node.forward_value, torch.Tensor)) else None}
{ (node.upper - node.lower).to(dtype=torch.float).mean().item() if ( node.perturbed and hasattr(node, "lower") and hasattr(node, "upper") and node.lower is not None and node.upper is not None) else None}
>""" # perturbed nodes are highlighted in grey if getattr(node, "perturbed", False): style_attrs = {'style': 'filled', 'fillcolor': 'lightgrey'} else: style_attrs = {} if node.__class__.__name__ in ["BoundParams", "boundConstant", "BoundBuffers"]: dot.node(export_node_name, label=label, fontsize="8", width="0.5", height="0.2", shape="ellipse", **style_attrs) elif node.__class__.__name__ == "BoundInput": dot.node(export_node_name, label=label, shape="diamond", **style_attrs) else: dot.node(export_node_name, label=label, shape="square", **style_attrs) for inp in node.inputs: dot.edge(sanitize_graphviz_name(inp.name), export_node_name) # Render graph if shutil.which("dot") is None: print("Cannot render the graphviz file (dot not found).") print(f"Graph saved to {output_path}.dot") dot.save(output_path + ".dot") else: dot.render(output_path, cleanup=True) print(f"Graph saved to {output_path}.png") def sanitize_graphviz_name(name): """ Convert problematic characters (like `:`, `::`) in a Graphviz node name to a safe alternative character `_`. """ unsafe_chars = r'[:;,\[\]{}()<>|#*@&=+`~^?"\\\s]' safe_name = re.sub(unsafe_chars, "_", name) return safe_name ================================================ FILE: auto_LiRPA/utils.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. ## ## ## ######################################################################### import logging import time import torch import torch.nn as nn import os import sys import appdirs from collections import defaultdict, namedtuple from functools import reduce import operator import warnings from typing import Tuple from .patches import Patches logging.basicConfig( format='%(levelname)-8s %(asctime)-12s [%(filename)s:%(lineno)d] %(message)s', datefmt='%H:%M:%S', stream=sys.stdout, level=logging.INFO ) logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG if os.environ.get('AUTOLIRPA_DEBUG', 0) else logging.INFO) warnings.simplefilter("once") # Special identity matrix. Avoid extra computation of identity matrix multiplication in various places. eyeC = namedtuple('eyeC', 'shape device') OneHotC = namedtuple('OneHotC', 'shape device index coeffs') BatchedCrownC = namedtuple('BatchedCrownC', 'type') def onehotc_to_dense(one_hot_c: OneHotC, dtype: torch.dtype) -> torch.Tensor: shape = one_hot_c.shape # [spec, batch, C, H, W] dim = int(prod(shape[2:])) dense = torch.zeros( size=(shape[0], shape[1], dim), device=one_hot_c.device, dtype=dtype) # one_hot_c.index has size (spec, batch), its values are the index of the one-hot non-zero elements in A. # one_hot_c.coeffs is the value of the non-zero element. dense = torch.scatter( dense, dim=2, index=one_hot_c.index.unsqueeze(-1), src=one_hot_c.coeffs.unsqueeze(-1)) dense = dense.view(shape[0], shape[1], *shape[2:]) return dense # Benchmarking mode disable some expensive assertions. 
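# ----------------------------------------------------------------------------
# Aside (illustrative note for onehotc_to_dense above, with made-up numbers):
# an OneHotC with shape (spec=2, batch=1, C=1, H=1, W=3), index=[[2], [0]] and
# coeffs=[[1.], [-1.]] densifies to
#     dense[0, 0] = [[[ 0.,  0.,  1.]]]
#     dense[1, 0] = [[[-1.,  0.,  0.]]]
# i.e., each (spec, batch) row of the dense C matrix has exactly one non-zero
# entry, placed at `index` with value `coeffs`.
# ----------------------------------------------------------------------------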
Benchmarking = True reduction_sum = lambda x: x.sum(dim=tuple(range(1, x.dim())), keepdim=True) reduction_mean = lambda x: x.mean(dim=tuple(range(1, x.dim())), keepdim=True) reduction_max = lambda x: x.amax(dim=tuple(range(1, x.dim())), keepdim=True) reduction_min = lambda x: x.amin(dim=tuple(range(1, x.dim())), keepdim=True) MIN_HALF_FP = 5e-8 # 2**-24, which is the smallest value that float16 can be represented def reduction_str2func(reduction_func): if type(reduction_func) == str: if reduction_func == 'min': return reduction_min elif reduction_func == 'max': return reduction_max elif reduction_func == 'sum': return reduction_sum elif reduction_func == 'mean': return reduction_mean else: raise NotImplementedError(f'Unknown reduction_func {reduction_func}') else: return reduction_func def stop_criterion_placeholder(threshold=0): return lambda x: RuntimeError("BUG: bound optimization stop criterion not specified.") def stop_criterion_min(threshold=0): return lambda x: (x.min(1, keepdim=True).values > threshold) def stop_criterion_all(threshold=0): # The dimension of x should be (batch, spec). The spec dimension # This was used in the incomplete verifier, where the spec dimension can # present statements in an OR clause. return lambda x: (x > threshold).all(dim=1, keepdim=True) def stop_criterion_max(threshold=0): return lambda x: (x.max(1, keepdim=True).values > threshold) def stop_criterion_batch(threshold=0): # may unexpected broadcast, pay attention to the shape of threshold # x shape: batch, number_bounds; threshold shape: batch, number_bounds return lambda x: (x > threshold) def stop_criterion_batch_any(threshold=0): """If any spec >= rhs, then this sample can be stopped; if all samples can be stopped, stop = True, o.w., False. """ # may unexpected broadcast, pay attention to the shape of threshold # x shape: batch, number_bounds; threshold shape: batch, number_bounds return lambda x: (x > threshold).any(dim=1, keepdim=True) def stop_criterion_general(or_spec_size, threshold=0): """ If any spec in a group >= rhs, then this group can be stopped; if all groups can be stopped, stop = True, o.w., False. Args: or_clause_indices: [num_clause]. the indices of the belonging OR clauses for AND clauses. num_or: the number of OR clauses. threshold: [batch, num_clause]. The threshold for each spec. sum(or_clause_indices) == num_clauses. """ def stop_criterion_per_or(x): # get the indices of OR clauses assigned to their corresponding atom clauses, [num_clause] num_or = or_spec_size.shape[0] or_clause_indices = torch.repeat_interleave( torch.arange(num_or, device=or_spec_size.device), or_spec_size ).view(1, -1).expand(x.shape) # get the result for each spec. [batch, num_clause] result_per_spec = (x > threshold) # get the number of verified ANDs for each OR clause. [batch, num_or] num_verified_and_per_or = torch.scatter_reduce(result_per_spec[:, :num_or], 1, or_clause_indices, result_per_spec, 'sum', include_self=False) # result of any spec in a OR (group of ANDs) is True (sum >= 1) -> result of the OR is True. return num_verified_and_per_or >= 1 # if all OR clauses are True, then return True. 
[batch, 1] return lambda x: stop_criterion_per_or(x).all(dim=1, keepdim=True) def stop_criterion_batch_topk(threshold=0, k=1314): # x shape: batch, number_bounds; threshold shape: batch, number_bounds return lambda x: (torch.kthvalue(x, k, dim=-1, keepdim=True).values > threshold).any(dim=1) def multi_spec_keep_func_all(x): return torch.all(x, dim=-1) user_data_dir = appdirs.user_data_dir('auto_LiRPA') if not os.path.exists(user_data_dir): try: os.makedirs(user_data_dir) except: logger.error('Failed to create directory {}'.format(user_data_dir)) class MultiAverageMeter(object): """Computes and stores the average and current value for multiple metrics""" def __init__(self): self.reset() def reset(self): self.sum_meter = defaultdict(float) self.lasts = defaultdict(float) self.counts_meter = defaultdict(int) self.batch_size = 1 def set_batch_size(self, batch_size): self.batch_size = batch_size def update(self, key, val, n=None): if val is None: return if n is None: n = self.batch_size if isinstance(val, torch.Tensor): val = val.item() self.lasts[key] = val self.sum_meter[key] += val * n self.counts_meter[key] += n def last(self, key): return self.lasts[key] def avg(self, key): if self.counts_meter[key] == 0: return 0.0 else: return self.sum_meter[key] / self.counts_meter[key] def __repr__(self): s = "" for k in self.sum_meter: s += "{}={:.4f} ".format(k, self.avg(k)) return s.strip() class MultiTimer(object): """Count the time for each part of training.""" def __init__(self): self.reset() def reset(self): self.timer_starts = defaultdict(float) self.timer_total = defaultdict(float) def start(self, key): if self.timer_starts[key] != 0: raise RuntimeError("start() is called more than once") self.timer_starts[key] = time.time() def stop(self, key): if key not in self.timer_starts: raise RuntimeError("Key does not exist; please call start() before stop()") self.timer_total[key] += time.time() - self.timer_starts[key] self.timer_starts[key] = 0 def total(self, key): return self.timer_total[key] def __repr__(self): s = "" for k in self.timer_total: s += "{}_time={:.3f} ".format(k, self.timer_total[k]) return s.strip() class Flatten(nn.Flatten): """Legacy Flatten class. It was previously created when nn.Flatten was not supported. 
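The `stop_criterion_*` helpers above all share one contract: given a tensor of bound margins with shape `(batch, num_specs)` and a threshold, they return a boolean tensor marking which samples can already stop being bounded. A small sketch with made-up margins (values are illustrative only):

```python
import torch
from auto_LiRPA.utils import stop_criterion_all, stop_criterion_batch_any

# Made-up lower-bound margins for 2 samples with 3 specifications each.
lb = torch.tensor([[ 0.2, 0.5, 0.1],
                   [-0.3, 0.4, 0.6]])

# "all" semantics: a sample is done only if every spec margin exceeds the threshold.
print(stop_criterion_all(threshold=0)(lb))        # tensor([[ True], [False]])

# "any" semantics (OR clauses): a single positive spec margin is enough.
print(stop_criterion_batch_any(threshold=0)(lb))  # tensor([[True], [True]])
```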
Simply use nn.Flatten in the future.""" pass class Unflatten(nn.Module): def __init__(self, wh): super().__init__() self.wh = wh # width and height of the feature maps def forward(self, x): return x.view(x.size(0), -1, self.wh, self.wh) class Max(nn.Module): def __init__(self): super(Max, self).__init__() def forward(self, x, y): return torch.max(x, y) class Min(nn.Module): def __init__(self): super(Min, self).__init__() def forward(self, x, y): return torch.min(x, y) def scale_gradients(optimizer, gradient_accumulation_steps, grad_clip=None): parameters = [] for param_group in optimizer.param_groups: for param in param_group['params']: parameters.append(param) if param.grad is not None: param.grad.data /= gradient_accumulation_steps if grad_clip is not None: return torch.nn.utils.clip_grad_norm_(parameters, grad_clip) # unpack tuple, dict, list into one single list # TODO: not sure if the order matches graph.inputs() def unpack_inputs(inputs, device=None): if isinstance(inputs, dict): inputs = list(inputs.values()) if isinstance(inputs, tuple) or isinstance(inputs, list): res = [] for item in inputs: res += unpack_inputs(item, device=device) return res else: if device is not None: inputs = inputs.to(device) return [inputs] def isnan(x): if isinstance(x, Patches): return False return torch.isnan(x).any() def prod(x): return reduce(operator.mul, x, 1) def batched_index_select(input, dim, index): # Assuming the input has a batch dimension. # index has dimensin [spec, batch]. if input.ndim == 4: # Alphas for fully connected layers, shape [2, spec, batch, neurons] index = index.unsqueeze(-1).unsqueeze(0).expand(input.size(0), -1, -1, input.size(3)) elif input.ndim == 6: # Alphas for fully connected layers, shape [2, spec, batch, c, h, w]. index = index.view(1, index.size(0), index.size(1), *([1] * (input.ndim - 3))).expand(input.size(0), -1, -1, *input.shape[3:]) elif input.ndim == 3: # Weights. input = input.expand(index.size(0), -1, -1) index = index.unsqueeze(-1).expand(-1, -1, input.size(2)) elif input.ndim == 2: # Bias. input = input.expand(index.size(0), -1) else: raise ValueError return torch.gather(input, dim, index) def get_spec_matrix(X, y, num_classes): with torch.no_grad(): c = (torch.eye(num_classes).type_as(X)[y].unsqueeze(1) - torch.eye(num_classes).type_as(X).unsqueeze(0)) I = (~(y.unsqueeze(1) == torch.arange(num_classes).type_as(y).unsqueeze(0))) c = (c[I].view(X.size(0), num_classes - 1, num_classes)) return c def unravel_index( indices: torch.LongTensor, shape: Tuple[int, ...], ) -> torch.LongTensor: r"""Converts flat indices into unraveled coordinates in a target shape. Args: indices: A tensor of (flat) indices, (*, N). shape: The targeted shape, (D,). Returns: The unraveled coordinates, a list with tensors in shape (N, D). Code borrowed from: https://github.com/pytorch/pytorch/issues/35674 """ coord = [] for dim in reversed(shape): coord.append(indices % dim) indices = torch.div(indices, dim, rounding_mode='trunc') return list(reversed(coord)) class AutoBatchSize: def __init__(self, init_batch_size, device, vram_ratio=0.9, enable=True): self.batch_size = init_batch_size self.max_actual_batch_size = 0 self.device = device self.vram_ratio = vram_ratio self.enable = enable def record_actual_batch_size(self, actual_batch_size): """Record the actual batch size used. It may be smaller than self.batch_size, especially for the early batches. 
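`get_spec_matrix` above builds the specification matrix `C` used for margin-based robustness verification: for each sample it stacks the rows `e_y - e_j` over all classes `j != y`, so multiplying `C` with the logits yields the margins of the true class over every other class. A short sketch with assumed labels:

```python
import torch
from auto_LiRPA.utils import get_spec_matrix

# Illustrative batch of 2 inputs with ground-truth labels 0 and 2 (3 classes).
X = torch.zeros(2, 4)   # only the dtype and batch size of X matter here
y = torch.tensor([0, 2])
C = get_spec_matrix(X, y, num_classes=3)
print(C.shape)  # torch.Size([2, 2, 3])
print(C[0])     # rows e_0 - e_1 and e_0 - e_2:
                # tensor([[ 1., -1.,  0.],
                #         [ 1.,  0., -1.]])
```

If every entry of the lower bound of `C @ logits` is positive over the perturbation set, the predicted class cannot be flipped.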
""" self.max_actual_batch_size = max(self.max_actual_batch_size, actual_batch_size) def update(self): """Check if the batch size can be enlarged.""" if not self.enable: return None # Only try to update the batch size if the current batch size has # been actually used, as indicated by `max_actual_batch_size` if self.device == 'cpu' or self.max_actual_batch_size < self.batch_size: return None total_vram = torch.cuda.get_device_properties(self.device).total_memory current_vram = torch.cuda.memory_reserved(self.device) if current_vram * 2 >= total_vram * self.vram_ratio: return None new_batch_size = self.batch_size * 2 self.batch_size = new_batch_size logger.debug('Automatically updated batch size to %d', new_batch_size) return { 'current_vram': current_vram, 'total_vram': total_vram, } def sync_params(model_ori: torch.nn.Module, model: 'BoundedModule', loss_fusion: bool = False): """Sync the parameters from a BoundedModule to the original model.""" state_dict_loss = model.state_dict() state_dict = model_ori.state_dict() for name in state_dict_loss: v = state_dict_loss[name] if name.endswith('.param'): name = name[:-6] elif name.endswith('.buffer'): name = name[:-7] else: raise NameError(name) name_ori = model[name].ori_name if loss_fusion: assert name_ori.startswith('model.') name_ori = name_ori[6:] assert name_ori in state_dict state_dict[name_ori] = v model_ori.load_state_dict(state_dict) return state_dict def reduce_broadcast_dims(A, target_shape, left_extra_dims=1): """ When backward propagating tensors that are automatically broadcasted, we need to reduce the broadcasted dimensions to match the input shape. This can be useful for backward bound propagation and backward gradient computation. Args: A: The input tensor. target_shape: The target shape to reduce to. left_extra_dims: The number of dimensions that A should have but the target shape doesn't have. These dimensions are usually added to the left of the target shape and don't need to be reduced (e.g. spec). Example: x1 has shape [a1, a2, a3, a4], x2 has shape [a2, 1, a4], y = x1 * x2. Two types of broadcasting here: 1. Adding additional dimensions to x2 to match the dimension of x1. 2. Broadcasting along existing dimensions length 1. In backward computation from y to x2, we need to reduce (sum) the A matrix to match the shape of x2. The first dimension of A is usually for spec, so the shape usually aligns from the second dimension. """ # Step 1: Dimension doesn't exist in target shape but exists in A. # cnt_sum is the number of dimensions that are broadcast. # (The additional dimensions in A that are not in target shape). cnt_sum = (A.ndim - left_extra_dims) - len(target_shape) # The broadcast dimensions must be the first dimensions in A # (except the extra dimensions and batch dimension). dims = list(range(left_extra_dims + 1, cnt_sum + left_extra_dims + 1)) if dims: A = torch.sum(A, dim=dims, keepdim=False) # Step 2: Dimension exists in target shape, broadcast from 1. # FIXME (05/11/2022): the following condition is not always correct. # We should not rely on checking dimension is "1" or not. dims = [i + left_extra_dims for i in range(left_extra_dims, len(target_shape)) if target_shape[i] == 1 and A.shape[i + left_extra_dims] != 1] if dims: A = torch.sum(A, dim=dims, keepdim=True) # Check the final shape - it should be compatible. assert A.shape[2:] == target_shape[1:] # skip the spec and batch dimension. 
return A @torch.jit.script def matmul_maybe_batched(a: torch.Tensor, b: torch.Tensor, both_batched: bool): # Basically just matmul, but we need to handle the batch dimension. if both_batched: return torch.einsum("b...ij,b...jk->b...ik", a, b) else: return a.matmul(b) def transfer(tensor, device=None, dtype=None, non_blocking=False): """Transfer a tensor to a specific device or dtype.""" if device: tensor = tensor.to(device, non_blocking=non_blocking) if dtype: tensor = tensor.to(dtype) return tensor def clone_sub_A_dict(A_dict, out_in_keys: Tuple): """ Deep copy the A_dict structure for specific out_in_keys. Args: A_dict: The A_dict to be copied. out_in_keys: The (out_key, in_key) pairs to be copied. Returns: A new A_dict with all tensors cloned. """ # Structure: A_dict[out_key][in_key][key] # key in [lA, uA, lbias, ubias, unstable_idx] # lA, uA are tensors or Patches # (there're also types like eyeC, OneHotC, not supported here) # lbias, ubias are tensors # unstable_idx is tensor or tuple of tensors out_key, in_key = out_in_keys src_subdict = A_dict[out_key][in_key] cloned_subdict = {} for key, val in src_subdict.items(): if val is None: cloned_subdict[key] = None continue if isinstance(val, (torch.Tensor, Patches)): cloned_subdict[key] = val.detach().clone() elif isinstance(val, tuple): cloned_subdict[key] = tuple(v.detach().clone() for v in val) else: raise NotImplementedError(f'Unsupported A type {type(val)} for copying.') return cloned_subdict def clone_full_A_dict(A_dict): """ Deep copy the A_dict structure. Args: A_dict: The A_dict to be copied. Returns: A new A_dict with all tensors cloned. """ new_A_dict = {} for out_key, in_dict in A_dict.items(): new_A_dict[out_key] = {} for in_key in in_dict: new_A_dict[out_key][in_key] = clone_sub_A_dict(A_dict, (out_key, in_key)) return new_A_dict ================================================ FILE: auto_LiRPA/wrapper.py ================================================ ######################################################################### ## This file is part of the auto_LiRPA library, a core part of the ## ## α,β-CROWN (alpha-beta-CROWN) neural network verifier developed ## ## by the α,β-CROWN Team ## ## ## ## Copyright (C) 2020-2025 The α,β-CROWN Team ## ## Team leaders: ## ## Faculty: Huan Zhang (UIUC) ## ## Student: Xiangru Zhong (UIUC) ## ## ## ## See CONTRIBUTORS for all current and past developers in the team. ## ## ## ## This program is licensed under the BSD 3-Clause License, ## ## contained in the LICENCE file in this directory. 
## ## ## ######################################################################### import torch import torch.nn as nn class CrossEntropyWrapper(nn.Module): def __init__(self, model): super(CrossEntropyWrapper, self).__init__() self.model = model def forward(self, x, labels): y = self.model(x) logits = y - torch.gather(y, dim=-1, index=labels.unsqueeze(-1)) return torch.exp(logits).sum(dim=-1, keepdim=True) class CrossEntropyWrapperMultiInput(nn.Module): def __init__(self, model): super(CrossEntropyWrapperMultiInput, self).__init__() self.model = model def forward(self, labels, *x): y = self.model(*x) logits = y - torch.gather(y, dim=-1, index=labels.unsqueeze(-1)) return torch.exp(logits).sum(dim=-1, keepdim=True) ================================================ FILE: doc/.gitignore ================================================ _build sections *.md !src/*.md !README.md ================================================ FILE: doc/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: doc/README.md ================================================ # Documentation This directory contains source files for building our documentation. Please view the compiled documentation on our [documentation page](https://auto-lirpa.readthedocs.io/en/latest/?badge=latest), as some links may not work here on GitHub. ## Dependencies Install additional libraries for building documentations: ```bash pip install -r requirements.txt ``` ## Build Build documentations in HTML: ``` make html ``` The documentation will be generated at `_build/html`. ================================================ FILE: doc/api.rst ================================================ API Usage ====================================== .. autoclass:: auto_LiRPA.BoundedModule .. autofunction:: auto_LiRPA.BoundedModule.forward .. autofunction:: auto_LiRPA.BoundedModule.compute_bounds .. autofunction:: auto_LiRPA.BoundedModule.save_intermediate .. autoclass:: auto_LiRPA.bound_ops.Bound .. autofunction:: auto_LiRPA.bound_ops.Bound.forward .. autofunction:: auto_LiRPA.bound_ops.Bound.interval_propagate .. autofunction:: auto_LiRPA.bound_ops.Bound.bound_forward .. autofunction:: auto_LiRPA.bound_ops.Bound.bound_backward .. autoclass:: auto_LiRPA.perturbations.Perturbation .. autofunction:: auto_LiRPA.perturbations.Perturbation.concretize .. autofunction:: auto_LiRPA.perturbations.Perturbation.init Indices and tables ------------------- * :ref:`genindex` * :ref:`search` .. * :ref:`modindex` ================================================ FILE: doc/conf.py ================================================ # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. 
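The `CrossEntropyWrapper` modules above output `sum_j exp(y_j - y_label)`, whose logarithm is exactly the per-sample cross-entropy loss; this identity is what lets loss-fusion training use `log(ub)` of the wrapped model's upper bound as a robust loss. A quick numerical check with a made-up linear classifier (the model, shapes, and labels below are illustrative):

```python
import torch
import torch.nn.functional as F
from auto_LiRPA.wrapper import CrossEntropyWrapper

# Made-up classifier and batch, only to verify the identity
# log(CrossEntropyWrapper(model)(x, labels)) == per-sample cross-entropy.
model = torch.nn.Linear(4, 3)
x = torch.randn(2, 4)
labels = torch.tensor([0, 2])

wrapped = CrossEntropyWrapper(model)
loss_wrapped = torch.log(wrapped(x, labels)).squeeze(-1)
loss_ce = F.cross_entropy(model(x), labels, reduction='none')
print(torch.allclose(loss_wrapped, loss_ce, atol=1e-6))  # True
```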
For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # import os import subprocess import inspect import sys from pygit2 import Repository sys.path.insert(0, '..') import auto_LiRPA subprocess.run(['python', 'process.py']) # -- Project information ----------------------------------------------------- project = 'auto_LiRPA' author = 'auto-LiRPA authors' copyright = f'2020-2025, {author}' # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.linkcode', 'm2r2', ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ['_build', 'src', 'Thumbs.db', '.DS_Store'] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'alabaster' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] repo = Repository('../') branch = repo.head.shorthand # Resolve function for the linkcode extension. def linkcode_resolve(domain, info): def find_source(): obj = auto_LiRPA parts = info['fullname'].split('.') if info['module'].endswith(f'.{parts[0]}'): module = info['module'][:-len(parts[0])-1] else: module = info['module'] obj = sys.modules[module] for part in parts: obj = getattr(obj, part) fn = inspect.getsourcefile(obj) source, lineno = inspect.getsourcelines(obj) return fn, lineno, lineno + len(source) - 1 fn, lineno_start, lineno_end = find_source() filename = f'{fn}#L{lineno_start}-L{lineno_end}' return f"https://github.com/Verified-Intelligence/auto_LiRPA/blob/{branch}/doc/{filename}" ================================================ FILE: doc/index.rst ================================================ .. auto_LiRPA documentation master file, created by sphinx-quickstart on Wed Jul 14 21:56:10 2021. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. Documentation for `auto_LiRPA `_ =========================================================================== .. toctree:: :hidden: installation quick-start examples api custom_op paper .. raw:: html

.. mdinclude:: sections/introduction.md Usage ----- * :doc:`Installation ` * :doc:`Quick Start ` * :doc:`More Working Examples ` * :doc:`API Usage ` * :doc:`Custom Operators ` * :doc:`Reproducing our NeurIPS 2020 paper ` ================================================ FILE: doc/process.py ================================================ """ Process source files before running Sphinx""" import re import os import shutil from pygit2 import Repository repo = 'https://github.com/Verified-Intelligence/auto_LiRPA' branch = Repository('.').head.shorthand repo_file_path = os.path.join(repo, 'tree', branch) # Parse README.md into sections which can be reused heading = '' copied = {} print('Parsing markdown sections from README:') with open('../README.md') as file: for line in file.readlines(): if line.startswith('##'): heading = line[2:].strip() else: if not heading in copied: copied[heading] = '' copied[heading] += line if not os.path.exists('sections'): os.makedirs('sections') for key in copied: if key == '': continue filename = re.sub(r"[?+\'\"]", '', key.lower()) filename = re.sub(r" ", '-', filename) + '.md' print(filename) with open(os.path.join('sections', filename), 'w') as file: file.write(f'## {key}\n') file.write(copied[key]) print() # Load source files and fix links to GitHub for folder in ['src', 'sections']: for filename in os.listdir(folder): print(f'Processing {folder}/{filename}') with open(os.path.join(folder, filename)) as file: source = file.read() source_new = '' ptr = 0 for m in re.finditer('(\[.*\])(\(.*\))', source): assert m.start() >= ptr source_new += source[ptr:m.start()] ptr = m.start() source_new += m.group(1) ptr += len(m.group(1)) link_raw = m.group(2) while len(link_raw) >= 2 and link_raw[-2] == ')': link_raw = link_raw[:-1] link = link_raw[1:-1] if link.startswith('https://') or link.startswith('http://') or '.html#' in link: link_new = link else: if folder == 'sections': link_new = os.path.join(repo_file_path, link) else: link_new = os.path.join(repo_file_path, 'docs/src', link) print(f'Fix link {link} -> {link_new}') source_new += f'({link_new})' ptr += len(link_raw) source_new += source[ptr:] with open(filename, 'w') as file: file.write(source_new) print() ================================================ FILE: examples/.gitignore ================================================ auto_LiRPA ================================================ FILE: examples/__init__.py ================================================ ================================================ FILE: examples/language/.gitignore ================================================ model* !modeling* log* res_test.pkl ckpt_* data_language.tar.gz data/ ================================================ FILE: examples/language/Transformer/Transformer.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights rved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
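The link-fixing pass in `doc/process.py` above rewrites relative Markdown links into absolute GitHub URLs while leaving `http(s)` links untouched. A simplified, standalone sketch of the same idea (the repository URL and branch below are example values, and the regex is a tightened variant of the one in the script, which derives the branch from the git checkout):

```python
import os
import re

# Example destination; doc/process.py builds this from the repo URL and current branch.
repo_file_path = 'https://github.com/Verified-Intelligence/auto_LiRPA/tree/master'

def fix_links(source):
    def repl(m):
        link = m.group(2)[1:-1]
        if link.startswith(('http://', 'https://')) or '.html#' in link:
            return m.group(0)  # leave absolute links and anchors unchanged
        return f'{m.group(1)}({os.path.join(repo_file_path, link)})'
    return re.sub(r'(\[[^\]]*\])(\([^)]*\))', repl, source)

print(fix_links('See [a toy example](examples/simple/toy.py).'))
# See [a toy example](https://github.com/Verified-Intelligence/auto_LiRPA/tree/master/examples/simple/toy.py).
```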
from __future__ import absolute_import, division, print_function import os import torch import torch.nn as nn from Transformer.modeling import BertForSequenceClassification from pytorch_pretrained_bert.modeling import BertConfig from Transformer.utils import convert_examples_to_features from language_utils import build_vocab from auto_LiRPA.utils import logger class Transformer(nn.Module): def __init__(self, args, data_train): super().__init__() self.args = args self.max_seq_length = args.max_sent_length self.drop_unk = args.drop_unk self.num_labels = args.num_classes self.label_list = range(args.num_classes) self.device = args.device self.lr = args.lr self.dir = args.dir self.vocab = build_vocab(data_train, args.min_word_freq) if not os.path.exists(self.dir): os.makedirs(self.dir) self.checkpoint = 0 config = BertConfig(len(self.vocab)) config.num_hidden_layers = args.num_layers config.embedding_size = args.embedding_size config.hidden_size = args.hidden_size config.intermediate_size = args.intermediate_size config.hidden_act = args.hidden_act config.num_attention_heads = args.num_attention_heads config.layer_norm = args.layer_norm config.hidden_dropout_prob = args.dropout self.model = BertForSequenceClassification( config, self.num_labels, vocab=self.vocab).to(self.device) logger.info("Model initialized") if args.load: checkpoint = torch.load(args.load, map_location=torch.device(self.device)) epoch = checkpoint['epoch'] self.model.embeddings.load_state_dict(checkpoint['state_dict_embeddings']) self.model.model_from_embeddings.load_state_dict(checkpoint['state_dict_model_from_embeddings']) logger.info('Checkpoint loaded: {}'.format(args.load)) self.model_from_embeddings = self.model.model_from_embeddings self.word_embeddings = self.model.embeddings.word_embeddings self.model_from_embeddings.device = self.device def save(self, epoch): self.model.model_from_embeddings = self.model_from_embeddings path = os.path.join(self.dir, "ckpt_{}".format(epoch)) torch.save({ 'state_dict_embeddings': self.model.embeddings.state_dict(), 'state_dict_model_from_embeddings': self.model.model_from_embeddings.state_dict(), 'epoch': epoch }, path) logger.info("Model saved to {}".format(path)) def build_optimizer(self): # update the original model with the converted model self.model.model_from_embeddings = self.model_from_embeddings param_group = [ {"params": [p[1] for p in self.model.named_parameters()], "weight_decay": 0.}, ] return torch.optim.Adam(param_group, lr=self.lr) def train(self): self.model.train() self.model_from_embeddings.train() def eval(self): self.model.eval() self.model_from_embeddings.eval() def get_input(self, batch): features = convert_examples_to_features( batch, self.label_list, self.max_seq_length, self.vocab, drop_unk=self.drop_unk) input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).to(self.device) input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long).to(self.device) segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long).to(self.device) label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long).to(self.device) tokens = [f.tokens for f in features] embeddings, extended_attention_mask = \ self.model(input_ids, segment_ids, input_mask, embed_only=True) return embeddings, extended_attention_mask, tokens, label_ids def forward(self, batch): embeddings, extended_attention_mask, tokens, label_ids = self.get_input(batch) logits = self.model_from_embeddings(embeddings, extended_attention_mask) preds = 
torch.argmax(logits, dim=1) return preds ================================================ FILE: examples/language/Transformer/__init__.py ================================================ ================================================ FILE: examples/language/Transformer/modeling.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch BERT model.""" from __future__ import absolute_import, division, print_function, unicode_literals import torch from torch import nn from pytorch_pretrained_bert.modeling import BertIntermediate, BertSelfAttention, BertPreTrainedModel class BertLayerNorm(nn.Module): def __init__(self, hidden_size, eps=1e-12): super(BertLayerNorm, self).__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) self.bias = nn.Parameter(torch.zeros(hidden_size)) self.variance_epsilon = eps def forward(self, x): u = x.mean(-1, keepdim=True) s = (x - u).pow(2).mean(-1, keepdim=True) x = (x - u) / torch.sqrt(s + self.variance_epsilon) return self.weight * x + self.bias class BertLayerNormNoVar(nn.Module): def __init__(self, hidden_size, eps=1e-12): super(BertLayerNormNoVar, self).__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) self.bias = nn.Parameter(torch.zeros(hidden_size)) self.variance_epsilon = eps def forward(self, x): u = x.mean(-1, keepdim=True) x = x - u return self.weight * x + self.bias class BertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings. 
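`BertLayerNormNoVar` above implements the `no_var` layer-norm option used in these examples: it only centers the activations (and applies the affine parameters) without dividing by the per-feature standard deviation, unlike the standard `BertLayerNorm`. A tiny sketch, assuming it is run from `examples/language/` with the example dependencies (e.g. `pytorch_pretrained_bert`) installed so the module can be imported:

```python
import torch
from Transformer.modeling import BertLayerNormNoVar

ln = BertLayerNormNoVar(hidden_size=4)   # default weight = 1, bias = 0
x = torch.tensor([[1.0, 2.0, 3.0, 6.0]])
with torch.no_grad():
    out = ln(x)
print(out)           # tensor([[-2., -1.,  0.,  3.]])  (the mean 3.0 is subtracted)
print(out.mean(-1))  # tensor([0.])
```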
""" def __init__(self, config, glove=None, vocab=None): super(BertEmbeddings, self).__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=0) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) self.config = config def forward(self, input_ids, token_type_ids=None): seq_length = input_ids.size(1) position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) if token_type_ids is None: token_type_ids = torch.zeros_like(input_ids) words_embeddings = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) # position/token_type embedding disabled # embeddings = words_embeddings + position_embeddings + token_type_embeddings embeddings = words_embeddings return embeddings class BertSelfOutput(nn.Module): def __init__(self, config): super(BertSelfOutput, self).__init__() self.config = config self.dense = nn.Linear(config.hidden_size, config.hidden_size) if hasattr(config, "layer_norm") and config.layer_norm == "no_var": self.LayerNorm = BertLayerNormNoVar(config.hidden_size, eps=1e-12) else: self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) if hidden_states.shape[-1] == input_tensor.shape[-1]: hidden_states = hidden_states + input_tensor if hasattr(self.config, "layer_norm") and self.config.layer_norm == "no": pass else: hidden_states = self.LayerNorm(hidden_states) return hidden_states class BertAttention(nn.Module): def __init__(self, config, input_size): super(BertAttention, self).__init__() self.self = BertSelfAttention(config) self.output = BertSelfOutput(config) def forward(self, input_tensor, attention_mask): self_output = self.self(input_tensor, attention_mask) attention_output = self.output(self_output, input_tensor) return attention_output class BertOutput(nn.Module): def __init__(self, config): super(BertOutput, self).__init__() self.config = config self.dense = nn.Linear(config.intermediate_size, config.hidden_size) if hasattr(config, "layer_norm") and config.layer_norm == "no_var": self.LayerNorm = BertLayerNormNoVar(config.hidden_size, eps=1e-12) else: self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = hidden_states + input_tensor if hasattr(self.config, "layer_norm") and self.config.layer_norm == "no": pass else: hidden_states = self.LayerNorm(hidden_states) return hidden_states class BertLayer(nn.Module): def __init__(self, config, layer_id): super(BertLayer, self).__init__() self.input_size = config.hidden_size self.attention = BertAttention(config, self.input_size) self.intermediate = BertIntermediate(config) self.output = BertOutput(config) def forward(self, hidden_states, attention_mask): attention_output = self.attention(hidden_states, attention_mask) intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) return layer_output class 
BertEncoder(nn.Module): def __init__(self, config): super(BertEncoder, self).__init__() self.layer = nn.ModuleList([BertLayer(config, i) for i in range(config.num_hidden_layers)]) def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): all_encoder_layers = [] for layer_module in self.layer: hidden_states = layer_module(hidden_states, attention_mask) if output_all_encoded_layers: all_encoder_layers.append(hidden_states) if not output_all_encoded_layers: all_encoder_layers.append(hidden_states) return all_encoder_layers class BertPooler(nn.Module): def __init__(self, config): super(BertPooler, self).__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.activation = nn.Tanh() def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] pooled_output = self.dense(first_token_tensor) pooled_output = self.activation(pooled_output) return pooled_output class BertModelFromEmbeddings(BertPreTrainedModel): def __init__(self, config): super(BertModelFromEmbeddings, self).__init__(config) self.encoder = BertEncoder(config) self.pooler = BertPooler(config) self.apply(self.init_bert_weights) def forward(self, embeddings, extended_attention_mask): encoded_layers = self.encoder(embeddings, extended_attention_mask) sequence_output = encoded_layers[-1] pooled_output = self.pooler(sequence_output) return pooled_output class BertForSequenceClassificationFromEmbeddings(BertPreTrainedModel): def __init__(self, config, num_labels=2): super(BertForSequenceClassificationFromEmbeddings, self).__init__(config) self.num_labels = num_labels self.bert = BertModelFromEmbeddings(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, num_labels) self.linear_in = nn.Linear(config.embedding_size, config.hidden_size) self.layer_norm = config.layer_norm if hasattr(config, "layer_norm") and config.layer_norm == "no_var": self.LayerNorm = BertLayerNormNoVar(config.embedding_size, eps=1e-12) else: self.LayerNorm = BertLayerNorm(config.embedding_size, eps=1e-12) self.apply(self.init_bert_weights) def forward(self, embeddings, extended_attention_mask): embeddings = self.linear_in(embeddings) if self.layer_norm == "no": pass else: embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) pooled_output = self.bert(embeddings, extended_attention_mask) pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) return logits class BertForSequenceClassification(BertPreTrainedModel): def __init__(self, config, num_labels=2, glove=None, vocab=None): super(BertForSequenceClassification, self).__init__(config) self.model_from_embeddings = BertForSequenceClassificationFromEmbeddings( config, num_labels ) self.num_labels = num_labels self.embeddings = BertEmbeddings(config, glove=glove, vocab=vocab) self.apply(self.init_bert_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, embed_only=False): if attention_mask is None: attention_mask = torch.ones_like(input_ids) if token_type_ids is None: token_type_ids = torch.zeros_like(input_ids) extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 embeddings = self.embeddings(input_ids, token_type_ids) if embed_only: return 
embeddings, extended_attention_mask logits = self.model_from_embeddings(embeddings, extended_attention_mask) return logits ================================================ FILE: examples/language/Transformer/utils.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights rved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from language_utils import tokenize, token_to_id class InputExample(object): def __init__(self, guid, text_a, text_b=None, label=None): self.guid = guid self.text_a = text_a self.text_b = text_b self.label = label class InputFeatures(object): def __init__(self, input_ids, input_mask, segment_ids, label_id, tokens): self.input_ids = input_ids self.input_mask = input_mask self.segment_ids = segment_ids self.label_id = label_id self.tokens = tokens def convert_examples_to_features(examples, label_list, max_seq_length, vocab, drop_unk=False): #tokenizer): """Loads a data file into a list of `InputBatch`s.""" features = [] all_tokens = tokenize(examples, vocab, max_seq_length - 2, drop_unk=drop_unk) for i in range(len(all_tokens)): all_tokens[i] = ["[CLS]"] + all_tokens[i] + ["[SEP]"] all_ids = token_to_id(all_tokens, vocab) max_seq_length = min(max_seq_length, max([len(tokens) for tokens in all_tokens])) for (ex_index, example) in enumerate(examples): tokens = all_tokens[ex_index] segment_ids = [0] * len(tokens) input_ids = all_ids[ex_index] input_mask = [1] * len(input_ids) padding = [0] * (max_seq_length - len(input_ids)) input_ids += padding input_mask += padding segment_ids += padding assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length features.append(InputFeatures( input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=example["label"], tokens=tokens)) return features ================================================ FILE: examples/language/data_utils.py ================================================ import random import json from auto_LiRPA.utils import logger def load_data_sst(): data = [] for split in ['train_all_nodes', 'train', 'dev', 'test']: with open('data/sst/{}.json'.format(split)) as file: data.append(json.loads(file.read())) return data def load_data(dataset): if dataset == "sst": return load_data_sst() else: raise NotImplementedError('Unknown dataset {}'.format(dataset)) def clean_data(data): return [example for example in data if example['candidates'] is not None] def get_batches(data, batch_size): batches = [] random.shuffle(data) for i in range((len(data) + batch_size - 1) // batch_size): batches.append(data[i * batch_size : (i + 1) * batch_size]) return batches ================================================ FILE: examples/language/language_utils.py ================================================ from auto_LiRPA.utils import logger import numpy as np def build_vocab(data_train, min_word_freq, dump=False, include=[]): vocab = { 
'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4 } cnt = {} for example in data_train: for token in example['sentence'].strip().lower().split(): if token in cnt: cnt[token] += 1 else: cnt[token] = 1 for w in cnt: if cnt[w] >= min_word_freq or w in include: vocab[w] = len(vocab) logger.info('Vocabulary size: {}'.format(len(vocab))) if dump: with open('tmp/vocab.txt', 'w') as file: for w in vocab.keys(): file.write('{}\n'.format(w)) return vocab def tokenize(batch, vocab, max_seq_length, drop_unk=False): res = [] for example in batch: t = example['sentence'].strip().lower().split(' ') if drop_unk: tokens = [w for w in t if w in vocab][:max_seq_length] else: tokens = [] for token in t[:max_seq_length]: if token in vocab: tokens.append(token) else: tokens.append('[UNK]') res.append(tokens) return res def token_to_id(tokens, vocab): ids = [] for t in tokens: ids.append([vocab[w] for w in t]) return ids ================================================ FILE: examples/language/lstm.py ================================================ import os import shutil import torch import torch.nn as nn import torch.nn.functional as F from auto_LiRPA.utils import logger from language_utils import build_vocab class LSTMFromEmbeddings(nn.Module): def __init__(self, args, vocab_size): super(LSTMFromEmbeddings, self).__init__() self.embedding_size = args.embedding_size self.hidden_size = args.hidden_size self.num_classes = args.num_classes self.device = args.device self.cell_f = nn.LSTMCell(self.embedding_size, self.hidden_size) self.cell_b = nn.LSTMCell(self.embedding_size, self.hidden_size) self.linear = nn.Linear(self.hidden_size * 2, self.num_classes) if args.dropout is not None: self.dropout = nn.Dropout(p=args.dropout) logger.info('LSTM dropout: {}'.format(args.dropout)) else: self.dropout = None def forward(self, embeddings, mask): if self.dropout is not None: embeddings = self.dropout(embeddings) embeddings = embeddings * mask.unsqueeze(-1) batch_size = embeddings.shape[0] length = embeddings.shape[1] h_f = torch.zeros(batch_size, self.hidden_size).to(embeddings.device) c_f = h_f.clone() h_b, c_b = h_f.clone(), c_f.clone() h_f_sum, h_b_sum = h_f.clone(), h_b.clone() for i in range(length): h_f, c_f = self.cell_f(embeddings[:, i], (h_f, c_f)) h_b, c_b = self.cell_b(embeddings[:, length - i - 1], (h_b, c_b)) h_f_sum = h_f_sum + h_f h_b_sum = h_b_sum + h_b states = torch.cat([h_f_sum / float(length), h_b_sum / float(length)], dim=-1) logits = self.linear(states) return logits class LSTM(nn.Module): def __init__(self, args, data_train): super(LSTM, self).__init__() self.args = args self.embedding_size = args.embedding_size self.max_seq_length = args.max_sent_length self.min_word_freq = args.min_word_freq self.device = args.device self.lr = args.lr self.dir = args.dir if not os.path.exists(self.dir): os.makedirs(self.dir) self.vocab = self.vocab_actual = build_vocab(data_train, args.min_word_freq) self.checkpoint = 0 if args.load: ckpt = torch.load(args.load, map_location=torch.device(self.device)) self.embedding = torch.nn.Embedding(len(self.vocab), self.embedding_size) self.model_from_embeddings = LSTMFromEmbeddings(args, len(self.vocab)) self.model = self.embedding, LSTMFromEmbeddings(args, len(self.vocab)) self.embedding.load_state_dict(ckpt['state_dict_embedding']) self.model_from_embeddings.load_state_dict(ckpt['state_dict_model_from_embeddings']) self.checkpoint = ckpt['epoch'] else: self.embedding = torch.nn.Embedding(len(self.vocab), self.embedding_size) self.model_from_embeddings = 
LSTMFromEmbeddings(args, len(self.vocab)) self.model = self.embedding, LSTMFromEmbeddings(args, len(self.vocab)) logger.info("Model initialized") self.embedding = self.embedding.to(self.device) self.model_from_embeddings = self.model_from_embeddings.to(self.device) self.word_embeddings = self.embedding def save(self, epoch): path = os.path.join(self.dir, 'ckpt_{}'.format(epoch)) torch.save({ 'state_dict_embedding': self.embedding.state_dict(), 'state_dict_model_from_embeddings': self.model_from_embeddings.state_dict(), 'epoch': epoch }, path) logger.info('LSTM saved: {}'.format(path)) def build_optimizer(self): self.model = (self.model[0], self.model_from_embeddings) param_group = [] for m in self.model: for p in m.named_parameters(): param_group.append(p) param_group = [{"params": [p[1] for p in param_group], "weight_decay": 0.}] return torch.optim.Adam(param_group, lr=self.lr) def get_input(self, batch): mask, tokens = [], [] for example in batch: _tokens = [] for token in example["sentence"].strip().lower().split(' ')[:self.max_seq_length]: if token in self.vocab: _tokens.append(token) else: _tokens.append("[UNK]") tokens.append(_tokens) max_seq_length = max([len(t) for t in tokens]) token_ids = [] for t in tokens: ids = [self.vocab[w] for w in t] mask.append(torch.cat([ torch.ones(1, len(ids)), torch.zeros(1, self.max_seq_length - len(ids)) ], dim=-1).to(self.device)) ids += [self.vocab["[PAD]"]] * (self.max_seq_length - len(ids)) token_ids.append(ids) embeddings = self.embedding(torch.tensor(token_ids, dtype=torch.long).to(self.device)) mask = torch.cat(mask, dim=0) label_ids = torch.tensor([example["label"] for example in batch]).to(self.device) return embeddings, mask, tokens, label_ids def train(self): self.model_from_embeddings.train() def eval(self): self.model_from_embeddings.eval() ================================================ FILE: examples/language/oracle.py ================================================ import torch from auto_LiRPA.utils import logger from auto_LiRPA import PerturbationSynonym from data_utils import get_batches def oracle(args, model, ptb, data, type): logger.info('Running oracle for {}'.format(type)) model.eval() assert(isinstance(ptb, PerturbationSynonym)) cnt_cor = 0 word_embeddings = model.word_embeddings.weight vocab = model.vocab for t, example in enumerate(data): embeddings, mask, tokens, label_ids = model.get_input([example]) candidates = example['candidates'] if tokens[0][0] == '[CLS]': candidates = [[]] + candidates + [[]] embeddings_all = [] def dfs(tokens, embeddings, budget, index): if index == len(tokens): embeddings_all.append(embeddings.cpu()) return dfs(tokens, embeddings, budget, index + 1) if budget > 0 and tokens[index] != '[UNK]' and len(candidates[index]) > 0\ and tokens[index] == candidates[index][0]: for w in candidates[index][1:]: if w in vocab: _embeddings = torch.cat([ embeddings[:index], word_embeddings[vocab[w]].unsqueeze(0), embeddings[index + 1:] ], dim=0) dfs(tokens, _embeddings, budget - 1, index + 1) dfs(tokens[0], embeddings[0], ptb.budget, 0) cor = True for embeddings in get_batches(embeddings_all, args.oracle_batch_size): embeddings_tensor = torch.cat(embeddings).cuda().reshape(len(embeddings), *embeddings[0].shape) logits = model.model_from_embeddings(embeddings_tensor, mask) for pred in list(torch.argmax(logits, dim=1)): if pred != example['label']: cor = False if not cor: break cnt_cor += cor if (t + 1) % args.log_interval == 0: logger.info('{} {}/{}: oracle robust acc {:.3f}'.format(type, t + 1, len(data), 
cnt_cor * 1. / (t + 1))) logger.info('{}: oracle robust acc {:.3f}'.format(type, cnt_cor * 1. / (t + 1))) ================================================ FILE: examples/language/preprocess/pre_compute_lm_scores.py ================================================ # Ref: https://worksheets.codalab.org/rest/bundles/0x3f614472f4a14393b3d85d5568114591/contents/blob/precompute_lm_scores.py """Precompute language model scores.""" import argparse import json import os import sys import torch from tqdm import tqdm from data_utils import load_data sys.path.insert(0, 'tmp/windweller-l2w/adaptive_softmax') import query as lmquery OPTS = None def parse_args(): parser = argparse.ArgumentParser('Insert a description of this script.') parser.add_argument('--data', type=str, default='sst') parser.add_argument('--out', default='tmp') parser.add_argument('--window-radius', '-w', type=int, default=6) parser.add_argument('--neighbor-file', type=str, default='tmp/synonyms.json') return parser.parse_args() def main(): device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') query_handler = lmquery.load_model(device) with open(OPTS.neighbor_file) as f: neighbors = json.load(f) data_train_warmup, data_train, data_dev, data_test = load_data(OPTS.data) split = [('train', data_train), ('dev', data_dev), ('test', data_test)] for s in split: data = s[1] out_file = os.path.join(OPTS.out, '{}_lm_scores.txt'.format(s[0])) with open(out_file, 'w') as f: for sent_idx, example in enumerate(tqdm(data)): sentence = example["sentence"] print('%d\t%s' % (sent_idx, sentence), file=f) words = sentence.lower().strip().split(' ') for i, w in enumerate(words): if w in neighbors: options = [w] + neighbors[w] start = max(0, i - OPTS.window_radius) end = min(len(words), i + 1 + OPTS.window_radius) # Remove OOV words from prefix and suffix prefix = [x for x in words[start:i] if x in query_handler.word_to_idx] suffix = [x for x in words[i+1:end] if x in query_handler.word_to_idx] queries = [] in_vocab_options = [] for opt in options: if opt in query_handler.word_to_idx: queries.append(prefix + [opt] + suffix) in_vocab_options.append(opt) else: print('%d\t%d\t%s\t%s' % (sent_idx, i, opt, float('-inf')), file=f) if queries: log_probs = query_handler.query(queries, batch_size=16) for x, lp in zip(in_vocab_options, log_probs): print('%d\t%d\t%s\t%s' % (sent_idx, i, x, lp), file=f) f.flush() if __name__ == '__main__': OPTS = parse_args() main() ================================================ FILE: examples/language/preprocess/preprocess_sst.py ================================================ import random, json def load_data_sst(): # training data path = "train-nodes.tsv" data_train_all_nodes = [] with open(path) as file: for line in file.readlines()[1:]: data_train_all_nodes.append({ "sentence": line.split("\t")[0], "label": int(line.split("\t")[1]) }) # train/dev/test data for subset in ["train", "dev", "test"]: path = "{}.txt".format(subset) data = [] with open(path) as file: for line in file.readlines(): segs = line[:-1].split(" ") tokens, word_labels = [], [] label = int(segs[0][1]) if label < 2: label = 0 elif label >= 3: label = 1 else: continue for i in range(len(segs) - 1): if segs[i][0] == "(" and segs[i][1] in ["0", "1", "2", "3", "4"]\ and segs[i + 1][0] != "(": tokens.append(segs[i + 1][:segs[i + 1].find(")")]) word_labels.append(int(segs[i][1])) data.append({ "label": label, "sentence": " ".join(tokens), "word_labels": word_labels }) if subset == "train": data_train = data elif subset == "dev": 
data_dev = data else: data_test = data return data_train_all_nodes, data_train, data_dev, data_test def read_scores(split): res = {} with open('{}_lm_scores.txt'.format(split)) as file: line = file.readline().strip().split('\t') while True: if len(line) < 2: break sentence = line[-1] tokens = sentence.lower().split(' ') candidates = [[] for i in range(len(tokens))] while True: line = file.readline().strip().split('\t') if len(line) != 4: break pos, word, score = int(line[1]), line[2], float(line[3]) if score == float('-inf'): continue if len(candidates[pos]) == 0: if word != tokens[pos]: continue elif score < candidates[pos][0][1] - 5.0: continue candidates[pos].append((word, score)) res[sentence] = [[w[0] for w in cand] for cand in candidates] return res data_train_all_nodes, data_train, data_dev, data_test = load_data_sst() candidates_dev = read_scores('dev') candidates_test = read_scores('test') for example in data_dev: example['candidates'] = candidates_dev[example['sentence']] for example in data_test: example['candidates'] = candidates_test[example['sentence']] with open('train_all_nodes.json', 'w') as file: file.write(json.dumps(data_train_all_nodes)) with open('train.json', 'w') as file: file.write(json.dumps(data_train)) with open('dev.json', 'w') as file: file.write(json.dumps(data_dev)) with open('test.json', 'w') as file: file.write(json.dumps(data_test)) ================================================ FILE: examples/language/train.py ================================================ import argparse import random import pickle import os import pdb import time import logging import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from torch.nn import CrossEntropyLoss from torch.utils.tensorboard import SummaryWriter from auto_LiRPA import BoundedModule, BoundedTensor, PerturbationSynonym, CrossEntropyWrapperMultiInput from auto_LiRPA.utils import MultiAverageMeter, logger, scale_gradients from auto_LiRPA.eps_scheduler import * from Transformer.Transformer import Transformer from lstm import LSTM from data_utils import load_data, clean_data, get_batches from oracle import oracle parser = argparse.ArgumentParser() parser.add_argument('--train', action='store_true') parser.add_argument('--robust', action='store_true') parser.add_argument('--oracle', action='store_true') parser.add_argument('--dir', type=str, default='model') parser.add_argument('--checkpoint', type=int, default=None) parser.add_argument('--data', type=str, default='sst', choices=['sst']) parser.add_argument('--seed', type=int, default=0) parser.add_argument('--device', type=str, default='cuda', choices=['cuda', 'cpu']) parser.add_argument('--load', type=str, default=None) parser.add_argument('--legacy_loading', action='store_true', help='use a deprecated way of loading checkpoints for previously saved models') parser.add_argument('--auto_test', action='store_true') parser.add_argument('--eps', type=float, default=1.0) parser.add_argument('--budget', type=int, default=6) parser.add_argument('--method', type=str, default=None, choices=['IBP', 'IBP+backward', 'IBP+backward_train', 'forward', 'forward+backward']) parser.add_argument('--model', type=str, default='transformer', choices=['transformer', 'lstm']) parser.add_argument('--num_epochs', type=int, default=25) parser.add_argument('--num_epochs_all_nodes', type=int, default=20) parser.add_argument('--eps_start', type=int, default=1) parser.add_argument('--eps_length', type=int, default=10) parser.add_argument('--log_interval', 
type=int, default=100) parser.add_argument('--min_word_freq', type=int, default=2) parser.add_argument('--batch_size', type=int, default=32) parser.add_argument('--oracle_batch_size', type=int, default=1024) parser.add_argument('--gradient_accumulation_steps', type=int, default=1) parser.add_argument('--max_sent_length', type=int, default=32) parser.add_argument('--vocab_size', type=int, default=50000) parser.add_argument('--lr', type=float, default=1e-4) parser.add_argument('--lr_decay', type=float, default=1) parser.add_argument('--grad_clip', type=float, default=10.0) parser.add_argument('--num_classes', type=int, default=2) parser.add_argument('--num_layers', type=int, default=1) parser.add_argument('--num_attention_heads', type=int, default=4) parser.add_argument('--hidden_size', type=int, default=64) parser.add_argument('--embedding_size', type=int, default=64) parser.add_argument('--intermediate_size', type=int, default=128) parser.add_argument('--drop_unk', action='store_true') parser.add_argument('--hidden_act', type=str, default='relu') parser.add_argument('--layer_norm', type=str, default='no_var', choices=['standard', 'no', 'no_var']) parser.add_argument('--loss_fusion', action='store_true') parser.add_argument('--dropout', type=float, default=0.1) parser.add_argument('--bound_opts_relu', type=str, default='zero-lb') args = parser.parse_args() writer = SummaryWriter(os.path.join(args.dir, 'log'), flush_secs=10) file_handler = logging.FileHandler(os.path.join(args.dir, 'log/train.log')) file_handler.setFormatter(logging.Formatter('%(levelname)-8s %(asctime)-12s %(message)s')) logger.addHandler(file_handler) data_train_all_nodes, data_train, data_dev, data_test = load_data(args.data) if args.robust: data_dev, data_test = clean_data(data_dev), clean_data(data_test) if args.auto_test: random.seed(args.seed) random.shuffle(data_test) data_test = data_test[:10] assert args.batch_size >= 10 # Use double precision and deterministic algorithm for automatic testing. 
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' torch.use_deterministic_algorithms(True) torch.set_default_dtype(torch.float64) logger.info('Dataset sizes: {}/{}/{}/{}'.format( len(data_train_all_nodes), len(data_train), len(data_dev), len(data_test))) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) dummy_embeddings = torch.zeros(1, args.max_sent_length, args.embedding_size, device=args.device) dummy_labels = torch.zeros(1, dtype=torch.long, device=args.device) if args.model == 'transformer': dummy_mask = torch.zeros(1, 1, 1, args.max_sent_length, device=args.device) model = Transformer(args, data_train) elif args.model == 'lstm': dummy_mask = torch.zeros(1, args.max_sent_length, device=args.device) model = LSTM(args, data_train) dev_batches = get_batches(data_dev, args.batch_size) test_batches = get_batches(data_test, args.batch_size) ptb = PerturbationSynonym(budget=args.budget) dummy_embeddings = BoundedTensor(dummy_embeddings, ptb) model_ori = model.model_from_embeddings bound_opts = { 'activation_bound_option': args.bound_opts_relu, 'exp': 'no-max-input', 'fixed_reducemax_index': True } if isinstance(model_ori, BoundedModule): model_bound = model_ori else: model_bound = BoundedModule( model_ori, (dummy_embeddings, dummy_mask), bound_opts=bound_opts, device=args.device) model.model_from_embeddings = model_bound if args.loss_fusion: bound_opts['loss_fusion'] = True model_loss = BoundedModule( CrossEntropyWrapperMultiInput(model_ori), (torch.zeros(1, dtype=torch.long), dummy_embeddings, dummy_mask), bound_opts=bound_opts, device=args.device) ptb.model = model optimizer = model.build_optimizer() if args.lr_decay < 1: lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=args.lr_decay) else: lr_scheduler = None if args.robust: eps_scheduler = LinearScheduler(args.eps, 'start={},length={}'.format(args.eps_start, args.eps_length)) for i in range(model.checkpoint): eps_scheduler.step_epoch(verbose=False) else: eps_scheduler = None logger.info('Model converted to support bounds') def step(model, ptb, batch, eps=1.0, train=False): model_bound = model.model_from_embeddings if train: model.train() model_bound.train() grad = torch.enable_grad() if args.loss_fusion: model_loss.train() else: model.eval() model_bound.eval() grad = torch.no_grad() if args.auto_test: grad = torch.enable_grad() with grad: ptb.set_eps(eps) ptb.set_train(train) embeddings_unbounded, mask, tokens, labels = model.get_input(batch) aux = (tokens, batch) if args.robust and eps > 1e-9: embeddings = BoundedTensor(embeddings_unbounded, ptb) else: embeddings = embeddings_unbounded.detach().requires_grad_(True) robust = args.robust and eps > 1e-6 if train and robust and args.loss_fusion: # loss_fusion loss if args.method == 'IBP+backward_train': lb, ub = model_loss.compute_bounds( x=(labels, embeddings, mask), aux=aux, C=None, method='IBP+backward', bound_lower=False) else: raise NotImplementedError loss_robust = torch.log(ub).mean() loss = acc = acc_robust = -1 # unknown else: # regular loss logits = model_bound(embeddings, mask) loss = CrossEntropyLoss()(logits, labels) acc = (torch.argmax(logits, dim=1) == labels).float().mean() if robust: num_class = args.num_classes c = torch.eye(num_class).type_as(embeddings)[labels].unsqueeze(1) - \ torch.eye(num_class).type_as(embeddings).unsqueeze(0) I = (~(labels.data.unsqueeze(1) == torch.arange(num_class).type_as(labels.data).unsqueeze(0))) c = (c[I].view(embeddings.size(0), num_class - 1, 
num_class)) if args.method in ['IBP', 'IBP+backward', 'forward', 'forward+backward']: lb, ub = model_bound.compute_bounds(aux=aux, C=c, method=args.method, bound_upper=False) elif args.method == 'IBP+backward_train': # CROWN-IBP if 1 - eps > 1e-4: lb, ub = model_bound.compute_bounds(aux=aux, C=c, method='IBP+backward', bound_upper=False) ilb, iub = model_bound.compute_bounds(aux=aux, C=c, method='IBP', reuse_ibp=True) lb = eps * ilb + (1 - eps) * lb else: lb, ub = model_bound.compute_bounds(aux=aux, C=c, method='IBP') else: raise NotImplementedError lb_padded = torch.cat((torch.zeros(size=(lb.size(0),1), dtype=lb.dtype, device=lb.device), lb), dim=1) fake_labels = torch.zeros(size=(lb.size(0),), dtype=torch.int64, device=lb.device) loss_robust = robust_ce = CrossEntropyLoss()(-lb_padded, fake_labels) acc_robust = 1 - torch.mean((lb < 0).any(dim=1).float()) else: acc_robust, loss_robust = acc, loss if train or args.auto_test: loss_robust.backward() grad_embed = torch.autograd.grad( embeddings_unbounded, model.word_embeddings.weight, grad_outputs=embeddings.grad)[0] if model.word_embeddings.weight.grad is None: model.word_embeddings.weight.grad = grad_embed else: model.word_embeddings.weight.grad += grad_embed if args.auto_test: print('Saving results for automated tests.') print(f'acc={acc}, loss={loss}, robust_acc={acc_robust}, robust_loss={loss_robust}') print('gradients:') print(grad_embed) with open('res_test.pkl', 'wb') as file: pickle.dump(( float(acc), float(loss), float(acc_robust), float(loss_robust), grad_embed.detach().numpy()), file) return acc, loss, acc_robust, loss_robust def train(epoch, batches, type): meter = MultiAverageMeter() assert(optimizer is not None) train = type == 'train' if args.robust: eps_scheduler.set_epoch_length(len(batches)) if train: eps_scheduler.train() eps_scheduler.step_epoch() else: eps_scheduler.eval() for i, batch in enumerate(batches): if args.robust: eps_scheduler.step_batch() eps = eps_scheduler.get_eps() else: eps = 0 acc, loss, acc_robust, loss_robust = step( model, ptb, batch, eps=eps, train=train) meter.update('acc', acc, len(batch)) meter.update('loss', loss, len(batch)) meter.update('acc_rob', acc_robust, len(batch)) meter.update('loss_rob', loss_robust, len(batch)) if train: if (i + 1) % args.gradient_accumulation_steps == 0 or (i + 1) == len(batches): scale_gradients(optimizer, i % args.gradient_accumulation_steps + 1, args.grad_clip) optimizer.step() optimizer.zero_grad() if lr_scheduler is not None: lr_scheduler.step() writer.add_scalar('loss_train_{}'.format(epoch), meter.avg('loss'), i + 1) writer.add_scalar('loss_robust_train_{}'.format(epoch), meter.avg('loss_rob'), i + 1) writer.add_scalar('acc_train_{}'.format(epoch), meter.avg('acc'), i + 1) writer.add_scalar('acc_robust_train_{}'.format(epoch), meter.avg('acc_rob'), i + 1) if (i + 1) % args.log_interval == 0 or (i + 1) == len(batches): logger.info('Epoch {}, {} step {}/{}: eps {:.5f}, {}'.format( epoch, type, i + 1, len(batches), eps, meter)) if lr_scheduler is not None: logger.info('lr {}'.format(lr_scheduler.get_lr())) writer.add_scalar('loss/{}'.format(type), meter.avg('loss'), epoch) writer.add_scalar('loss_robust/{}'.format(type), meter.avg('loss_rob'), epoch) writer.add_scalar('acc/{}'.format(type), meter.avg('acc'), epoch) writer.add_scalar('acc_robust/{}'.format(type), meter.avg('acc_rob'), epoch) if train: if args.loss_fusion: state_dict_loss = model_loss.state_dict() state_dict = {} for name in state_dict_loss: assert(name.startswith('model.')) state_dict[name[6:]] = 
state_dict_loss[name] model_ori.load_state_dict(state_dict) model_bound = BoundedModule( model_ori, (dummy_embeddings, dummy_mask), bound_opts=bound_opts, device=args.device) model.model_from_embeddings = model_bound model.save(epoch) return meter.avg('acc_rob') def main(): if args.train: for t in range(model.checkpoint, args.num_epochs): if t + 1 <= args.num_epochs_all_nodes: train(t + 1, get_batches(data_train_all_nodes, args.batch_size), 'train') else: train(t + 1, get_batches(data_train, args.batch_size), 'train') train(t + 1, dev_batches, 'dev') train(t + 1, test_batches, 'test') elif args.oracle: oracle(args, model, ptb, data_test, 'test') else: if args.robust: for i in range(args.num_epochs): eps_scheduler.step_epoch(verbose=False) res = [] for i in range(1, args.budget + 1): logger.info('budget {}'.format(i)) ptb.budget = i acc_rob = train(None, test_batches, 'test') res.append(acc_rob) logger.info('Verification results:') for i in range(len(res)): logger.info('budget {} acc_rob {:.3f}'.format(i + 1, res[i])) logger.info(res) else: train(None, test_batches, 'test') if __name__ == '__main__': main() ================================================ FILE: examples/sequence/.gitignore ================================================ model/ data/ ================================================ FILE: examples/sequence/__init__.py ================================================ ================================================ FILE: examples/sequence/data_utils.py ================================================ import random from torchvision import transforms from torchvision.datasets.mnist import MNIST as mnist def load_data(): transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) data_train = mnist("data", train=True, download=True, transform=transform) data_test = mnist("data", train=False, download=True, transform=transform) data_train = [data_train[i] for i in range(len(data_train))] data_test = [data_test[i] for i in range(len(data_test))] return data_train, data_test def get_batches(data, batch_size): batches = [] random.shuffle(data) for i in range((len(data) + batch_size - 1) // batch_size): batches.append(data[i * batch_size : (i + 1) * batch_size]) return batches ================================================ FILE: examples/sequence/lstm.py ================================================ import os import shutil import torch import torch.nn as nn from auto_LiRPA.utils import logger class LSTMCore(nn.Module): def __init__(self, args): super(LSTMCore, self).__init__() self.input_size = args.input_size // args.num_slices self.hidden_size = args.hidden_size self.num_classes = args.num_classes self.device = args.device self.cell_f = nn.LSTMCell(self.input_size, self.hidden_size) self.linear = nn.Linear(self.hidden_size, self.num_classes) def forward(self, X): batch_size, length = X.shape[0], X.shape[1] h_f = torch.zeros(batch_size, self.hidden_size).to(X.device) c_f = h_f.clone() h_f_sum = h_f.clone() for i in range(length): h_f, c_f = self.cell_f(X[:, i], (h_f, c_f)) h_f_sum = h_f_sum + h_f states = h_f_sum / float(length) logits = self.linear(states) return logits class LSTM(nn.Module): def __init__(self, args): super(LSTM, self).__init__() self.args = args self.device = args.device self.lr = args.lr self.num_slices = args.num_slices self.dir = args.dir if not os.path.exists(self.dir): os.makedirs(self.dir) self.checkpoint = 0 self.model = LSTMCore(args) if args.load: self.model.load_state_dict(args.load) logger.info(f"Model 
loaded: {args.load}") else: logger.info("Model initialized") self.model = self.model.to(self.device) self.core = self.model def save(self, epoch): output_dir = os.path.join(self.dir, "ckpt-%d" % epoch) if os.path.exists(output_dir): shutil.rmtree(output_dir) os.mkdir(output_dir) path = os.path.join(output_dir, "model") torch.save(self.core.state_dict(), path) with open(os.path.join(self.dir, "checkpoint"), "w") as file: file.write(str(epoch)) logger.info("LSTM saved: %s" % output_dir) def build_optimizer(self): param_group = [] for p in self.core.named_parameters(): param_group.append(p) param_group = [{"params": [p[1] for p in param_group], "weight_decay": 0.}] return torch.optim.Adam(param_group, lr=self.lr) def get_input(self, batch): X = torch.cat([example[0].reshape(1, self.num_slices, -1) for example in batch]) y = torch.tensor([example[1] for example in batch], dtype=torch.long) return X.to(self.device), y.to(self.device) def train(self): self.core.train() def eval(self): self.core.eval() ================================================ FILE: examples/sequence/train.py ================================================ import argparse import random import torch import torch.nn as nn import torch.nn.functional as F import numpy as np from lstm import LSTM from data_utils import load_data, get_batches from auto_LiRPA import BoundedModule, BoundedTensor, PerturbationLpNorm from auto_LiRPA.utils import MultiAverageMeter, logger, get_spec_matrix parser = argparse.ArgumentParser() parser.add_argument("--seed", type=int, default=0) parser.add_argument("--load", type=str, default=None) parser.add_argument("--device", type=str, default="cuda", choices=["cuda", "cpu"]) parser.add_argument("--norm", type=int, default=np.inf) parser.add_argument("--eps", type=float, default=0.1) parser.add_argument("--num_epochs", type=int, default=20) parser.add_argument("--batch_size", type=int, default=512) parser.add_argument("--num_slices", type=int, default=8) parser.add_argument("--hidden_size", type=int, default=256) parser.add_argument("--num_classes", type=int, default=10) parser.add_argument("--input_size", type=int, default=784) parser.add_argument("--lr", type=float, default=1e-2) parser.add_argument("--dir", type=str, default="model", help="directory to load or save the model") parser.add_argument("--num_epochs_warmup", type=int, default=10, help="number of epochs for the warmup stage when eps is linearly increased from 0 to the full value") parser.add_argument("--log_interval", type=int, default=10, help="interval of printing the log during training") args = parser.parse_args() ## Train or test one batch. def step(model, ptb, batch, eps=args.eps, train=False): # We increase the perturbation each batch. ptb.set_eps(eps) # We create a BoundedTensor object with current batch of data. X, y = model.get_input(batch) X = BoundedTensor(X, ptb) logits = model.core(X) # Form the linear speicifications, which are margins of ground truth class and other classes. num_class = args.num_classes c = get_spec_matrix(X, y, num_class) # Compute CROWN-IBP (IBP+backward) bounds for training. We only need the lower bound. # Here we can omit the x=(X,) argument because we have just used X for forward propagation. lb, ub = model.core.compute_bounds(C=c, method='CROWN-IBP', bound_upper=False) # Compute robust cross entropy loss. 
lb_padded = torch.cat((torch.zeros(size=(lb.size(0),1), dtype=lb.dtype, device=lb.device), lb), dim=1) fake_labels = torch.zeros(size=(lb.size(0),), dtype=torch.int64, device=lb.device) loss = nn.CrossEntropyLoss()(-lb_padded, fake_labels) # Report accuracy and robust accuracy. acc = (torch.argmax(logits, dim=-1) == y).float().mean() acc_robust = 1 - torch.mean((lb < 0).any(dim=1).float()) if train: loss.backward() return acc.detach(), acc_robust.detach(), loss.detach() ## Train one epoch. def train(epoch): meter = MultiAverageMeter() model.train() # Load data for a epoch. train_batches = get_batches(data_train, args.batch_size) eps_inc_per_step = 1.0 / (args.num_epochs_warmup * len(train_batches)) for i, batch in enumerate(train_batches): # We increase eps linearly every batch. eps = args.eps * min(eps_inc_per_step * ((epoch - 1) * len(train_batches) + i + 1), 1.0) # Call the main training loop. acc, acc_robust, loss = step(model, ptb, batch, eps=eps, train=True) # Optimize the loss. torch.nn.utils.clip_grad_norm_(model.core.parameters(), 5.0) optimizer.step() optimizer.zero_grad() meter.set_batch_size(len(batch)) meter.update('acc', acc) meter.update('acc_rob', acc_robust) meter.update('loss', loss) if (i + 1) % args.log_interval == 0: logger.info("Epoch {}, training step {}/{}: {}, eps {:.3f}".format( epoch, i + 1, len(train_batches), meter, eps)) model.save(epoch) ## Test accuracy and robust accuracy. def test(epoch, batches): meter = MultiAverageMeter() model.eval() for batch in batches: acc, acc_robust, loss = step(model, ptb, batch) meter.set_batch_size(len(batch)) meter.update('acc', acc) meter.update('acc_rob', acc_robust) meter.update('loss', loss) logger.info("Epoch {} test: {}".format(epoch, meter)) # Load MNIST dataset logger.info("Loading data...") data_train, data_test = load_data() logger.info("Dataset sizes: {}/{}".format(len(data_train), len(data_test))) test_batches = get_batches(data_test, args.batch_size) # Set all random seeds. random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Create a LSTM sequence classifier. logger.info("Creating LSTM model...") model = LSTM(args).to(args.device) X, y = model.get_input(test_batches[0]) # Create the perturbation object once here, and we can reuse it. ptb = PerturbationLpNorm(norm=args.norm, eps=args.eps) # Convert the LSTM to BoundedModule X = BoundedTensor(X, ptb) model.core = BoundedModule(model.core, (X,), device=args.device) optimizer = model.build_optimizer() # Main training loop. for t in range(model.checkpoint, args.num_epochs): train(t + 1) test(t + 1, test_batches) # If the loaded model has already reached the last epoch, test it directly. if model.checkpoint == args.num_epochs: test(args.num_epochs, test_batches) ================================================ FILE: examples/simple/invprop.py ================================================ """ A toy example for bounding neural network outputs under input perturbations using INVPROP See https://arxiv.org/abs/2302.01404 """ import torch from collections import defaultdict from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import PerturbationLpNorm class simple_model(torch.nn.Module): """ A very simple 2-layer neural network for demonstration. """ def __init__(self): super().__init__() # Weights of linear layers. self.w1 = torch.tensor([[1., -1.], [2., -1.]]) self.w2 = torch.tensor([[1., -1.]]) def forward(self, x): # Linear layer. z1 = x.matmul(self.w1.t()) # Relu layer. 
hz1 = torch.nn.functional.relu(z1) # Linear layer. z2 = hz1.matmul(self.w2.t()) return z2 model = simple_model() # Input x. x = torch.tensor([[1., 1.]]) # Lower and upper bounds of x. lower = torch.tensor([[-1., -2.]]) upper = torch.tensor([[2., 1.]]) # Compute bounds using LiRPA with the given lower and upper bounds. norm = float("inf") ptb = PerturbationLpNorm(norm = norm, x_L=lower, x_U=upper) bounded_x = BoundedTensor(x, ptb) # INVPROP configuration # apply_output_constraints_to: list of layer names or types to which the output # constraints should be applied. Here, they will be applied to all layers of type # 'BoundMatMul' and 'BoundInput'. To only apply them to specific layers, use their # names, e.g. ['/0', '/z1']. The currently recommended way to get those names is # either to first construct an instance of BoundedModule with arbitrary bound_opts, # print it to stdout and inspect their names manually, or to access the layer names # as lirpa_model.final_node().inputs[0].inputs[0].name # tighten_input_bounds: whether to tighten the input bounds. This will modify the # perturbation of the input. If set, apply_output_constraints_to should contain # 'BoundInput' or the corresponding layer name. Otherwise, this will have no effect. # Similarly, adding 'BoundInput' to apply_output_constraints_to will have no effect # unless tighten_input_bounds is set. # best_of_oc_and_no_oc: Using output constraints may sometimes lead to worse results, # because the optimization might find bad local minima. If this is set to True, # every optimization step will be run twice, once with and once without output # constraints, and the better result will be chosen. # directly_optimize: Usually, only linear layers preceding non-linear layers are # optimized using output constraints. If you want to optimize a specific layer that # would usually be skipped, add its name to this list. This is most likely to be # used when preimages should be computed as they might use linear combinations of # the inputs. This requires the use of sequential linear layers. For detailed # examples, see https://github.com/kothasuhas/verify-input # oc_lr: Learning rate for the optimization of output constraints. # share_gammas: Whether neurons in each layer should share the same gamma lirpa_model = BoundedModule(model, torch.empty_like(x), bound_opts={ 'optimize_bound_args': { 'apply_output_constraints_to': ['BoundMatMul', 'BoundInput'], 'tighten_input_bounds': True, 'best_of_oc_and_no_oc': False, 'directly_optimize': [], 'oc_lr': 0.1, 'share_gammas': False, 'iteration': 1000, } }) # To dynamically set the apply_output_constraints_to option, set it to `[]` in the # above code, and then use the following: # lirpa_model.set_bound_opts({ # 'optimize_bound_args': { # 'apply_output_constraints_to': [ # lirpa_model.final_node().inputs[0].inputs[0].inputs[0].name, # lirpa_model.final_node().inputs[0].inputs[0].name, # ] # } # }) # The scalar output must be <= -1 # Constraints have the shape [1, num_constraints, num_output_neurons] # They are treated as conjunctions, i.e., all constraints must be satisfied.
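# Concretely, row k of `constraints` together with entry k of `thresholds` encodes the
# linear constraint constraints[0, k, :] @ f(x) <= thresholds[k] on the model output f(x).
# With a single output neuron, constraints = ones(1, 1, 1) and thresholds = [-1.] below
# encode f(x) <= -1. As a hypothetical illustration, for a model with two outputs, the
# pair of constraints f_0(x) <= 0 and f_0(x) - f_1(x) <= 1 could be written as:
#   constraints = torch.tensor([[[1., 0.], [1., -1.]]])  # shape [1, 2, 2]
#   thresholds = torch.tensor([0., 1.])                   # shape [2]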
lirpa_model.constraints = torch.ones(1,1,1) # Thresholds have the shape [num_constraints] lirpa_model.thresholds = torch.tensor([-1.]) print(f"Original perturbation: x0: [{ptb.x_L[0][0]}, {ptb.x_U[0][0]}], x1: [{ptb.x_L[0][1]}, {ptb.x_U[0][1]}]") lb, ub = lirpa_model.compute_bounds(x=(bounded_x,), method='alpha-CROWN') tightened_ptb = lirpa_model['/0'].perturbation print(f"Tightened perturbation: x0: [{tightened_ptb.x_L[0][0]}, {tightened_ptb.x_U[0][0]}], x1: [{tightened_ptb.x_L[0][1]}, {tightened_ptb.x_U[0][1]}]") # For the bounds without output constraints, refer to toy.py print(f'alpha-CROWN bounds without output constraints: lower=-3, upper=2') print(f'alpha-CROWN bounds with output constraints: lower={lb.item()}, upper={ub.item()}') ================================================ FILE: examples/simple/lp_full.py ================================================ """ A simple example for bounding neural network outputs using LP/MIP solvers. Auto_LiRPA supports constructing LP/MIP optimization formulations (using Gurobi). This example uses LP to solve all intermediate layer bounds and final layer bounds, reflecting the setting in the paper "A Convex Relaxation Barrier to Tight Robustness Verification of Neural Networks". This is sometimes referred to as the LP-Full setting. This is in general, very slow; alpha-CROWN is generally recommended to compute intermediate layer bound rather than LP. Example usage: python lp_full.py --index 0 --norm 2.0 --perturbation 1.0 Here `--index` is the dataset index (MNIST in this example), `--norm` is the Lp perturbation norm used and `--perturbation` is the magnitude of the perturbation added to model input. """ import torch import torch.nn as nn import torchvision from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import PerturbationLpNorm from auto_LiRPA.operators import BoundLinear, BoundConv import gurobipy as grb import time import numpy as np import argparse # Help function for generating output matrix. This function used for # generating matrix C to calculate the margin between true class and # the other classes. def build_C(label, classes): """ label: shape (B,). Each label[b] in [0..classes-1]. Return: C: shape (B, classes-1, classes). For each sample b, each row is a "negative class" among [0..classes-1]\{label[b]}. Puts +1 at column=label[b], -1 at each negative class column. """ device = label.device batch_size = label.size(0) # 1) Initialize C = torch.zeros((batch_size, classes-1, classes), device=device) # 2) All class indices # shape: (1, K) -> (B, K) all_cls = torch.arange(classes, device=device).unsqueeze(0).expand(batch_size, -1) # 3) Negative classes only, shape (B, K-1) # mask out the ground-truth mask = all_cls != label.unsqueeze(1) neg_cls = all_cls[mask].view(batch_size, -1) # 4) Scatter +1 at each sample’s ground-truth label # shape needed: (B, K-1, 1) pos_idx = label.unsqueeze(1).expand(-1, classes-1).unsqueeze(-1) C.scatter_(dim=2, index=pos_idx, value=1.0) # 5) Scatter -1 at each row’s negative label # We have (B, K-1) negative labels. 
For row j in each sample b, neg_cls[b, j] is that row’s negative label row_idx = torch.arange(classes-1, device=device).unsqueeze(0).expand(batch_size, -1) # shape: (B, K-1) # We can do advanced indexing: C[torch.arange(batch_size).unsqueeze(1), row_idx, neg_cls] = -1.0 return C parser = argparse.ArgumentParser() parser.add_argument('--index', default=0, type=int, help='Index of data example (from MNIST dataset).') parser.add_argument('--norm', default='inf', type=str, help='Input perturbation norm.') parser.add_argument('--perturbation', default=0.05, type=float, help='Input perturbation magnitude.') parser.add_argument('--lr', default=0.5, type=float, help='Learning rate for alpha_crown.') parser.add_argument('--iteration', default=30, type=int, help='Iterations for alpha_crown.') args = parser.parse_args() ## Step 1: Define computational graph by implementing forward() # You can create your own model here. model = nn.Sequential( nn.Flatten(), nn.Linear(784, 100), nn.ReLU(), nn.Linear(100, 100), nn.ReLU(), nn.Linear(100, 10) ) # Optionally, load the pretrained weights. checkpoint = torch.load('./models/spectral_NOR_MLP_B.pth', weights_only=True) model.load_state_dict(checkpoint) ## Step 2: Prepare dataset. test_data = torchvision.datasets.MNIST( './data', train=False, download=True, transform=torchvision.transforms.ToTensor()) n_classes = 10 image = test_data.data[args.index].to(torch.float32).unsqueeze(0).unsqueeze(0) / 255.0 true_label = torch.tensor([test_data.targets[args.index]]) ## Step 3: Define perturbation. eps = args.perturbation norm = float(args.norm) # The lower and upper bounds of the MNIST dataset are [0,1]; # replace the bounds if using another dataset. if norm == float('inf'): x_U = None x_L = None else: x_U = torch.ones_like(image) x_L = torch.zeros_like(image) ptb = PerturbationLpNorm(norm = norm, eps = eps, x_U = x_U, x_L = x_L) print(f'Verification of MNIST data index {args.index} with L{args.norm} perturbation of {args.perturbation}\n') # Here we only use one image as input. image = BoundedTensor(image, ptb) print('Running LP-Full with LPs for all intermediate layers...') start_time = time.time() ## Step 4: Compute the bounds of different methods. # For CROWN/alpha-CROWN, we use the compute_bounds() method. # For LP and MIP, we use the build_solver_module() method. interm_bounds = {} lirpa_model = BoundedModule(model, image, device=image.device) # Store the output shape for each layer first for node in lirpa_model.nodes(): # For each intermediate layer, we first set its bounds to infinity as placeholders. if hasattr(node, 'output_shape'): interm_lb = torch.full(node.output_shape, -float('inf')) interm_ub = torch.full(node.output_shape, float('inf')) interm_bounds[node.name] = [interm_lb, interm_ub] # C is the specification matrix (groundtruth - target class). C = build_C(true_label, classes=n_classes) # Here we assume that the last node is the model output, and we start from intermediate layers first. # Technically, here we need a topological sort of all model nodes if the computation graph is general. for node in lirpa_model.nodes(): # For simplicity, we assume the model contains linear, conv, and ReLU layers. # We need to calculate the preactivation bounds before each ReLU layer, which are the bounds of the linear or conv layers.
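    # In other words, layers are visited in order: for each linear/conv layer an LP is built
    # via build_solver_module() using the bounds already stored in `interm_bounds` for the
    # preceding layers, every neuron of the current layer is bounded by solving two LPs
    # (maximize and minimize its Gurobi variable), and the results are written back into
    # `interm_bounds` so that the relaxation of the following ReLU layer can use them.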
if isinstance(node, (BoundLinear, BoundConv)): interm_lb = torch.full(node.output_shape, -float('inf')) interm_ub = torch.full(node.output_shape, float('inf')) if node.is_final_node: print(f'Solving LPs for final layer bounds...') # Last node, all intermediate layer bounds have been obtained. # For last node, we need to use the specification matrix C to calculate the bounds on groundtruth - target labels. solver_vars = lirpa_model.build_solver_module(model_type='lp', x=(image,), final_node_name=node.name, interm_bounds=interm_bounds, C=C) lirpa_model.solver_model.setParam('OutputFlag', 0) final_lb = torch.empty(n_classes-1) final_ub = torch.empty(n_classes-1) for i in range(n_classes-1): print(f'Solving class {i}...') # Now you can define objectives based on the variables on the output layer. # And then solve them using gurobi. Here we just output the lower and upper # bounds for each output neuron. # Solve upper bound. lirpa_model.solver_model.setObjective(solver_vars[i], grb.GRB.MAXIMIZE) lirpa_model.solver_model.optimize() # If the solver does not terminate, you will get a NaN. if lirpa_model.solver_model.status == grb.GRB.Status.OPTIMAL: final_ub[i] = lirpa_model.solver_model.objVal # Solve lower bound. lirpa_model.solver_model.setObjective(solver_vars[i], grb.GRB.MINIMIZE) lirpa_model.solver_model.optimize() if lirpa_model.solver_model.status == grb.GRB.Status.OPTIMAL: final_lb[i] = lirpa_model.solver_model.objVal else: print(f'Solving LPs for layer {node.name} intermediate layer bounds...') # Solve intermediate layer bounds, one by one. solver_vars = lirpa_model.build_solver_module(model_type='lp', x=(image,), final_node_name=node.name, interm_bounds=interm_bounds) lirpa_model.solver_model.setParam('OutputFlag', 0) # For linear layer, the solver_vars shape is: (neurons). if isinstance(node, BoundLinear): for i, var in enumerate(solver_vars): lirpa_model.solver_model.setObjective(var, grb.GRB.MAXIMIZE) lirpa_model.solver_model.optimize() if lirpa_model.solver_model.status == grb.GRB.Status.OPTIMAL: interm_ub[0][i] = lirpa_model.solver_model.objVal # Solve lower bound. lirpa_model.solver_model.setObjective(var, grb.GRB.MINIMIZE) lirpa_model.solver_model.optimize() if lirpa_model.solver_model.status == grb.GRB.Status.OPTIMAL: interm_lb[0][i] = lirpa_model.solver_model.objVal # For convolutional layer, the solver_vars shape is (channel, out_w, out_h). elif isinstance(node, BoundConv): for i,channel in enumerate(solver_vars): for j, row in enumerate(channel): for k, var in enumerate(row): lirpa_model.solver_model.setObjective(var, grb.GRB.MAXIMIZE) lirpa_model.solver_model.optimize() if lirpa_model.solver_model.status == grb.GRB.Status.OPTIMAL: interm_ub[0][i][j][k] = lirpa_model.solver_model.objVal # Solve lower bound. 
lirpa_model.solver_model.setObjective(var, grb.GRB.MINIMIZE) lirpa_model.solver_model.optimize() if lirpa_model.solver_model.status == grb.GRB.Status.OPTIMAL: interm_lb[0][i][j][k] = lirpa_model.solver_model.objVal interm_bounds[node.name] = [interm_lb, interm_ub] print(f'Finished solving layer {node.name} with {len(solver_vars)} neurons') end_time = time.time() lp_time = end_time - start_time print(f'LP-Full time: {lp_time}\n') lirpa_model = BoundedModule(model, torch.empty_like(image), device=image.device) lirpa_model.set_bound_opts({'optimize_bound_args': {'iteration': args.iteration, 'lr_alpha': args.lr}}) start_time = time.time() print(f'Running alpha-CROWN with {args.iteration} iterations and learning rate of {args.lr}...') crown_lb, crown_ub = lirpa_model.compute_bounds(x=(image, ), C=C, method='alpha-CROWN') end_time = time.time() alpha_crown_time = end_time - start_time print(f'alpha-CROWN time: {alpha_crown_time}') # Step 5: output the final results of each method. print(f'\nResults for dataset index: {args.index}') print(f'LP-Full bounds:') for i in range(n_classes-1): if i == true_label.item(): label = i + 1 else: label = i print('{l:8.3f} <= f_{k} - f_{j} <= {u:8.3f}'.format( k=true_label.item(), j=label, l=final_lb[i].item(), u=final_ub[i].item())) # Alpha-CROWN should achieve similar results as LP full but without running any LPs. print(f'\nalpha-CROWN bounds:') for i in range(n_classes-1): if i == true_label.item(): label = i + 1 else: label = i print('{l:8.3f} <= f_{k} - f_{j} <= {u:8.3f}'.format( k=true_label.item(), j=label, l=crown_lb[0][i].item(), u=crown_ub[0][i].item())) print(f'alpha-CROWN bounds and LP-full bounds should be close for Linf norm; ' 'adjust the number of iterations and learning rate when necessary.\n') ================================================ FILE: examples/simple/mip_lp_solver.py ================================================ """ A simple example for bounding neural network outputs using LP/MIP solvers. Auto_LiRPA supports constructing LP/MIP optimization formulations (using Gurobi). This example serves as a skeleton for using the build_solver_module() method to obtain LP/MIP formulations of neural networks. Note that alpha-CROWN is used to calculate intermediate layer bounds for constructing the convex relaxation of ReLU neurons. So we are actually using "alpha-CROWN+MIP" or "alpha-CROWN+LP" here. Calculating intermediate layer bounds using LP/MIP is often impractical due to the high cost. """ import torch import torch.nn as nn import torchvision from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import PerturbationLpNorm import gurobipy as grb ## Step 1: Define computational graph by implementing forward() # You can create your own model here. class mnist_model(nn.Module): def __init__( self, input_size=28*28, hidden_size=128, hidden_size_2=64, output_size=10): super(mnist_model, self).__init__() self.fc1 = nn.Linear(input_size, hidden_size) self.fc2 = nn.Linear(hidden_size, hidden_size_2) self.fc3 = nn.Linear(hidden_size_2, output_size) self.relu = nn.ReLU() def forward(self, x): x = x.view(-1, 784) out = self.fc1(x) out = self.relu(out) out = self.fc2(out) out = self.relu(out) out = self.fc3(out) return out model = mnist_model() # Optionally, load the pretrained weights. checkpoint = torch.load('../vision/pretrained/mnist_fc_3layer.pth') model.load_state_dict(checkpoint) ## Step 2: Prepare dataset. 
test_data = torchvision.datasets.MNIST( './data', train=False, download=True, transform=torchvision.transforms.ToTensor()) # For illustration we only use 1 image from dataset. N = 1 n_classes = 10 image = test_data.data[:N].view(N, 1, 28, 28) true_label = test_data.targets[:N] image = image.to(torch.float32) / 255.0 ## Step 3: Define perturbation. eps = 0.03 norm = float("inf") ptb = PerturbationLpNorm(norm = norm, eps = eps) # Here we only use one image as input image = BoundedTensor(image[0], ptb) ## Step 4: Compute the bounds of different methods. # For CROWN/alpha-CROWN, we use the compute_bounds() method. # For LP and MIP, we use the build_solver_module() method. result = {} # Note that here 'lp' or 'mip' are essentially 'alpha-CROWN+lp' and 'alpha-CROWN+mip'. # We use alpha-CROWN to calculate all the intermediate layer bounds for LP/MIP, because # using MIP/LP for all intermediate neurons will be very slow. for method in ['alpha-CROWN','lp','mip']: # To get clean results and avoid interference among methods, we create a # new BoundedModule object. However, in your production code please pay # attention that BoundedModule() has high construction overhead. lirpa_model = BoundedModule(model, torch.empty_like(image[0]), device=image.device) # Call alpha-CROWN first, which gives all intermediate layer bounds. lb, ub = lirpa_model.compute_bounds(x=(image,), method='alpha-CROWN') if method != 'alpha-CROWN': lb = torch.full_like(lb, float('nan')) ub = torch.full_like(ub, float('nan')) # Obtain the optimizer (Gurobi) variables for the output layer. # Auto_LiRPA will construct the LP/MIP formulation based on computation graph. # Note that pre-activation bounds are required for using this function. # Preactivation bounds have been computed using alpha-CROWN above. solver_vars = lirpa_model.build_solver_module(model_type=method) # Set some parameters for Gurobi optimizer. lirpa_model.solver_model.setParam('OutputFlag', 0) for i in range(n_classes): print(f'Solving class {i} with method {method}') # Now you can define objectives based on the variables on the output layer. # And then solve them using gurobi. Here we just output the lower and upper # bounds for each output neuron. # Solve upper bound. lirpa_model.solver_model.setObjective(solver_vars[i], grb.GRB.MAXIMIZE) lirpa_model.solver_model.optimize() # If the solver does not terminate, you will get a NaN. if lirpa_model.solver_model.status == grb.GRB.Status.OPTIMAL: ub[0][i] = lirpa_model.solver_model.objVal # Solve lower bound. lirpa_model.solver_model.setObjective(solver_vars[i], grb.GRB.MINIMIZE) lirpa_model.solver_model.optimize() if lirpa_model.solver_model.status == grb.GRB.Status.OPTIMAL: lb[0][i] = lirpa_model.solver_model.objVal result[method] = (lb, ub) # Step 5: output the final results of each method. for method in result.keys(): print(f'Bounding method: {method}') lb, ub = result[method] for i in range(n_classes): print('f_{j}(x_0): {l:8.3f} <= f_{j}(x_0+delta) <= {u:8.3f}'.format( j=i, l=lb[0][i].item(), u=ub[0][i].item())) ================================================ FILE: examples/simple/toy.py ================================================ """ A toy example for bounding neural network outputs under input perturbations. """ import torch from collections import defaultdict from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import PerturbationLpNorm class simple_model(torch.nn.Module): """ A very simple 2-layer neural network for demonstration. 
""" def __init__(self): super().__init__() # Weights of linear layers. self.w1 = torch.tensor([[1., -1.], [2., -1.]]) self.w2 = torch.tensor([[1., -1.]]) def forward(self, x): # Linear layer. z1 = x.matmul(self.w1.t()) # Relu layer. hz1 = torch.nn.functional.relu(z1) # Linear layer. z2 = hz1.matmul(self.w2.t()) return z2 model = simple_model() # Input x. x = torch.tensor([[1., 1.]]) # Lowe and upper bounds of x. lower = torch.tensor([[-1., -2.]]) upper = torch.tensor([[2., 1.]]) # Wrap model with auto_LiRPA for bound computation. # The second parameter is for constructing the trace of the computational graph, # and its content is not important. lirpa_model = BoundedModule(model, torch.empty_like(x)) pred = lirpa_model(x) print(f'Model prediction: {pred.item()}') # Compute bounds using LiRPA using the given lower and upper bounds. norm = float("inf") ptb = PerturbationLpNorm(norm = norm, x_L=lower, x_U=upper) bounded_x = BoundedTensor(x, ptb) # Compute bounds. lb, ub = lirpa_model.compute_bounds(x=(bounded_x,), method='IBP') print(f'IBP bounds: lower={lb.item()}, upper={ub.item()}') lb, ub = lirpa_model.compute_bounds(x=(bounded_x,), method='CROWN') print(f'CROWN bounds: lower={lb.item()}, upper={ub.item()}') # Getting the linear bound coefficients (A matrix). required_A = defaultdict(set) required_A[lirpa_model.output_name[0]].add(lirpa_model.input_name[0]) lb, ub, A = lirpa_model.compute_bounds(x=(bounded_x,), method='CROWN', return_A=True, needed_A_dict=required_A) print('CROWN linear (symbolic) bounds: lA x + lbias <= f(x) <= uA x + ubias, where') print(A[lirpa_model.output_name[0]][lirpa_model.input_name[0]]) # Opimized bounds, which is tighter. lb, ub, A = lirpa_model.compute_bounds(x=(bounded_x,), method='alpha-CROWN', return_A=True, needed_A_dict=required_A) print(f'alpha-CROWN bounds: lower={lb.item()}, upper={ub.item()}') print('alpha-CROWN linear (symbolic) bounds: lA x + lbias <= f(x) <= uA x + ubias, where') print(A[lirpa_model.output_name[0]][lirpa_model.input_name[0]]) ================================================ FILE: examples/vision/.gitignore ================================================ exp exp_inv __pycache__ model_* !model_gurobi.py saved_models config ================================================ FILE: examples/vision/bound_option.py ================================================ """ A simple example for bounding neural network outputs with different bound options on ReLU activation functions. """ import os from collections import defaultdict import torch import torch.nn as nn import torchvision from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import PerturbationLpNorm from auto_LiRPA.utils import Flatten ## Step 1: Define computational graph by implementing forward() # This simple model comes from https://github.com/locuslab/convex_adversarial def mnist_model(): model = nn.Sequential( nn.Conv2d(1, 16, 4, stride=2, padding=1), nn.ReLU(), nn.Conv2d(16, 32, 4, stride=2, padding=1), nn.ReLU(), Flatten(), nn.Linear(32*7*7,100), nn.ReLU(), nn.Linear(100, 10) ) return model model = mnist_model() # Optionally, load the pretrained weights. 
checkpoint = torch.load( os.path.join(os.path.dirname(__file__), 'pretrained/mnist_a_adv.pth'), map_location=torch.device('cpu')) model.load_state_dict(checkpoint) ## Step 2: Prepare dataset as usual test_data = torchvision.datasets.MNIST( './data', train=False, download=True, transform=torchvision.transforms.ToTensor()) # For illustration we only use one image from dataset N = 1 n_classes = 10 image = test_data.data[:N].view(N,1,28,28) true_label = test_data.targets[:N] # Convert to float image = image.to(torch.float32) / 255.0 if torch.cuda.is_available(): image = image.cuda() model = model.cuda() ## Step 3: wrap model with auto_LiRPA # Use default bound_option lirpa_model_default = BoundedModule(model, torch.empty_like(image), device=image.device) # Use same-slope option for ReLU functions lirpa_model_sameslope = BoundedModule(model, torch.empty_like(image), device=image.device, bound_opts={'activation_bound_option': 'same-slope'}) print('Running on', image.device) ## Step 4: Compute bounds using LiRPA given a perturbation eps = 0.3 norm = float("inf") ptb = PerturbationLpNorm(norm = norm, eps = eps) image = BoundedTensor(image, ptb) # Get model prediction as usual pred = lirpa_model_default(image) label = torch.argmax(pred, dim=1).cpu().detach().numpy() print() print('Demonstration 1.1: Bound computation and comparisons of different options.') ## Step 5: Compute bounds for final output print('Bounding method:', 'backward (CROWN)') print('Bounding option:', 'Default (adaptive)') lb, ub = lirpa_model_default.compute_bounds(x=(image,), method='backward') for i in range(N): print(f'Image {i} top-1 prediction {label[i]} ground-truth {true_label[i]}') for j in range(n_classes): indicator = '(ground-truth)' if j == true_label[i] else '' print('f_{j}(x_0): {l:8.3f} <= f_{j}(x_0+delta) <= {u:8.3f} {ind}'.format( j=j, l=lb[i][j].item(), u=ub[i][j].item(), ind=indicator)) print() print('Bounding option:', 'same-slope') lb, ub = lirpa_model_sameslope.compute_bounds(x=(image,), method='backward') for i in range(N): print(f'Image {i} top-1 prediction {label[i]} ground-truth {true_label[i]}') for j in range(n_classes): indicator = '(ground-truth)' if j == true_label[i] else '' print('f_{j}(x_0): {l:8.3f} <= f_{j}(x_0+delta) <= {u:8.3f} {ind}'.format( j=j, l=lb[i][j].item(), u=ub[i][j].item(), ind=indicator)) print() print('Demonbstration 1.2: same-slope option is also available with CROWN-Optimized') print('Bounding method:', 'CROWN-Optimized (alpha-CROWN)') print('Bounding option:', 'Default (adaptive)') lb, ub = lirpa_model_default.compute_bounds(x=(image,), method='CROWN-Optimized') for i in range(N): print(f'Image {i} top-1 prediction {label[i]} ground-truth {true_label[i]}') for j in range(n_classes): indicator = '(ground-truth)' if j == true_label[i] else '' print('f_{j}(x_0): {l:8.3f} <= f_{j}(x_0+delta) <= {u:8.3f} {ind}'.format( j=j, l=lb[i][j].item(), u=ub[i][j].item(), ind=indicator)) print() print('Bounding option:', 'same-slope') lb, ub = lirpa_model_sameslope.compute_bounds(x=(image,), method='CROWN-Optimized') for i in range(N): print(f'Image {i} top-1 prediction {label[i]} ground-truth {true_label[i]}') for j in range(n_classes): indicator = '(ground-truth)' if j == true_label[i] else '' print('f_{j}(x_0): {l:8.3f} <= f_{j}(x_0+delta) <= {u:8.3f} {ind}'.format( j=j, l=lb[i][j].item(), u=ub[i][j].item(), ind=indicator)) print() print('Demonstration 2: Obtaining linear coefficients of the lower and upper bounds.') print('With same-slope option, two linear coefficients should be the 
same.') # There are many bound coefficients during CROWN bound calculation; here we are interested in the linear bounds # of the output layer, with respect to the input layer (the image). required_A = defaultdict(set) required_A[lirpa_model_sameslope.output_name[0]].add(lirpa_model_sameslope.input_name[0]) print("Bounding method:", 'backward') print("Bounding option:", 'same-slope') lb, ub, A_dict = lirpa_model_sameslope.compute_bounds(x=(image,), method='backward', return_A=True, needed_A_dict=required_A) lower_A, lower_bias = A_dict[lirpa_model_sameslope.output_name[0]][lirpa_model_sameslope.input_name[0]]['lA'], A_dict[lirpa_model_sameslope.output_name[0]][lirpa_model_sameslope.input_name[0]]['lbias'] upper_A, upper_bias = A_dict[lirpa_model_sameslope.output_name[0]][lirpa_model_sameslope.input_name[0]]['uA'], A_dict[lirpa_model_sameslope.output_name[0]][lirpa_model_sameslope.input_name[0]]['ubias'] print(f'lower bound linear coefficients size (batch, output_dim, *input_dims): {list(lower_A.size())}') print(f'lower bound bias term size (batch, output_dim): {list(lower_bias.size())}') print(f'upper bound linear coefficients size (batch, output_dim, *input_dims): {list(upper_A.size())}') print(f'upper bound bias term size (batch, output_dim): {list(upper_bias.size())}') print() print(f'lower bound linear coefficients should be the same as upper bound linear coefficients: {(lower_A - upper_A).abs().max() < 1e-5}') print() ================================================ FILE: examples/vision/cifar_training.py ================================================ import argparse import multiprocessing import random import time import logging import os import torch.optim as optim import torchvision.datasets as datasets import torchvision.transforms as transforms from torch.nn import CrossEntropyLoss import models from auto_LiRPA import BoundedModule, BoundedTensor, BoundDataParallel, CrossEntropyWrapper from auto_LiRPA.bound_ops import BoundExp from auto_LiRPA.eps_scheduler import LinearScheduler, SmoothedScheduler, AdaptiveScheduler, FixedScheduler from auto_LiRPA.perturbations import * from auto_LiRPA.utils import MultiAverageMeter, logger, get_spec_matrix, sync_params def get_exp_module(bounded_module): for _, node in bounded_module.named_modules(): # Find the Exp neuron in computational graph if isinstance(node, BoundExp): return node return None parser = argparse.ArgumentParser() parser.add_argument("--verify", action="store_true", help='verification mode, do not train') parser.add_argument("--no_loss_fusion", action="store_true", help='without loss fusion, slower training mode') parser.add_argument("--load", type=str, default="", help='Load pretrained model') parser.add_argument("--device", type=str, default="cuda", choices=["cpu", "cuda"], help='use cpu or cuda') parser.add_argument("--data", type=str, default="CIFAR", choices=["MNIST", "CIFAR"], help='dataset') parser.add_argument("--seed", type=int, default=100, help='random seed') parser.add_argument("--eps", type=float, default=8.8/255, help='Target training epsilon') parser.add_argument("--norm", type=float, default='inf', help='p norm for epsilon perturbation') parser.add_argument("--bound_type", type=str, default="CROWN-IBP", choices=["IBP", "CROWN-IBP", "CROWN"], help='method of bound analysis') parser.add_argument("--model", type=str, default="cnn_7layer_bn", help='model name (Densenet_cifar_32, resnet18, ResNeXt_cifar, MobileNet_cifar, wide_resnet_cifar_bn_wo_pooling)') parser.add_argument("--num_epochs", type=int, default=2000, 
help='number of total epochs') parser.add_argument("--batch_size", type=int, default=256, help='batch size') parser.add_argument("--lr", type=float, default=5e-4, help='learning rate') parser.add_argument("--lr_decay_rate", type=float, default=0.1, help='learning rate decay rate') parser.add_argument("--lr_decay_milestones", nargs='+', type=int, default=[1400, 1700], help='learning rate dacay milestones') parser.add_argument("--scheduler_name", type=str, default="SmoothedScheduler", choices=["LinearScheduler", "SmoothedScheduler"], help='epsilon scheduler') parser.add_argument("--scheduler_opts", type=str, default="start=101,length=801,mid=0.4", help='options for epsilon scheduler') parser.add_argument("--bound_opts", type=str, default=None, choices=["same-slope", "zero-lb", "one-lb"], help='bound options for relu') parser.add_argument('--clip_grad_norm', type=float, default=8.0) args = parser.parse_args() exp_name = args.model + '_b' + str(args.batch_size) + '_' + str(args.bound_type) + '_epoch' + str(args.num_epochs) + '_' + args.scheduler_opts + '_' + str(args.eps)[:6] os.makedirs('saved_models/', exist_ok=True) log_file = f'saved_models/{exp_name}{"_test" if args.verify else ""}.log' file_handler = logging.FileHandler(log_file) logger.addHandler(file_handler) def Train(model, t, loader, eps_scheduler, norm, train, opt, bound_type, method='robust', loss_fusion=True, final_node_name=None): num_class = 10 meter = MultiAverageMeter() if train: model.train() eps_scheduler.train() eps_scheduler.step_epoch() eps_scheduler.set_epoch_length(int((len(loader.dataset) + loader.batch_size - 1) / loader.batch_size)) else: model.eval() eps_scheduler.eval() exp_module = get_exp_module(model) def get_bound_loss(x=None, c=None): if loss_fusion: bound_lower, bound_upper = False, True else: bound_lower, bound_upper = True, False if bound_type == 'IBP': lb, ub = model(method_opt="compute_bounds", x=x, IBP=True, C=c, method=None, final_node_name=final_node_name, no_replicas=True) elif bound_type == 'CROWN': lb, ub = model(method_opt="compute_bounds", x=x, IBP=False, C=c, method='backward', bound_lower=bound_lower, bound_upper=bound_upper) elif bound_type == 'CROWN-IBP': # lb, ub = model.compute_bounds(ptb=ptb, IBP=True, x=data, C=c, method='backward') # pure IBP bound # we use a mixed IBP and CROWN-IBP bounds, leading to better performance (Zhang et al., ICLR 2020) factor = (eps_scheduler.get_max_eps() - eps_scheduler.get_eps()) / eps_scheduler.get_max_eps() ilb, iub = model(method_opt="compute_bounds", x=x, IBP=True, C=c, method=None, final_node_name=final_node_name, no_replicas=True) if factor < 1e-50: lb, ub = ilb, iub else: clb, cub = model(method_opt="compute_bounds", IBP=False, C=c, method='backward', bound_lower=bound_lower, bound_upper=bound_upper, final_node_name=final_node_name, no_replicas=True) if loss_fusion: ub = cub * factor + iub * (1 - factor) else: lb = clb * factor + ilb * (1 - factor) if loss_fusion: if isinstance(model, BoundDataParallel): max_input = model(get_property=True, node_class=BoundExp, att_name='max_input') else: max_input = exp_module.max_input return None, torch.mean(torch.log(ub) + max_input) else: # Pad zero at the beginning for each example, and use fake label '0' for all examples lb_padded = torch.cat((torch.zeros(size=(lb.size(0), 1), dtype=lb.dtype, device=lb.device), lb), dim=1) fake_labels = torch.zeros(size=(lb.size(0),), dtype=torch.int64, device=lb.device) robust_ce = CrossEntropyLoss()(-lb_padded, fake_labels) return lb, robust_ce for i, (data, labels) in 
enumerate(loader): start = time.time() eps_scheduler.step_batch() eps = eps_scheduler.get_eps() # For small eps just use natural training, no need to compute LiRPA bounds batch_method = method if eps < 1e-50: batch_method = "natural" if train: opt.zero_grad() # bounded input is used only for the Linf norm if norm == np.inf: data_max = torch.reshape((1. - loader.mean) / loader.std, (1, -1, 1, 1)) data_min = torch.reshape((0. - loader.mean) / loader.std, (1, -1, 1, 1)) data_ub = torch.min(data + (eps / loader.std).view(1,-1,1,1), data_max) data_lb = torch.max(data - (eps / loader.std).view(1,-1,1,1), data_min) else: data_ub = data_lb = data if list(model.parameters())[0].is_cuda: data, labels = data.cuda(), labels.cuda() data_lb, data_ub = data_lb.cuda(), data_ub.cuda() ptb = PerturbationLpNorm(norm=norm, eps=eps, x_L=data_lb, x_U=data_ub) x = BoundedTensor(data, ptb) if loss_fusion: if batch_method == 'natural' or not train: output = model(x, labels) # , disable_multi_gpu=True regular_ce = torch.mean(torch.log(output)) else: model(x, labels) regular_ce = torch.tensor(0., device=data.device) meter.update('CE', regular_ce.item(), x.size(0)) x = (x, labels) c = None else: # Generate specification matrix (when loss fusion is not used). c = get_spec_matrix(data, labels, num_class) x = (x,) if final_node_name is None else (x, labels) output = model(x, final_node_name=final_node_name) regular_ce = CrossEntropyLoss()(output, labels) # regular CrossEntropyLoss used for warming up meter.update('CE', regular_ce.item(), x[0].size(0)) meter.update('Err', torch.sum(torch.argmax(output, dim=1) != labels).item() / x[0].size(0), x[0].size(0)) if batch_method == 'robust': lb, robust_ce = get_bound_loss(x=x, c=c) loss = robust_ce elif batch_method == 'natural': loss = regular_ce if train: loss.backward() if args.clip_grad_norm: grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad_norm) meter.update('grad_norm', grad_norm) if isinstance(eps_scheduler, AdaptiveScheduler): eps_scheduler.update_loss(loss.item() - regular_ce.item()) opt.step() meter.update('Loss', loss.item(), data.size(0)) if batch_method != 'natural': meter.update('Robust_CE', robust_ce.item(), data.size(0)) if not loss_fusion: # For an example, if the lower bounds of margins are > 0 for all classes, the output is verifiably correct.
# If any margin is < 0 this example is counted as an error meter.update('Verified_Err', torch.sum((lb < 0).any(dim=1)).item() / data.size(0), data.size(0)) meter.update('Time', time.time() - start) if (i + 1) % 50 == 0 and train: logger.info('[{:2d}:{:4d}]: eps={:.12f} {}'.format(t, i + 1, eps, meter)) logger.info('[{:2d}:{:4d}]: eps={:.12f} {}'.format(t, i + 1, eps, meter)) return meter def main(args): torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) random.seed(args.seed) np.random.seed(args.seed) ## Step 1: Initial original model as usual, see model details in models/example_feedforward.py and models/example_resnet.py if args.data == 'MNIST': model_ori = models.Models[args.model](in_ch=1, in_dim=28) else: model_ori = models.Models[args.model](in_ch=3, in_dim=32) epoch = 0 if args.load: checkpoint = torch.load(args.load) epoch, state_dict = checkpoint['epoch'], checkpoint['state_dict'] opt_state = None try: opt_state = checkpoint['optimizer'] except KeyError: print('no opt_state found') for k, v in state_dict.items(): assert torch.isnan(v).any().cpu().numpy() == 0 and torch.isinf(v).any().cpu().numpy() == 0 model_ori.load_state_dict(state_dict) logger.info('Checkpoint loaded: {}'.format(args.load)) ## Step 2: Prepare dataset as usual if args.data == 'MNIST': dummy_input = torch.randn(2, 1, 28, 28) train_data = datasets.MNIST("./data", train=True, download=True, transform=transforms.ToTensor()) test_data = datasets.MNIST("./data", train=False, download=True, transform=transforms.ToTensor()) elif args.data == 'CIFAR': dummy_input = torch.randn(2, 3, 32, 32) normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]) train_data = datasets.CIFAR10("./data", train=True, download=True, transform=transforms.Compose([ transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, 4, padding_mode='edge'), transforms.ToTensor(), normalize])) test_data = datasets.CIFAR10("./data", train=False, download=True, transform=transforms.Compose([transforms.ToTensor(), normalize])) train_data = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=min(multiprocessing.cpu_count(),4)) test_data = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size//2, pin_memory=True, num_workers=min(multiprocessing.cpu_count(),4)) if args.data == 'MNIST': train_data.mean = test_data.mean = torch.tensor([0.0]) train_data.std = test_data.std = torch.tensor([1.0]) elif args.data == 'CIFAR': train_data.mean = test_data.mean = torch.tensor([0.4914, 0.4822, 0.4465]) train_data.std = test_data.std = torch.tensor([0.2023, 0.1994, 0.2010]) ## Step 3: wrap model with auto_LiRPA # The second parameter dummy_input is for constructing the trace of the computational graph. 
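    # Only the shape and dtype of dummy_input matter for tracing; its values are not used
    # when bounds are computed later (it was created above as torch.randn(2, 1, 28, 28)
    # for MNIST or torch.randn(2, 3, 32, 32) for CIFAR).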
model = BoundedModule(model_ori, dummy_input, bound_opts={'activation_bound_option':args.bound_opts}, device=args.device) final_name1 = model.final_name model_loss = BoundedModule(CrossEntropyWrapper(model_ori), (dummy_input, torch.zeros(1, dtype=torch.long)), bound_opts={'activation_bound_option': args.bound_opts, 'loss_fusion': True}, device=args.device) # after CrossEntropyWrapper, the final name will change because of one additional input node in CrossEntropyWrapper final_name2 = model_loss._modules[final_name1].output_name[0] assert type(model._modules[final_name1]) == type(model_loss._modules[final_name2]) if args.no_loss_fusion: model_loss = BoundedModule(model_ori, dummy_input, bound_opts={'activation_bound_option':args.bound_opts}, device=args.device) final_name2 = None model_loss = BoundDataParallel(model_loss) ## Step 4 prepare optimizer, epsilon scheduler and learning rate scheduler opt = optim.Adam(model_loss.parameters(), lr=args.lr) norm = float(args.norm) lr_scheduler = optim.lr_scheduler.MultiStepLR(opt, milestones=args.lr_decay_milestones, gamma=args.lr_decay_rate) eps_scheduler = eval(args.scheduler_name)(args.eps, args.scheduler_opts) logger.info(str(model_ori)) # skip epochs if epoch > 0: epoch_length = int((len(train_data.dataset) + train_data.batch_size - 1) / train_data.batch_size) eps_scheduler.set_epoch_length(epoch_length) eps_scheduler.train() for i in range(epoch): lr_scheduler.step() eps_scheduler.step_epoch(verbose=True) for j in range(epoch_length): eps_scheduler.step_batch() logger.info('resume from eps={:.12f}'.format(eps_scheduler.get_eps())) if args.load: if opt_state: opt.load_state_dict(opt_state) logger.info('resume opt_state') ## Step 5: start training if args.verify: eps_scheduler = FixedScheduler(args.eps) with torch.no_grad(): Train(model, 1, test_data, eps_scheduler, norm, False, None, 'IBP', loss_fusion=False, final_node_name=None) else: timer = 0.0 best_err = 1e10 for t in range(epoch + 1, args.num_epochs+1): logger.info("Epoch {}, learning rate {}".format(t, lr_scheduler.get_last_lr())) start_time = time.time() Train(model_loss, t, train_data, eps_scheduler, norm, True, opt, args.bound_type, loss_fusion=not args.no_loss_fusion) lr_scheduler.step() epoch_time = time.time() - start_time timer += epoch_time logger.info('Epoch time: {:.4f}, Total time: {:.4f}'.format(epoch_time, timer)) logger.info("Evaluating...") torch.cuda.empty_cache() state_dict = sync_params(model_ori, model_loss, loss_fusion=True) with torch.no_grad(): if t > int(eps_scheduler.params['start']) + int(eps_scheduler.params['length']): m = Train(model_loss, t, test_data, FixedScheduler(8./255), norm, False, None, 'IBP', loss_fusion=False, final_node_name=final_name2) else: m = Train(model_loss, t, test_data, eps_scheduler, norm, False, None, 'IBP', loss_fusion=False, final_node_name=final_name2) save_dict = {'state_dict': state_dict, 'epoch': t, 'optimizer': opt.state_dict()} if t < int(eps_scheduler.params['start']): torch.save(save_dict, 'saved_models/natural_' + exp_name) elif t > int(eps_scheduler.params['start']) + int(eps_scheduler.params['length']): current_err = m.avg('Verified_Err') if current_err < best_err: best_err = current_err torch.save(save_dict, 'saved_models/' + exp_name + '_best_' + str(best_err)[:6]) else: torch.save(save_dict, 'saved_models/' + exp_name) else: torch.save(save_dict, 'saved_models/' + exp_name) torch.cuda.empty_cache() if __name__ == "__main__": logger.info(args) main(args) ================================================ FILE: 
examples/vision/custom_op.py ================================================ """ An example for custom operators. In this example, we create a custom operator called "PlusConstant", which can be written as "f(x) = x + c" for some constant "c" (an attribute of the operator). """ import torch import torch.nn as nn import torchvision from auto_LiRPA import BoundedModule, BoundedTensor, register_custom_op from auto_LiRPA.operators import Bound from auto_LiRPA.perturbations import PerturbationLpNorm from auto_LiRPA.utils import Flatten """ Step 1: Define a `torch.autograd.Function` class to declare and implement the computation of the operator. """ class PlusConstantOp(torch.autograd.Function): @staticmethod def symbolic(g, x, const): """ In this function, define the arguments and attributes of the operator. "custom::PlusConstant" is the name of the new operator, "x" is an argument of the operator, "const_i" is an attribute which stands for "c" in the operator. There can be multiple arguments and attributes. For attribute naming, use a suffix such as "_i" to specify the data type, where "_i" stands for integer, "_t" stands for tensor, "_f" stands for float, etc. """ return g.op('custom::PlusConstant', x, const_i=const) @staticmethod def forward(ctx, x, const): """ In this function, implement the computation for the operator, i.e., f(x) = x + c in this case. """ return x + const """ Step 2: Define a `torch.nn.Module` class to declare a module using the defined custom operator. """ class PlusConstant(nn.Module): def __init__(self, const=1): super().__init__() self.const = const def forward(self, x): """ Use `PlusConstantOp.apply` to call the defined custom operator. """ return PlusConstantOp.apply(x, self.const) """ Step 3: Implement a Bound class to support bound computation for the new operator. """ class BoundPlusConstant(Bound): def __init__(self, attr, inputs, output_index, options): """ `const` is an attribute and can be obtained from the dict `attr` """ super().__init__(attr, inputs, output_index, options) self.const = attr['const'] def forward(self, x): return x + self.const def bound_backward(self, last_lA, last_uA, x, *args, **kwargs): """ Backward mode bound propagation """ print('Calling bound_backward for custom::PlusConstant') def _bound_oneside(last_A): # If last_lA or last_uA is None, it means lower or upper bound # is not required, so we simply return None. if last_A is None: return None, 0 # The function f(x) = x + c is a linear function with coefficient 1. # Then A · f(x) = A · (x + c) = A · x + A · c. # Thus the new A matrix is the same as the last A matrix: A = last_A # For bias, compute A · c and reduce the dimensions by sum: bias = last_A.sum(dim=list(range(2, last_A.ndim))) * self.const return A, bias lA, lbias = _bound_oneside(last_lA) uA, ubias = _bound_oneside(last_uA) return [(lA, uA)], lbias, ubias def interval_propagate(self, *v): """ IBP computation """ print('Calling interval_propagate for custom::PlusConstant') # Interval bound of the input h_L, h_U = v[0] # Since this function is monotonic, we can get the lower bound and upper bound # by applying the function on h_L and h_U respectively.
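        # For example, with const = 1 and an input interval element [-1, 2], the output
        # interval element is [-1 + 1, 2 + 1] = [0, 3]; since f is monotonic, interval
        # endpoints map to interval endpoints.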
lower = h_L + self.const upper = h_U + self.const return lower, upper """ Step 4: Register the custom operator """ register_custom_op("custom::PlusConstant", BoundPlusConstant) # Use the `PlusConstant` module in model definition model = nn.Sequential( Flatten(), nn.Linear(28 * 28, 256), PlusConstant(const=1), nn.Linear(256, 10), ) print("Model:", model) test_data = torchvision.datasets.MNIST("./data", train=False, download=True, transform=torchvision.transforms.ToTensor()) N = 1 n_classes = 10 image = test_data.data[:N].view(N,1,28,28) true_label = test_data.targets[:N] image = image.to(torch.float32) / 255.0 if torch.cuda.is_available(): image = image.cuda() model = model.cuda() lirpa_model = BoundedModule(model, torch.empty_like(image), device=image.device) eps = 0.3 norm = float("inf") ptb = PerturbationLpNorm(norm = norm, eps = eps) image = BoundedTensor(image, ptb) pred = lirpa_model(image) label = torch.argmax(pred, dim=1).cpu().detach().numpy() for method in ['IBP', 'IBP+backward (CROWN-IBP)', 'backward (CROWN)']: print("Bounding method:", method) lb, ub = lirpa_model.compute_bounds(x=(image,), method=method.split()[0]) for i in range(N): print("Image {} top-1 prediction {} ground-truth {}".format(i, label[i], true_label[i])) for j in range(n_classes): indicator = '(ground-truth)' if j == true_label[i] else '' print("f_{j}(x_0): {l:8.3f} <= f_{j}(x_0+delta) <= {u:8.3f} {ind}".format( j=j, l=lb[i][j].item(), u=ub[i][j].item(), ind=indicator)) print() ================================================ FILE: examples/vision/data/.gitignore ================================================ MNIST cifar* ================================================ FILE: examples/vision/data/ImageNet64/imagenet_data_loader.py ================================================ import os import numpy as np from PIL import Image class DatasetDownsampledImageNet(): def __init__(self): # self.data_path = data_path os.mkdir('train') os.mkdir('test') for i in range(1000): os.mkdir('train/' + str(i)) os.mkdir('test/' + str(i)) print(i) self.load_data('raw_data/Imagenet64_train_npz', count=0, fname='train/') self.load_data('raw_data/Imagenet64_val_npz', count=1e8, fname='test/') def load_data(self, data_path, img_size=64, count=0., fname=''): files = os.listdir(data_path) img_size2 = img_size * img_size # count = 0 # 1e8 # test data start with 1 for file in files: f = np.load(data_path + '/' + file) x = np.array(f['data']) y = np.array(f['labels']) - 1 x = np.dstack((x[:, :img_size2], x[:, img_size2:2 * img_size2], x[:, 2 * img_size2:])) x = x.reshape((x.shape[0], img_size, img_size, 3)) for i, img in enumerate(x): img = Image.fromarray(img.reshape(img_size, img_size, 3)) name = str(int(count)).zfill(9) label = str(y[i]) print(count, fname + label + '/' + name + '_label_' + label.zfill(4) + '.png') # img.show() img.save(fname + label + '/' + name + '_label_' + label.zfill(4) + '.png') count += 1 if __name__ == "__main__": DatasetDownsampledImageNet() ================================================ FILE: examples/vision/data/tinyImageNet/.gitignore ================================================ tiny-imagenet-200* ================================================ FILE: examples/vision/data/tinyImageNet/tinyimagenet_download.sh ================================================ #!/bin/bash # download and unzip dataset wget http://cs231n.stanford.edu/tiny-imagenet-200.zip unzip tiny-imagenet-200.zip current="$(pwd)/tiny-imagenet-200" # training data echo "preparing training data..." 
cd $current/train for DIR in $(ls); do cd $DIR rm *.txt mv images/* . rm -r images cd .. done # validation data echo "preparing validation data..." cd $current/val annotate_file="val_annotations.txt" length=$(cat $annotate_file | wc -l) for i in $(seq 1 $length); do # fetch i th line line=$(sed -n ${i}p $annotate_file) # get file name and directory name file=$(echo $line | cut -f1 -d" " ) directory=$(echo $line | cut -f2 -d" ") mkdir -p $directory mv images/$file $directory done rm -r images echo "done" ================================================ FILE: examples/vision/datasets.py ================================================ import multiprocessing import torch from torch.utils import data from functools import partial import torchvision.transforms as transforms import torchvision.datasets as datasets # compute image statistics (by Andreas https://discuss.pytorch.org/t/computing-the-mean-and-std-of-dataset/34949/4) def get_stats(loader): mean = 0.0 for images, _ in loader: batch_samples = images.size(0) reshaped_img = images.view(batch_samples, images.size(1), -1) mean += reshaped_img.mean(2).sum(0) w = images.size(2) h = images.size(3) mean = mean / len(loader.dataset) var = 0.0 for images, _ in loader: batch_samples = images.size(0) images = images.view(batch_samples, images.size(1), -1) var += ((images - mean.unsqueeze(1))**2).sum([0,2]) std = torch.sqrt(var / (len(loader.dataset)*w*h)) return mean, std # load MNIST of Fashion-MNIST def mnist_loaders(dataset, batch_size, shuffle_train = True, shuffle_test = False, ratio=None, test_batch_size=None): # Use the AWS mirror and avoid the yann.lecun.com mirror. dataset.mirrors = [ 'https://ossci-datasets.s3.amazonaws.com/mnist/', ] mnist_train = dataset("./data", train=True, download=True, transform=transforms.ToTensor()) mnist_test = dataset("./data", train=False, download=True, transform=transforms.ToTensor()) if ratio is not None: # only sample in training data num_of_each_class_train = int(len(mnist_train) // 10 * ratio) # num_of_each_class_test = int(len(mnist_test)//10*ratio) class_idx_train = [(mnist_train.targets == _).nonzero().numpy().squeeze() for _ in range(10)] # class_idx_test = [(mnist_test.targets==_).nonzero().numpy().squeeze() for _ in range(10)] for i in range(len(class_idx_train)): class_idx_train[i] = class_idx_train[i][:num_of_each_class_train] # class_idx_test[i] = class_idx_test[i][:num_of_each_class_test] mnist_train = data.Subset(mnist_train, [y for z in class_idx_train for y in z]) train_loader = torch.utils.data.DataLoader(mnist_train, batch_size=batch_size, shuffle=shuffle_train, pin_memory=True, num_workers=min(multiprocessing.cpu_count(),2)) if test_batch_size: batch_size = test_batch_size test_loader = torch.utils.data.DataLoader(mnist_test, batch_size=batch_size, shuffle=shuffle_test, pin_memory=True, num_workers=min(multiprocessing.cpu_count(),2)) std = [1.0] train_loader.std = std test_loader.std = std return train_loader, test_loader def cifar_loaders(batch_size, shuffle_train = True, shuffle_test = False, train_random_transform = False, normalize_input = False, num_examples = None, test_batch_size=None): if normalize_input: std = [0.2023, 0.1994, 0.2010] normalize = transforms.Normalize(mean = [0.4914, 0.4822, 0.4465], std = std) else: std = [1.0, 1.0, 1.0] normalize = transforms.Normalize(mean=[0, 0, 0], std=std) if train_random_transform: if normalize_input: train = datasets.CIFAR10('./data', train=True, download=True, transform=transforms.Compose([ transforms.RandomHorizontalFlip(), 
transforms.RandomCrop(32, 4), transforms.ToTensor(), normalize, ])) else: train = datasets.CIFAR10('./data', train=True, download=True, transform=transforms.Compose([ transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, 4), transforms.ToTensor(), ])) else: train = datasets.CIFAR10('./data', train=True, download=True, transform=transforms.Compose([transforms.ToTensor(),normalize])) test = datasets.CIFAR10('./data', train=False, transform=transforms.Compose([transforms.ToTensor(), normalize])) if num_examples: indices = list(range(num_examples)) train = data.Subset(train, indices) test = data.Subset(test, indices) train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=shuffle_train, pin_memory=True, num_workers=min(multiprocessing.cpu_count(),6)) if test_batch_size: batch_size = test_batch_size test_loader = torch.utils.data.DataLoader(test, batch_size=max(batch_size, 1), shuffle=shuffle_test, pin_memory=True, num_workers=min(multiprocessing.cpu_count(),6)) train_loader.std = std test_loader.std = std return train_loader, test_loader def svhn_loaders(batch_size, shuffle_train = True, shuffle_test = False, train_random_transform = False, normalize_input = False, num_examples = None, test_batch_size=None): if normalize_input: mean = [0.43768206, 0.44376972, 0.47280434] std = [0.19803014, 0.20101564, 0.19703615] normalize = transforms.Normalize(mean = mean, std = std) else: std = [1.0, 1.0, 1.0] normalize = transforms.Normalize(mean=[0, 0, 0], std=std) if train_random_transform: if normalize_input: train = datasets.SVHN('./data', split='train', download=True, transform=transforms.Compose([ transforms.RandomCrop(32, 4), transforms.ToTensor(), normalize, ])) else: train = datasets.SVHN('./data', split='train', download=True, transform=transforms.Compose([ transforms.RandomCrop(32, 4), transforms.ToTensor(), ])) else: train = datasets.SVHN('./data', split='train', download=True, transform=transforms.Compose([transforms.ToTensor(),normalize])) test = datasets.SVHN('./data', split='test', download=True, transform=transforms.Compose([transforms.ToTensor(), normalize])) if num_examples: indices = list(range(num_examples)) train = data.Subset(train, indices) test = data.Subset(test, indices) train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=shuffle_train, pin_memory=True, num_workers=min(multiprocessing.cpu_count(),6)) if test_batch_size: batch_size = test_batch_size test_loader = torch.utils.data.DataLoader(test, batch_size=max(batch_size, 1), shuffle=shuffle_test, pin_memory=True, num_workers=min(multiprocessing.cpu_count(),6)) train_loader.std = std test_loader.std = std mean, std = get_stats(train_loader) print('dataset mean = ', mean.numpy(), 'std = ', std.numpy()) return train_loader, test_loader def load_data(data, batch_size): if data == 'MNIST': dummy_input = torch.randn(1, 1, 28, 28) train_data = datasets.MNIST('./data', train=True, download=True, transform=transforms.ToTensor()) test_data = datasets.MNIST('./data', train=False, download=True, transform=transforms.ToTensor()) elif data == 'CIFAR': dummy_input = torch.randn(1, 3, 32, 32) normalize = transforms.Normalize(mean = [0.4914, 0.4822, 0.4465], std = [0.2023, 0.1994, 0.2010]) train_data = datasets.CIFAR10('./data', train=True, download=True, transform=transforms.Compose([ transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, 4, padding_mode='edge'), transforms.ToTensor(), normalize])) test_data = datasets.CIFAR10('./data', train=False, download=True, 
transform=transforms.Compose([transforms.ToTensor(), normalize])) train_data = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=min(multiprocessing.cpu_count(),4)) test_data = torch.utils.data.DataLoader(test_data, batch_size=batch_size, pin_memory=True, num_workers=min(multiprocessing.cpu_count(),4)) if data == 'MNIST': train_data.mean = test_data.mean = torch.tensor([0.0]) train_data.std = test_data.std = torch.tensor([1.0]) elif data == 'CIFAR': train_data.mean = test_data.mean = torch.tensor([0.4914, 0.4822, 0.4465]) train_data.std = test_data.std = torch.tensor([0.2023, 0.1994, 0.2010]) return dummy_input, train_data, test_data # when new loaders are added, they must be registered here loaders = { "MNIST": partial(mnist_loaders, datasets.MNIST), "FashionMNIST": partial(mnist_loaders, datasets.FashionMNIST), "CIFAR": cifar_loaders, "svhn": svhn_loaders, } ================================================ FILE: examples/vision/efficient_convolution.py ================================================ """ Demonstration of efficient convolutional network implementation in auto_LiRPA. The auto_LiRPA library supports an efficient algorithm for computing bounds for convolutional networks. The "patches" mode implementation makes full backward bounds (CROWN) for convolutional layers significantly faster by using more efficient GPU operators. The convolution mode can be set by the "conv_mode" key in the bound_opts parameter when constructing your BoundedModule object, and the new "patches" mode is enabled by default. In this example we show the differences between "patches" mode and the old "matrix" mode in memory consumption, on a relatively large ResNet network. """ import sys import torch import random import numpy as np import torchvision from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * import models device = 'cpu' if torch.cuda.is_available(): device = 'cuda' conv_mode = sys.argv[1] if len(sys.argv) > 1 else 'patches' # conv_mode can be set as 'matrix' or 'patches' seed = 1234 torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) random.seed(seed) np.random.seed(seed) ## Step 1: Define the model # model_ori = models.model_resnet(width=1, mult=4) # model_ori = models.ResNet18(in_planes=2) # model_ori = models.vnncomp_resnet2b() model_ori = models.vnncomp_resnet4b() model_ori = model_ori.to(device=device) ## Step 2: Prepare dataset as usual. # test_data = torchvision.datasets.MNIST("./data", train=False, download=True, transform=torchvision.transforms.ToTensor()) normalize = torchvision.transforms.Normalize(mean = [0.4914, 0.4822, 0.4465], std = [0.2023, 0.1994, 0.2010]) test_data = torchvision.datasets.CIFAR10("./data", train=False, download=True, transform=torchvision.transforms.Compose([torchvision.transforms.ToTensor(), normalize])) # For illustration we only use 1 image from dataset N = 1 n_classes = 10 image = torch.Tensor(test_data.data[:N]).reshape(N,3,32,32) # Convert to float between 0. and 1. image = image.to(torch.float32) / 255.0 if device == 'cuda': image = image.cuda() ## Step 3: wrap model with auto_LiRPA. # The second parameter is for constructing the trace of the computational graph, and its content is not important. # The new "patches" conv_mode provides a more efficient implementation for convolutional neural networks.
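# (Editorial sketch, not part of the original example: one way to quantify the memory
# difference between the 'patches' and 'matrix' modes is to reset and read PyTorch's peak
# CUDA memory counters around compute_bounds(). The helper name below is hypothetical.)
def report_peak_gpu_memory(tag):
    # Print the peak GPU memory allocated since the last reset; a no-op on CPU.
    if torch.cuda.is_available():
        peak_mib = torch.cuda.max_memory_allocated() / 2**20
        print('[{}] peak GPU memory: {:.1f} MiB'.format(tag, peak_mib))
# Illustrative usage: call torch.cuda.reset_peak_memory_stats() before
# model.compute_bounds(...) below, then report_peak_gpu_memory(conv_mode) afterwards.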
model = BoundedModule(model_ori, image, bound_opts={"conv_mode": conv_mode}, device=device) ## Step 4: Compute bounds using LiRPA given a perturbation eps = 0.1 norm = 2 ptb = PerturbationLpNorm(norm = norm, eps = eps) image = BoundedTensor(image, ptb) # Get model prediction as usual pred = model(image) # Compute bounds if device == 'cuda': torch.cuda.empty_cache() print('Using {} mode to compute convolution.'.format(conv_mode)) lb, ub = model.compute_bounds(IBP=False, C=None, method='backward') ## Step 5: Final output # pred = pred.detach().cpu().numpy() lb = lb.detach().cpu().numpy() ub = ub.detach().cpu().numpy() for i in range(N): # print("Image {} top-1 prediction {}".format(i, label[i])) for j in range(n_classes): print("f_{j}(x_0): {l:8.5f} <= f_{j}(x_0+delta) <= {u:8.5f}".format(j=j, l=lb[i][j], u=ub[i][j])) print() # Print the GPU memory usage print('Memory usage in "{}" mode:'.format(conv_mode)) if device == 'cuda': print(torch.cuda.memory_summary()) ================================================ FILE: examples/vision/imagenet_training.py ================================================ import random import time import argparse import multiprocessing import logging import torch.optim as optim from torch.nn import CrossEntropyLoss from auto_LiRPA import BoundedModule, BoundedTensor, BoundDataParallel, CrossEntropyWrapper from auto_LiRPA.bound_ops import BoundExp from auto_LiRPA.perturbations import * from auto_LiRPA.utils import MultiAverageMeter, logger, get_spec_matrix, sync_params import models import torchvision.datasets as datasets import torchvision.transforms as transforms from auto_LiRPA.eps_scheduler import * def get_exp_module(bounded_module): for _, node in bounded_module.named_modules(): # Find the Exp neuron in computational graph if isinstance(node, BoundExp): return node return None parser = argparse.ArgumentParser() parser.add_argument("--verify", action="store_true", help='verification mode, do not train') parser.add_argument("--load", type=str, default="", help='Load pretrained model') parser.add_argument("--device", type=str, default="cuda", choices=["cpu", "cuda"], help='use cpu or cuda') parser.add_argument("--data_dir", type=str, default="data/ImageNet64", help='dir of dataset') parser.add_argument("--seed", type=int, default=100, help='random seed') parser.add_argument("--eps", type=float, default=1. 
/ 255, help='Target training epsilon') parser.add_argument("--norm", type=float, default='inf', help='p norm for epsilon perturbation') parser.add_argument("--bound_type", type=str, default="CROWN-IBP", choices=["IBP", "CROWN-IBP", "CROWN"], help='method of bound analysis') parser.add_argument("--model", type=str, default="wide_resnet_imagenet64_1000class", help='model name (mlp_3layer, cnn_4layer, cnn_6layer, cnn_7layer, resnet)') parser.add_argument("--num_epochs", type=int, default=240, help='number of total epochs') parser.add_argument("--batch_size", type=int, default=125, help='batch size') parser.add_argument("--lr", type=float, default=1e-3, help='learning rate') parser.add_argument("--lr_decay_milestones", nargs='+', type=int, default=[200, 220], help='learning rate dacay milestones') parser.add_argument("--scheduler_name", type=str, default="SmoothedScheduler", choices=["LinearScheduler", "AdaptiveScheduler", "SmoothedScheduler"], help='epsilon scheduler') parser.add_argument("--scheduler_opts", type=str, default="start=100,length=80", help='options for epsilon scheduler') parser.add_argument("--bound_opts", type=str, default=None, choices=["same-slope", "zero-lb", "one-lb"], help='bound options') parser.add_argument('--clip_grad_norm', type=float, default=8.0) parser.add_argument('--in_planes', type=int, default=16) parser.add_argument('--widen_factor', type=int, default=10) args = parser.parse_args() exp_name = args.model + '_b' + str(args.batch_size) + '_' + str(args.bound_type) + '_epoch' + str( args.num_epochs) + '_' + args.scheduler_opts + '_' + str(args.eps)[:6] log_file = f'saved_models/{exp_name}{"_test" if args.verify else ""}.log' file_handler = logging.FileHandler(log_file) logger.addHandler(file_handler) def Train(model, t, loader, eps_scheduler, norm, train, opt, bound_type, method='robust', loss_fusion=True, final_node_name=None): num_class = 1000 meter = MultiAverageMeter() if train: model.train() eps_scheduler.train() eps_scheduler.step_epoch() eps_scheduler.set_epoch_length(int((len(loader.dataset) + loader.batch_size - 1) / loader.batch_size)) else: model.eval() eps_scheduler.eval() exp_module = get_exp_module(model) def get_bound_loss(x=None, c=None): if loss_fusion: bound_lower, bound_upper = False, True else: bound_lower, bound_upper = True, False if bound_type == 'IBP': lb, ub = model(method_opt="compute_bounds", x=x, IBP=True, C=c, method=None, final_node_name=final_node_name, no_replicas=True) elif bound_type == 'CROWN': lb, ub = model(method_opt="compute_bounds", x=x, IBP=False, C=c, method='backward', bound_lower=bound_lower, bound_upper=bound_upper) elif bound_type == 'CROWN-IBP': # lb, ub = model.compute_bounds(ptb=ptb, IBP=True, x=data, C=c, method='backward') # pure IBP bound # we use a mixed IBP and CROWN-IBP bounds, leading to better performance (Zhang et al., ICLR 2020) factor = (eps_scheduler.get_max_eps() - eps_scheduler.get_eps()) / eps_scheduler.get_max_eps() ilb, iub = model(method_opt="compute_bounds", x=x, IBP=True, C=c, method=None, final_node_name=final_node_name, no_replicas=True) if factor < 1e-50: lb, ub = ilb, iub else: clb, cub = model(method_opt="compute_bounds", IBP=False, C=c, method='backward', bound_lower=bound_lower, bound_upper=bound_upper, final_node_name=final_node_name, no_replicas=True) if loss_fusion: ub = cub * factor + iub * (1 - factor) else: lb = clb * factor + ilb * (1 - factor) if loss_fusion: if isinstance(model, BoundDataParallel): max_input = model(get_property=True, node_class=BoundExp, att_name='max_input') 
else: max_input = exp_module.max_input return None, torch.mean(torch.log(ub) + max_input) else: # Pad zero at the beginning for each example, and use fake label '0' for all examples lb_padded = torch.cat((torch.zeros(size=(lb.size(0), 1), dtype=lb.dtype, device=lb.device), lb), dim=1) fake_labels = torch.zeros(size=(lb.size(0),), dtype=torch.int64, device=lb.device) robust_ce = CrossEntropyLoss()(-lb_padded, fake_labels) return lb, robust_ce for i, (data, labels) in enumerate(loader): start = time.time() eps_scheduler.step_batch() eps = eps_scheduler.get_eps() # For small eps just use natural training, no need to compute LiRPA bounds batch_method = method if eps < 1e-50: batch_method = "natural" if train: opt.zero_grad() # bound input for Linf norm used only if norm == np.inf: data_max = torch.reshape((1. - loader.mean) / loader.std, (1, -1, 1, 1)) data_min = torch.reshape((0. - loader.mean) / loader.std, (1, -1, 1, 1)) data_ub = torch.min(data + (eps / loader.std).view(1, -1, 1, 1), data_max) data_lb = torch.max(data - (eps / loader.std).view(1, -1, 1, 1), data_min) else: data_ub = data_lb = data if list(model.parameters())[0].is_cuda: data, labels = data.cuda(), labels.cuda() data_lb, data_ub = data_lb.cuda(), data_ub.cuda() ptb = PerturbationLpNorm(norm=norm, eps=eps, x_L=data_lb, x_U=data_ub) x = BoundedTensor(data, ptb) if loss_fusion: if batch_method == 'natural' or not train: output = model(x, labels) regular_ce = torch.mean(torch.log(output)) else: model(x, labels) regular_ce = torch.tensor(0., device=data.device) meter.update('CE', regular_ce.item(), x.size(0)) x = (x, labels) c = None else: c = get_spec_matrix(data, labels, num_class) x = (x, labels) output = model(x, final_node_name=final_node_name) regular_ce = CrossEntropyLoss()(output, labels) # regular CrossEntropyLoss used for warming up meter.update('CE', regular_ce.item(), x[0].size(0)) meter.update('Err', torch.sum(torch.argmax(output, dim=1) != labels).item() / x[0].size(0), x[0].size(0)) if batch_method == 'robust': # print(data.sum()) lb, robust_ce = get_bound_loss(x=x, c=c) loss = robust_ce elif batch_method == 'natural': loss = regular_ce if train: loss.backward() if args.clip_grad_norm: grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad_norm) meter.update('grad_norm', grad_norm) if isinstance(eps_scheduler, AdaptiveScheduler): eps_scheduler.update_loss(loss.item() - regular_ce.item()) opt.step() meter.update('Loss', loss.item(), data.size(0)) if batch_method != 'natural': meter.update('Robust_CE', robust_ce.item(), data.size(0)) if not loss_fusion: # For an example, if lower bounds of margins is >0 for all classes, the output is verifiably correct. 
# If any margin is < 0 this example is counted as an error meter.update('Verified_Err', torch.sum((lb < 0).any(dim=1)).item() / data.size(0), data.size(0)) meter.update('Time', time.time() - start) if (i + 1) % 500 == 0 and train: logger.info('[{:2d}:{:4d}]: eps={:.12f} {}'.format(t, i + 1, eps, meter)) logger.info('[{:2d}:{:4d}]: eps={:.12f} {}'.format(t, i + 1, eps, meter)) return meter def main(args): torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) random.seed(args.seed) np.random.seed(args.seed) ## Step 1: Initial original model as usual, see model details in models/example_feedforward.py and models/example_resnet.py model_ori = models.Models[args.model](in_planes=args.in_planes, widen_factor=args.widen_factor) epoch = 0 if args.load: checkpoint = torch.load(args.load) epoch, state_dict, opt_state = checkpoint['epoch'], checkpoint['state_dict'], checkpoint.get('optimizer') for k, v in state_dict.items(): assert torch.isnan(v).any().cpu().numpy() == 0 and torch.isinf(v).any().cpu().numpy() == 0 model_ori.load_state_dict(state_dict) logger.info('Checkpoint loaded: {}'.format(args.load)) ## Step 2: Prepare dataset as usual dummy_input = torch.randn(2, 3, 56, 56) normalize = transforms.Normalize(mean=[0.4815, 0.4578, 0.4082], std=[0.2153, 0.2111, 0.2121]) train_data = datasets.ImageFolder(args.data_dir + '/train', transform=transforms.Compose([ transforms.RandomHorizontalFlip(), transforms.RandomCrop(56, padding_mode='edge'), transforms.ToTensor(), normalize, ])) test_data = datasets.ImageFolder(args.data_dir + '/test', transform=transforms.Compose([ # transforms.RandomResizedCrop(64, scale=(0.875, 0.875), ratio=(1., 1.)), transforms.CenterCrop(56), transforms.ToTensor(), normalize])) train_data = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=min(multiprocessing.cpu_count(), 4)) test_data = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size // 4, pin_memory=True, num_workers=min(multiprocessing.cpu_count(), 4)) train_data.mean = test_data.mean = torch.tensor([0.4815, 0.4578, 0.4082]) train_data.std = test_data.std = torch.tensor([0.2153, 0.2111, 0.2121]) ## Step 3: wrap model with auto_LiRPA # The second parameter dummy_input is for constructing the trace of the computational graph. 
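# (Editorial note, added for clarity: the second BoundedModule below wraps the model
# together with the cross-entropy loss via CrossEntropyWrapper and enables the
# 'loss_fusion' option. Conceptually, the fused network outputs sum_j exp(z_j - z_y),
# whose log is the cross-entropy loss, so only a single upper bound on this scalar needs
# to be propagated during training instead of per-class margin bounds; this is what keeps
# certified training tractable for the 1000-class ImageNet model used here.)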
model = BoundedModule(model_ori, dummy_input, bound_opts={'activation_bound_option':args.bound_opts}, device=args.device) model_loss = BoundedModule(CrossEntropyWrapper(model_ori), (dummy_input, torch.zeros(1, dtype=torch.long)), bound_opts= { 'activation_bound_option': args.bound_opts, 'loss_fusion': True }, device=args.device) model_loss = BoundDataParallel(model_loss) ## Step 4 prepare optimizer, epsilon scheduler and learning rate scheduler opt = optim.Adam(model_loss.parameters(), lr=args.lr) norm = float(args.norm) lr_scheduler = optim.lr_scheduler.MultiStepLR(opt, milestones=args.lr_decay_milestones, gamma=0.1) eps_scheduler = eval(args.scheduler_name)(args.eps, args.scheduler_opts) logger.info(str(model_ori)) if args.load: if opt_state: opt.load_state_dict(opt_state) logger.info('resume opt_state') # skip epochs if epoch > 0: epoch_length = int((len(train_data.dataset) + train_data.batch_size - 1) / train_data.batch_size) eps_scheduler.set_epoch_length(epoch_length) eps_scheduler.train() for i in range(epoch): lr_scheduler.step() eps_scheduler.step_epoch(verbose=True) for j in range(epoch_length): eps_scheduler.step_batch() logger.info('resume from eps={:.12f}'.format(eps_scheduler.get_eps())) ## Step 5: start training if args.verify: eps_scheduler = FixedScheduler(args.eps) with torch.no_grad(): Train(model, 1, test_data, eps_scheduler, norm, False, None, 'IBP', loss_fusion=False, final_node_name=None) else: timer = 0.0 best_err = 1e10 for t in range(epoch + 1, args.num_epochs + 1): logger.info("Epoch {}, learning rate {}".format(t, lr_scheduler.get_last_lr())) start_time = time.time() Train(model_loss, t, train_data, eps_scheduler, norm, True, opt, args.bound_type, loss_fusion=True) lr_scheduler.step() epoch_time = time.time() - start_time timer += epoch_time logger.info('Epoch time: {:.4f}, Total time: {:.4f}'.format(epoch_time, timer)) logger.info("Evaluating...") torch.cuda.empty_cache() state_dict = sync_params(model_ori, model_loss, loss_fusion=True) with torch.no_grad(): if int(eps_scheduler.params['start']) + int(eps_scheduler.params['length']) > t >= int( eps_scheduler.params['start']): m = Train(model_loss, t, test_data, eps_scheduler, norm, False, None, args.bound_type, loss_fusion=True) else: model_ori.load_state_dict(state_dict) model = BoundedModule(model_ori, dummy_input, bound_opts={'activation_bound_option':args.bound_opts}, device=args.device) model = BoundDataParallel(model) m = Train(model, t, test_data, eps_scheduler, norm, False, None, 'IBP', loss_fusion=False) del model save_dict = {'state_dict': state_dict, 'epoch': t, 'optimizer': opt.state_dict()} if t < int(eps_scheduler.params['start']): torch.save(save_dict, 'saved_models/natural_' + exp_name) elif t > int(eps_scheduler.params['start']) + int(eps_scheduler.params['length']): current_err = m.avg('Verified_Err') if current_err < best_err: best_err = current_err torch.save(save_dict, 'saved_models/' + exp_name + '_best_' + str(best_err)[:6]) else: torch.save(save_dict, 'saved_models/' + exp_name) torch.cuda.empty_cache() if __name__ == "__main__": logger.info(args) main(args) ================================================ FILE: examples/vision/jacobian.py ================================================ """Examples of computing Jacobian bounds. 
We show examples of: - Computing Jacobian bounds - Computing Linf local Lipschitz constants - Computing JVP bounds """ import numpy as np import torch import torch.nn as nn from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import PerturbationLpNorm from auto_LiRPA.utils import Flatten from auto_LiRPA.jacobian import JacobianOP, GradNorm def build_model(in_ch=3, in_dim=32): model = nn.Sequential( Flatten(), nn.Linear(in_ch*in_dim**2, 100), nn.ReLU(), nn.Linear(100, 200), nn.ReLU(), nn.Linear(200, 10), ) return model def example_jacobian(model_ori, x0, bound_opts, device): """Example: computing Jacobian bounds.""" class JacobianWrapper(nn.Module): def __init__(self, model): super().__init__() self.model = model def forward(self, x): y = self.model(x) return JacobianOP.apply(y, x) model = BoundedModule(JacobianWrapper(model_ori), x0, bound_opts=bound_opts, device=device) def func(x0): return model_ori(x0.requires_grad_(True)) ret_ori = torch.autograd.functional.jacobian(func, x0).squeeze(2) ret_new = model(x0) assert torch.allclose(ret_ori, ret_new) ret = [] for eps in [0, 1./255, 4./255]: x = BoundedTensor(x0, PerturbationLpNorm(norm=np.inf, eps=eps)) lower, upper = model.compute_jacobian_bounds(x) print(f'Gap between upper and lower Jacobian bound for eps={eps:.5f}', (upper - lower).max()) if eps == 0: assert torch.allclose( ret_new.view(-1), lower.sum(dim=0, keepdim=True).view(-1)) assert torch.allclose( ret_new.view(-1), upper.sum(dim=0, keepdim=True).view(-1)) ret.append((lower.detach(), upper.detach())) return ret def example_local_lipschitz(model_ori, x0, bound_opts, device): """Example: computing Linf local Lipschitz constant.""" class LocalLipschitzWrapper(nn.Module): def __init__(self, model): super().__init__() self.model = model self.grad_norm = GradNorm(norm=1) def forward(self, x, mask): y = self.model(x) y_selected = y.matmul(mask) jacobian = JacobianOP.apply(y_selected, x) lipschitz = self.grad_norm(jacobian) return lipschitz mask = torch.zeros(10, 1, device=device) mask[1, 0] = 1 model = BoundedModule(LocalLipschitzWrapper(model_ori), (BoundedTensor(x0), mask), bound_opts=bound_opts, device=device) y = model_ori(x0.requires_grad_(True)) ret_ori = torch.autograd.grad(y[:, 1].sum(), x0)[0].abs().flatten(1).sum(dim=-1).view(-1) ret_new = model(x0, mask).view(-1) assert torch.allclose(ret_ori, ret_new) ret = [] for eps in [0, 1./255, 4./255]: x = BoundedTensor(x0, PerturbationLpNorm(norm=np.inf, eps=eps)) lip = [] for i in range(mask.shape[0]): mask.zero_() mask[i, 0] = 1 ub = model.compute_jacobian_bounds((x, mask), bound_lower=False)[1] lip.append(ub) lip = torch.concat(lip).max() print(f'Linf local Lipschitz constant for eps={eps:.5f}: {lip.item()}') ret.append(lip.detach()) return ret def example_jvp(model_ori, x0, bound_opts, device): """Example: computing Jacobian-Vector Product.""" class JVPWrapper(nn.Module): def __init__(self, model): super().__init__() self.model = model self.grad_norm = GradNorm(norm=1) def forward(self, x, v): y = self.model(x) jacobian = JacobianOP.apply(y, x).flatten(2) jvp = (jacobian * v.flatten(1).unsqueeze(1)).sum(dim=-1) return jvp vector = torch.rand_like(x0) model = BoundedModule(JVPWrapper(model_ori), (BoundedTensor(x0), vector), bound_opts=bound_opts, device=device) def func(x0): return model_ori(x0.requires_grad_(True)) ret_ori = torch.autograd.functional.jvp(func, x0, vector)[-1].view(-1) ret_new = model(x0, vector) assert torch.allclose(ret_ori, ret_new) ret = [] for eps in [0, 1./255, 4./255]: x = 
BoundedTensor(x0, PerturbationLpNorm(norm=np.inf, eps=eps)) lb, ub = model.compute_jacobian_bounds((x, vector)) print(f'JVP lower bound for eps={eps:.5f}: {lb}') print(f'JVP upper bound for eps={eps:.5f}: {ub}') ret.append((lb, ub)) return ret def compute_jacobians(model_ori, x0, bound_opts=None, device='cpu'): results = [[] for _ in range(3)] model_ori = model_ori.to(device) x0 = x0.to(device) print('Model:', model_ori) results[0] = example_jacobian(model_ori, x0, bound_opts, device) results[1] = example_local_lipschitz(model_ori, x0, bound_opts, device) results[2] = example_jvp(model_ori, x0, bound_opts, device) return results if __name__ == '__main__': torch.manual_seed(0) # Create a small model and load pre-trained parameters. model_ori = build_model(in_dim=8) device = 'cuda' if torch.cuda.is_available() else 'cpu' x0 = torch.randn(1, 3, 8, 8, device=device) compute_jacobians(model_ori, x0, device=device) ================================================ FILE: examples/vision/models/__init__.py ================================================ from models.resnet import model_resnet from models.feedforward import * from models.resnext import * from models.resnext_imagenet64 import * from models.densenet import * from models.mobilenet import * from models.densenet_no_bn import * from models.densenet_imagenet import * from models.wide_resnet_imagenet64 import * from models.wide_resnet_cifar import * from models.resnet18 import * from models.vnncomp_resnet import resnet2b as vnncomp_resnet2b, resnet4b as vnncomp_resnet4b Models = { 'mlp_2layer': mlp_2layer, 'mlp_3layer': mlp_3layer, 'mlp_3layer_weight_perturb': mlp_3layer_weight_perturb, 'mlp_5layer': mlp_5layer, 'cnn_4layer': cnn_4layer, 'cnn_6layer': cnn_6layer, 'cnn_7layer': cnn_7layer, 'cnn_7layer_bn': cnn_7layer_bn, 'cnn_7layer_bn_imagenet': cnn_7layer_bn_imagenet, 'resnet': model_resnet, 'resnet18': ResNet18, 'ResNeXt_cifar': ResNeXt_cifar, 'ResNeXt_imagenet64': ResNeXt_imagenet64, 'Densenet_cifar_32': Densenet_cifar_32, 'Densenet_cifar_wobn': Densenet_cifar_wobn, 'Densenet_imagenet': Densenet_imagenet, 'MobileNet_cifar': MobileNetV2, 'wide_resnet_cifar': wide_resnet_cifar, 'wide_resnet_cifar_bn': wide_resnet_cifar_bn, 'wide_resnet_cifar_bn_wo_pooling': wide_resnet_cifar_bn_wo_pooling, 'wide_resnet_cifar_bn_wo_pooling_dropout': wide_resnet_cifar_bn_wo_pooling_dropout, 'wide_resnet_imagenet64': wide_resnet_imagenet64, 'wide_resnet_imagenet64_1000class': wide_resnet_imagenet64_1000class, 'vnncomp_resnet2b': vnncomp_resnet2b, 'vnncomp_resnet4b': vnncomp_resnet4b, } ================================================ FILE: examples/vision/models/densenet.py ================================================ '''DenseNet in PyTorch. 
https://github.com/kuangliu/pytorch-cifar ''' import math import torch import torch.nn as nn import torch.nn.functional as F class Bottleneck(nn.Module): def __init__(self, in_planes, growth_rate): super(Bottleneck, self).__init__() self.bn1 = nn.BatchNorm2d(in_planes) self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=True) self.bn2 = nn.BatchNorm2d(4*growth_rate) self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=True) def forward(self, x): out = self.conv1(F.relu(self.bn1(x))) out = self.conv2(F.relu(self.bn2(out))) # out = self.conv1(F.relu(x)) # out = self.conv2(F.relu(out)) out = torch.cat([out,x], 1) return out class Transition(nn.Module): def __init__(self, in_planes, out_planes): super(Transition, self).__init__() self.bn = nn.BatchNorm2d(in_planes) self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=True) def forward(self, x): out = self.conv(F.relu(self.bn(x))) out = F.avg_pool2d(out, 2) return out class DenseNet(nn.Module): def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10): super(DenseNet, self).__init__() self.growth_rate = growth_rate num_planes = 2*growth_rate self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=True) self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0]) num_planes += nblocks[0]*growth_rate out_planes = int(math.floor(num_planes*reduction)) self.trans1 = Transition(num_planes, out_planes) num_planes = out_planes self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1]) num_planes += nblocks[1]*growth_rate out_planes = int(math.floor(num_planes*reduction)) self.trans2 = Transition(num_planes, out_planes) num_planes = out_planes self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2]) num_planes += nblocks[2]*growth_rate # out_planes = int(math.floor(num_planes*reduction)) # self.trans3 = Transition(num_planes, out_planes) # num_planes = out_planes # self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3]) # num_planes += nblocks[3]*growth_rate self.bn = nn.BatchNorm2d(num_planes) self.linear1 = nn.Linear(14336, 512) self.linear2 = nn.Linear(512, num_classes) def _make_dense_layers(self, block, in_planes, nblock): layers = [] for i in range(nblock): layers.append(block(in_planes, self.growth_rate)) in_planes += self.growth_rate return nn.Sequential(*layers) def forward(self, x): out = self.conv1(x) out = self.trans1(self.dense1(out)) out = self.trans2(self.dense2(out)) out = self.dense3(out) out = F.relu(self.bn(out)) out = torch.flatten(out, 1) out = F.relu(self.linear1(out)) out = self.linear2(out) return out def Densenet_cifar_32(in_ch=3, in_dim=32): return DenseNet(Bottleneck, [2,4,4], growth_rate=32) if __name__ == "__main__": from thop import profile net = Densenet_cifar_32() x = torch.randn(1,3,32,32) y = net(x) print(net) macs, params = profile(net, (torch.randn(1, 3, 32, 32),)) print(macs / 1000000, params / 1000000) # 6830M, 7M print(y) ================================================ FILE: examples/vision/models/densenet_imagenet.py ================================================ '''DenseNet in PyTorch. 
https://github.com/kuangliu/pytorch-cifar ''' import math import torch import torch.nn as nn import torch.nn.functional as F class Bottleneck(nn.Module): def __init__(self, in_planes, growth_rate): super(Bottleneck, self).__init__() self.bn1 = nn.BatchNorm2d(in_planes) self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=True) self.bn2 = nn.BatchNorm2d(4*growth_rate) self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=True) def forward(self, x): out = self.conv1(F.relu(self.bn1(x))) out = self.conv2(F.relu(self.bn2(out))) # out = self.conv1(F.relu(x)) # out = self.conv2(F.relu(out)) out = torch.cat([out,x], 1) return out class Transition(nn.Module): def __init__(self, in_planes, out_planes): super(Transition, self).__init__() self.bn = nn.BatchNorm2d(in_planes) self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=True) def forward(self, x): out = self.conv(F.relu(self.bn(x))) out = F.avg_pool2d(out, 2) return out class DenseNet(nn.Module): def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=200): super(DenseNet, self).__init__() self.growth_rate = growth_rate num_planes = 2*growth_rate self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=True) self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0]) num_planes += nblocks[0]*growth_rate out_planes = int(math.floor(num_planes*reduction)) self.trans1 = Transition(num_planes, out_planes) num_planes = out_planes self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1]) num_planes += nblocks[1]*growth_rate out_planes = int(math.floor(num_planes*reduction)) self.trans2 = Transition(num_planes, out_planes) num_planes = out_planes self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2]) num_planes += nblocks[2]*growth_rate # out_planes = int(math.floor(num_planes*reduction)) # self.trans3 = Transition(num_planes, out_planes) # num_planes = out_planes # self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3]) # num_planes += nblocks[3]*growth_rate self.bn = nn.BatchNorm2d(num_planes) self.linear1 = nn.Linear(43904, 512) self.linear2 = nn.Linear(512, num_classes) def _make_dense_layers(self, block, in_planes, nblock): layers = [] for i in range(nblock): layers.append(block(in_planes, self.growth_rate)) in_planes += self.growth_rate return nn.Sequential(*layers) def forward(self, x): out = self.conv1(x) out = self.trans1(self.dense1(out)) out = self.trans2(self.dense2(out)) out = self.dense3(out) out = F.relu(self.bn(out)) out = torch.flatten(out, 1) out = F.relu(self.linear1(out)) out = self.linear2(out) return out def Densenet_imagenet(in_ch=3, in_dim=56): return DenseNet(Bottleneck, [2,4,4], growth_rate=32) if __name__ == "__main__": from thop import profile net = Densenet_imagenet() x = torch.randn(1,3,56,56) y = net(x) print(net) macs, params = profile(net, (torch.randn(1, 3, 56, 56),)) print(macs / 1000000, params / 1000000) # 564M, 11M print(y.shape) ================================================ FILE: examples/vision/models/densenet_no_bn.py ================================================ '''DenseNet in PyTorch. 
https://github.com/kuangliu/pytorch-cifar ''' import math import torch import torch.nn as nn import torch.nn.functional as F class Bottleneck(nn.Module): def __init__(self, in_planes, growth_rate): super(Bottleneck, self).__init__() # self.bn1 = nn.BatchNorm2d(in_planes) self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=True) # self.bn2 = nn.BatchNorm2d(4*growth_rate) self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=True) def forward(self, x): # out = self.conv1(F.relu(self.bn1(x))) # out = self.conv2(F.relu(self.bn2(out))) out = self.conv1(F.relu(x)) out = self.conv2(F.relu(out)) out = torch.cat([out,x], 1) return out class Transition(nn.Module): def __init__(self, in_planes, out_planes): super(Transition, self).__init__() # self.bn = nn.BatchNorm2d(in_planes) self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=True) def forward(self, x): out = self.conv(F.relu(x)) out = F.avg_pool2d(out, 2) return out class DenseNet(nn.Module): def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10): super(DenseNet, self).__init__() self.growth_rate = growth_rate num_planes = 2*growth_rate self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=True) self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0]) num_planes += nblocks[0]*growth_rate out_planes = int(math.floor(num_planes*reduction)) self.trans1 = Transition(num_planes, out_planes) num_planes = out_planes self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1]) num_planes += nblocks[1]*growth_rate out_planes = int(math.floor(num_planes*reduction)) self.trans2 = Transition(num_planes, out_planes) num_planes = out_planes self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2]) num_planes += nblocks[2]*growth_rate # out_planes = int(math.floor(num_planes*reduction)) # self.trans3 = Transition(num_planes, out_planes) # num_planes = out_planes # self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3]) # num_planes += nblocks[3]*growth_rate # self.bn = nn.BatchNorm2d(num_planes) self.linear1 = nn.Linear(9216, 512) self.linear2 = nn.Linear(512, num_classes) def _make_dense_layers(self, block, in_planes, nblock): layers = [] for i in range(nblock): layers.append(block(in_planes, self.growth_rate)) in_planes += self.growth_rate return nn.Sequential(*layers) def forward(self, x): out = self.conv1(x) out = self.trans1(self.dense1(out)) out = self.trans2(self.dense2(out)) out = self.dense3(out) out = F.relu(out) out = torch.flatten(out, 1) out = F.relu(self.linear1(out)) out = self.linear2(out) return out def Densenet_cifar_wobn(in_ch=3, in_dim=56): return DenseNet(Bottleneck, [2,4,6], growth_rate=16) if __name__ == "__main__": net = Densenet_cifar_wobn() x = torch.randn(1,3,32,32) y = net(x) print(net) print(y) ================================================ FILE: examples/vision/models/feedforward.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from auto_LiRPA import PerturbationLpNorm, BoundedParameter # CNN, relatively large 4-layer # parameter in_ch: input image channel, 1 for MNIST and 3 for CIFAR # parameter in_dim: input dimension, 28 for MNIST and 32 for CIFAR # parameter width: width multiplier class cnn_4layer(nn.Module): def __init__(self, in_ch, in_dim, width=2, linear_size=256): super(cnn_4layer, self).__init__() self.conv1 = nn.Conv2d(in_ch, 4 * width, 4, stride=2, padding=1) self.conv2 = nn.Conv2d(4 * width, 8 * width, 4, 
stride=2, padding=1) self.fc1 = nn.Linear(8 * width * (in_dim // 4) * (in_dim // 4), linear_size) self.fc2 = nn.Linear(linear_size, 10) def forward(self, x): x = F.relu(self.conv1(x)) x = F.relu(self.conv2(x)) x = torch.flatten(x, 1) x = F.relu(self.fc1(x)) x = self.fc2(x) return x class mlp_2layer(nn.Module): def __init__(self, in_ch, in_dim, width=1): super(mlp_2layer, self).__init__() self.fc1 = nn.Linear(in_ch * in_dim * in_dim, 256 * width) self.fc2 = nn.Linear(256 * width, 10) def forward(self, x): x = torch.flatten(x, 1) x = F.relu(self.fc1(x)) x = self.fc2(x) return x class mlp_3layer(nn.Module): def __init__(self, in_ch, in_dim, width=1): super(mlp_3layer, self).__init__() self.fc1 = nn.Linear(in_ch * in_dim * in_dim, 256 * width) self.fc2 = nn.Linear(256 * width, 128 * width) self.fc3 = nn.Linear(128 * width, 10) def forward(self, x): x = torch.flatten(x, 1) x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = self.fc3(x) return x class mlp_3layer_weight_perturb(nn.Module): def __init__(self, in_ch=1, in_dim=28, width=1, pert_weight=True, pert_bias=False, norm=2): super(mlp_3layer_weight_perturb, self).__init__() self.fc1 = nn.Linear(in_ch * in_dim * in_dim, 64 * width) self.fc2 = nn.Linear(64 * width, 64 * width) self.fc3 = nn.Linear(64 * width, 10) eps = 0.01 self.ptb = PerturbationLpNorm(norm=norm, eps=eps) if pert_weight: self.fc1.weight = BoundedParameter(self.fc1.weight.data, self.ptb) self.fc2.weight = BoundedParameter(self.fc2.weight.data, self.ptb) self.fc3.weight = BoundedParameter(self.fc3.weight.data, self.ptb) if pert_bias: self.fc1.bias = BoundedParameter(self.fc1.bias.data, self.ptb) self.fc2.bias = BoundedParameter(self.fc2.bias.data, self.ptb) self.fc3.bias = BoundedParameter(self.fc3.bias.data, self.ptb) def forward(self, x): x = x.view(-1, 784) x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = self.fc3(x) return x class mlp_5layer(nn.Module): def __init__(self, in_ch, in_dim, width=1): super(mlp_5layer, self).__init__() self.fc1 = nn.Linear(in_ch * in_dim * in_dim, 256 * width) self.fc2 = nn.Linear(256 * width, 256 * width) self.fc3 = nn.Linear(256 * width, 256 * width) self.fc4 = nn.Linear(256 * width, 128 * width) self.fc5 = nn.Linear(128 * width, 10) def forward(self, x): x = torch.flatten(x, 1) x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = F.relu(self.fc3(x)) x = F.relu(self.fc4(x)) x = self.fc5(x) return x # Model can also be defined as a nn.Sequential def cnn_7layer(in_ch=3, in_dim=32, width=64, linear_size=512): model = nn.Sequential( nn.Conv2d(in_ch, width, 3, stride=1, padding=1), nn.ReLU(), nn.Conv2d(width, width, 3, stride=1, padding=1), nn.ReLU(), nn.Conv2d(width, 2 * width, 3, stride=2, padding=1), nn.ReLU(), nn.Conv2d(2 * width, 2 * width, 3, stride=1, padding=1), nn.ReLU(), nn.Conv2d(2 * width, 2 * width, 3, stride=1, padding=1), nn.ReLU(), nn.Flatten(), nn.Linear((in_dim//2) * (in_dim//2) * 2 * width, linear_size), nn.ReLU(), nn.Linear(linear_size,10) ) return model def cnn_7layer_bn(in_ch=3, in_dim=32, width=64, linear_size=512): model = nn.Sequential( nn.Conv2d(in_ch, width, 3, stride=1, padding=1), nn.BatchNorm2d(width), nn.ReLU(), nn.Conv2d(width, width, 3, stride=1, padding=1), nn.BatchNorm2d(width), nn.ReLU(), nn.Conv2d(width, 2 * width, 3, stride=2, padding=1), nn.BatchNorm2d(2 * width), nn.ReLU(), nn.Conv2d(2 * width, 2 * width, 3, stride=1, padding=1), nn.BatchNorm2d(2 * width), nn.ReLU(), nn.Conv2d(2 * width, 2 * width, 3, stride=1, padding=1), nn.BatchNorm2d(2 * width), nn.ReLU(), nn.Flatten(), nn.Linear((in_dim//2) * 
(in_dim//2) * 2 * width, linear_size), nn.ReLU(), nn.Linear(linear_size,10) ) return model def cnn_7layer_bn_imagenet(in_ch=3, in_dim=32, width=64, linear_size=512): model = nn.Sequential( nn.Conv2d(in_ch, width, 3, stride=1, padding=1), nn.BatchNorm2d(width), nn.ReLU(), nn.Conv2d(width, width, 3, stride=1, padding=1), nn.BatchNorm2d(width), nn.ReLU(), nn.Conv2d(width, 2 * width, 3, stride=2, padding=1), nn.BatchNorm2d(2 * width), nn.ReLU(), nn.Conv2d(2 * width, 2 * width, 3, stride=1, padding=1), nn.BatchNorm2d(2 * width), nn.ReLU(), nn.Conv2d(2 * width, 2 * width, 3, stride=2, padding=1), nn.BatchNorm2d(2 * width), nn.ReLU(), nn.Flatten(), nn.Linear(25088, linear_size), nn.ReLU(), nn.Linear(linear_size,200) ) return model def cnn_6layer(in_ch, in_dim, width=32, linear_size=256): model = nn.Sequential( nn.Conv2d(in_ch, width, 3, stride=1, padding=1), nn.ReLU(), nn.Conv2d(width, width, 3, stride=1, padding=1), nn.ReLU(), nn.Conv2d(width, 2 * width, 3, stride=2, padding=1), nn.ReLU(), nn.Conv2d(2 * width, 2 * width, 3, stride=1, padding=1), nn.ReLU(), nn.Flatten(), nn.Linear((in_dim//2) * (in_dim//2) * 2 * width, linear_size), nn.ReLU(), nn.Linear(linear_size,10) ) return model ================================================ FILE: examples/vision/models/mobilenet.py ================================================ '''MobileNetV2 in PyTorch. See the paper "Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation" for more details. ''' import torch import torch.nn as nn import torch.nn.functional as F class Block(nn.Module): '''expand + depthwise + pointwise''' def __init__(self, in_planes, out_planes, expansion, stride): super(Block, self).__init__() self.stride = stride planes = expansion * in_planes self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, stride=1, padding=0, bias=False) # self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, groups=planes, bias=False) # self.bn2 = nn.BatchNorm2d(planes) self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) # self.bn3 = nn.BatchNorm2d(out_planes) self.shortcut = nn.Sequential() if stride == 1 and in_planes != out_planes: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False), # nn.BatchNorm2d(out_planes), ) def forward(self, x): out = F.relu((self.conv1(x))) out = F.relu((self.conv2(out))) out = self.conv3(out) out = out + self.shortcut(x) if self.stride==1 else out return out class MobileNetV2(nn.Module): # (expansion, out_planes, num_blocks, stride) cfg = [(1, 16, 1, 1), (6, 24, 2, 1), # NOTE: change stride 2 -> 1 for CIFAR10 (6, 32, 3, 2), (6, 64, 4, 2), (6, 96, 3, 1), (6, 160, 3, 2), (6, 320, 1, 1)] def __init__(self, num_classes=10): super(MobileNetV2, self).__init__() # NOTE: change conv1 stride 2 -> 1 for CIFAR10 self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False) # self.bn1 = nn.BatchNorm2d(32) self.layers = self._make_layers(in_planes=32) self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False) # self.bn2 = nn.BatchNorm2d(1280) self.linear = nn.Linear(1280, num_classes) def _make_layers(self, in_planes): layers = [] for expansion, out_planes, num_blocks, stride in self.cfg: strides = [stride] + [1]*(num_blocks-1) for stride in strides: layers.append(Block(in_planes, out_planes, expansion, stride)) in_planes = out_planes return nn.Sequential(*layers) def forward(self, x): out = 
F.relu((self.conv1(x))) out = self.layers(out) out = F.relu((self.conv2(out))) # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10 out = F.avg_pool2d(out, 4) out = torch.flatten(out, 1) out = self.linear(out) return out if __name__ == "__main__": net = MobileNetV2() x = torch.randn(2,3,32,32) y = net(x) print(y.size()) ================================================ FILE: examples/vision/models/resnet.py ================================================ ''' ResNet used in https://arxiv.org/pdf/1805.12514.pdf https://github.com/locuslab/convex_adversarial/blob/0d11e671ad9318745a2439afce513c82dc6bf5ce/examples/problems.py ''' import torch import torch.nn as nn import math class Dense(nn.Module): def __init__(self, *Ws): super(Dense, self).__init__() self.Ws = nn.ModuleList(list(Ws)) if len(Ws) > 0 and hasattr(Ws[0], 'out_features'): self.out_features = Ws[0].out_features def forward(self, *xs): xs = xs[-len(self.Ws):] out = sum(W(x) for x, W in zip(xs, self.Ws) if W is not None) return out class DenseSequential(nn.Sequential): def forward(self, x): xs = [x] for module in self._modules.values(): if 'Dense' in type(module).__name__: xs.append(module(*xs)) else: xs.append(module(xs[-1])) return xs[-1] def model_resnet(in_ch=3, in_dim=32, width=1, mult=16, N=1): def block(in_filters, out_filters, k, downsample): if not downsample: k_first = 3 skip_stride = 1 k_skip = 1 else: k_first = 4 skip_stride = 2 k_skip = 2 return [ Dense(nn.Conv2d(in_filters, out_filters, k_first, stride=skip_stride, padding=1)), nn.ReLU(), Dense(nn.Conv2d(in_filters, out_filters, k_skip, stride=skip_stride, padding=0), None, nn.Conv2d(out_filters, out_filters, k, stride=1, padding=1)), nn.ReLU() ] conv1 = [nn.Conv2d(in_ch, mult, 3, stride=1, padding=3 if in_dim == 28 else 1), nn.ReLU()] conv2 = block(mult, mult * width, 3, False) for _ in range(N): conv2.extend(block(mult * width, mult * width, 3, False)) conv3 = block(mult * width, mult * 2 * width, 3, True) for _ in range(N - 1): conv3.extend(block(mult * 2 * width, mult * 2 * width, 3, False)) conv4 = block(mult * 2 * width, mult * 4 * width, 3, True) for _ in range(N - 1): conv4.extend(block(mult * 4 * width, mult * 4 * width, 3, False)) layers = ( conv1 + conv2 + conv3 + conv4 + [nn.Flatten(), nn.Linear(mult * 4 * width * 8 * 8, 1000), nn.ReLU(), nn.Linear(1000, 10)] ) model = DenseSequential( *layers ) for m in model.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.data.normal_(0, math.sqrt(2. / n)) if m.bias is not None: m.bias.data.zero_() return model if __name__ == "__main__": model = model_resnet(in_ch=1, in_dim=28) dummy = torch.randn(8, 1, 28, 28) print(model) print(model(dummy).shape) ================================================ FILE: examples/vision/models/resnet18.py ================================================ '''ResNet in PyTorch. For Pre-activation ResNet, see 'preact_resnet.py'. Reference: [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun Deep Residual Learning for Image Recognition. 
arXiv:1512.03385 ''' import torch import torch.nn as nn import torch.nn.functional as F class BasicBlock(nn.Module): expansion = 1 def __init__(self, in_planes, planes, stride=1): super(BasicBlock, self).__init__() self.conv1 = nn.Conv2d( in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) self.bn2 = nn.BatchNorm2d(planes) self.shortcut = nn.Sequential() if stride != 1 or in_planes != self.expansion*planes: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(self.expansion*planes) ) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.bn2(self.conv2(out)) out += self.shortcut(x) out = F.relu(out) return out class Bottleneck(nn.Module): expansion = 4 def __init__(self, in_planes, planes, stride=1): super(Bottleneck, self).__init__() self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) self.bn2 = nn.BatchNorm2d(planes) self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False) self.bn3 = nn.BatchNorm2d(self.expansion*planes) self.shortcut = nn.Sequential() if stride != 1 or in_planes != self.expansion*planes: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(self.expansion*planes) ) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = F.relu(self.bn2(self.conv2(out))) out = self.bn3(self.conv3(out)) out += self.shortcut(x) out = F.relu(out) return out class ResNet(nn.Module): def __init__(self, block, num_blocks, num_classes=10, in_planes=64): super(ResNet, self).__init__() self.in_planes = in_planes self.conv1 = nn.Conv2d(3, in_planes, kernel_size=3, stride=1, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(in_planes) self.layer1 = self._make_layer(block, in_planes, num_blocks[0], stride=1) self.layer2 = self._make_layer(block, in_planes * 2, num_blocks[1], stride=2) self.layer3 = self._make_layer(block, in_planes * 4, num_blocks[2], stride=2) self.layer4 = self._make_layer(block, in_planes * 8, num_blocks[3], stride=2) self.linear = nn.Linear(in_planes * 8 * block.expansion, num_classes) def _make_layer(self, block, planes, num_blocks, stride): strides = [stride] + [1]*(num_blocks-1) layers = [] for stride in strides: layers.append(block(self.in_planes, planes, stride)) self.in_planes = planes * block.expansion return nn.Sequential(*layers) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) out = self.layer4(out) out = F.avg_pool2d(out, 4) out = torch.flatten(out, 1) out = self.linear(out) return out def ResNet18(in_planes=64): return ResNet(BasicBlock, [2, 2, 2, 2], in_planes=in_planes) if __name__ == "__main__": from thop import profile net = ResNet18(in_planes=64) x = torch.randn(1,3,32,32) y = net(x) print(net) macs, params = profile(net, (torch.randn(1, 3, 32, 32),)) print(macs / 1000000, params / 1000000) # 556M, 11M print(y) ================================================ FILE: examples/vision/models/resnext.py ================================================ '''ResNeXt in PyTorch. See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details. 
https://github.com/kuangliu/pytorch-cifar ''' import torch import torch.nn as nn import torch.nn.functional as F class Block(nn.Module): '''Grouped convolution block.''' expansion = 2 def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1): super(Block, self).__init__() group_width = cardinality * bottleneck_width self.conv1 = nn.Conv2d(in_planes, group_width, kernel_size=1, bias=True) self.bn1 = nn.BatchNorm2d(group_width) self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=True) # self.bn2 = nn.BatchNorm2d(group_width) self.conv3 = nn.Conv2d(group_width, self.expansion*group_width, kernel_size=1, bias=True) # self.bn3 = nn.BatchNorm2d(self.expansion*group_width) self.shortcut = nn.Sequential() if stride != 1 or in_planes != self.expansion*group_width: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, self.expansion*group_width, kernel_size=1, stride=stride, bias=True), # nn.BatchNorm2d(self.expansion*group_width) ) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) # out = F.relu(self.bn2(self.conv2(out))) # out = self.bn3(self.conv3(out)) # out = F.relu(self.conv1(x)) out = F.relu(self.conv2(out)) out = self.conv3(out) out += self.shortcut(x) out = F.relu(out) return out class ResNeXt(nn.Module): def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10): super(ResNeXt, self).__init__() self.cardinality = cardinality self.bottleneck_width = bottleneck_width self.in_planes = 16 self.conv1 = nn.Conv2d(3, 16, kernel_size=3, bias=True, padding=1) # self.bn1 = nn.BatchNorm2d(16) self.layer1 = self._make_layer(num_blocks[0], 1) self.layer2 = self._make_layer(num_blocks[1], 2) self.layer3 = self._make_layer(num_blocks[2], 2) # self.layer4 = self._make_layer(num_blocks[3], 2) self.linear1 = nn.Linear(cardinality*bottleneck_width*512, 512) self.linear2 = nn.Linear(512, num_classes) def _make_layer(self, num_blocks, stride): strides = [stride] + [1]*(num_blocks-1) layers = [] for stride in strides: layers.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, stride)) self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width # Increase bottleneck_width by 2 after each stage. self.bottleneck_width *= 2 return nn.Sequential(*layers) def forward(self, x): out = F.relu(self.conv1(x)) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) out = torch.flatten(out, 1) out = F.relu(self.linear1(out)) out = self.linear2(out) return out def ResNeXt29_2x64d(): return ResNeXt(num_blocks=[3,3,3], cardinality=2, bottleneck_width=64) def ResNeXt29_4x64d(): return ResNeXt(num_blocks=[3,3,3], cardinality=4, bottleneck_width=64) def ResNeXt29_8x64d(): return ResNeXt(num_blocks=[3,3,3], cardinality=8, bottleneck_width=64) def ResNeXt29_32x4d(): return ResNeXt(num_blocks=[3,3,3], cardinality=32, bottleneck_width=4) def ResNeXt_cifar(in_ch=3, in_dim=32): return ResNeXt(num_blocks=[1,1,1], cardinality=2, bottleneck_width=32) if __name__ == "__main__": from thop import profile net = ResNeXt_cifar() x = torch.randn(1,3,32,32) y = net(x) print(net) macs, params = profile(net, (torch.randn(1, 3, 32, 32),)) print(macs / 1000000, params / 1000000) # 6830M, 7M print(y) ================================================ FILE: examples/vision/models/resnext_imagenet64.py ================================================ '''ResNeXt in PyTorch. See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details. 
https://github.com/kuangliu/pytorch-cifar ''' import torch import torch.nn as nn import torch.nn.functional as F class Block(nn.Module): '''Grouped convolution block.''' expansion = 2 def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1): super(Block, self).__init__() group_width = cardinality * bottleneck_width self.conv1 = nn.Conv2d(in_planes, group_width, kernel_size=1, bias=True) self.bn1 = nn.BatchNorm2d(group_width) self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=True) # self.bn2 = nn.BatchNorm2d(group_width) self.conv3 = nn.Conv2d(group_width, self.expansion*group_width, kernel_size=1, bias=True) # self.bn3 = nn.BatchNorm2d(self.expansion*group_width) self.shortcut = nn.Sequential() if stride != 1 or in_planes != self.expansion*group_width: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, self.expansion*group_width, kernel_size=1, stride=stride, bias=True), # nn.BatchNorm2d(self.expansion*group_width) ) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) # out = F.relu(self.bn2(self.conv2(out))) # out = self.bn3(self.conv3(out)) # out = F.relu(self.conv1(x)) out = F.relu(self.conv2(out)) out = self.conv3(out) out += self.shortcut(x) out = F.relu(out) return out class ResNeXt(nn.Module): def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=200): super(ResNeXt, self).__init__() self.cardinality = cardinality self.bottleneck_width = bottleneck_width self.in_planes = 16 self.conv1 = nn.Conv2d(3, 16, kernel_size=3, bias=True, padding=1) # self.bn1 = nn.BatchNorm2d(16) self.layer1 = self._make_layer(num_blocks[0], 1) self.layer2 = self._make_layer(num_blocks[1], 2) self.layer3 = self._make_layer(num_blocks[2], 2) # self.layer4 = self._make_layer(num_blocks[3], 2) self.linear1 = nn.Linear(cardinality*bottleneck_width*1568, 512) self.linear2 = nn.Linear(512, num_classes) def _make_layer(self, num_blocks, stride): strides = [stride] + [1]*(num_blocks-1) layers = [] for stride in strides: layers.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, stride)) self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width # Increase bottleneck_width by 2 after each stage. 
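# (More precisely, the width is doubled after the whole stage: the next call to
# _make_layer builds its Blocks with bottleneck_width * 2, while self.in_planes was
# already updated inside the loop above to the previous stage's output channel count.)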
self.bottleneck_width *= 2 return nn.Sequential(*layers) def forward(self, x): out = F.relu(self.conv1(x)) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) out = torch.flatten(out, 1) out = F.relu(self.linear1(out)) out = self.linear2(out) return out def ResNeXt_imagenet64(): return ResNeXt(num_blocks=[2,2,2], cardinality=2, bottleneck_width=8) if __name__ == "__main__": from thop import profile net = ResNeXt_imagenet64() x = torch.randn(1,3,56,56) y = net(x) print(net) macs, params = profile(net, (torch.randn(1, 3, 56, 56),)) print(macs / 1000000, params / 1000000) # 64M, 13M print(y.shape) ================================================ FILE: examples/vision/models/vnncomp_resnet.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F class BasicBlock(nn.Module): expansion = 1 def __init__(self, in_planes, planes, stride=1, bn=True, kernel=3): super(BasicBlock, self).__init__() self.bn = bn if kernel == 3: self.conv1 = nn.Conv2d( in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=(not self.bn)) if self.bn: self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=(not self.bn)) elif kernel == 2: self.conv1 = nn.Conv2d( in_planes, planes, kernel_size=2, stride=stride, padding=1, bias=(not self.bn)) if self.bn: self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=2, stride=1, padding=0, bias=(not self.bn)) elif kernel == 1: self.conv1 = nn.Conv2d( in_planes, planes, kernel_size=1, stride=stride, padding=0, bias=(not self.bn)) if self.bn: self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=1, stride=1, padding=0, bias=(not self.bn)) else: exit("kernel not supported!") if self.bn: self.bn2 = nn.BatchNorm2d(planes) self.shortcut = nn.Sequential() if stride != 1 or in_planes != self.expansion*planes: if self.bn: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=(not self.bn)), nn.BatchNorm2d(self.expansion*planes) ) else: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=(not self.bn)), ) def forward(self, x): if self.bn: out = F.relu(self.bn1(self.conv1(x))) out = self.bn2(self.conv2(out)) else: out = F.relu(self.conv1(x)) out = self.conv2(out) out += self.shortcut(x) out = F.relu(out) return out class ResNet5(nn.Module): def __init__(self, block, num_blocks=2, num_classes=10, in_planes=64, bn=True, last_layer="avg"): super(ResNet5, self).__init__() self.in_planes = in_planes self.bn = bn self.last_layer = last_layer self.conv1 = nn.Conv2d(3, in_planes, kernel_size=3, stride=2, padding=1, bias=not self.bn) if self.bn: self.bn1 = nn.BatchNorm2d(in_planes) self.layer1 = self._make_layer(block, in_planes*2, num_blocks, stride=2, bn=bn, kernel=3) if self.last_layer == "avg": self.avg2d = nn.AvgPool2d(4) self.linear = nn.Linear(in_planes * 8 * block.expansion, num_classes) elif self.last_layer == "dense": self.linear1 = nn.Linear(in_planes * 8 * block.expansion * 16, 100) self.linear2 = nn.Linear(100, num_classes) else: exit("last_layer type not supported!") def _make_layer(self, block, planes, num_blocks, stride, bn, kernel): strides = [stride] + [1]*(num_blocks-1) layers = [] for stride in strides: layers.append(block(self.in_planes, planes, stride, bn, kernel)) self.in_planes = planes * block.expansion return nn.Sequential(*layers) def forward(self, x): if 
self.bn: out = F.relu(self.bn1(self.conv1(x))) else: out = F.relu(self.conv1(x)) out = self.layer1(out) if self.last_layer == "avg": out = self.avg2d(out) out = torch.flatten(out, 1) out = self.linear(out) elif self.last_layer == "dense": out = torch.flatten(out, 1) out = F.relu(self.linear1(out)) out = self.linear2(out) return out class ResNet9(nn.Module): def __init__(self, block, num_blocks=2, num_classes=10, in_planes=64, bn=True, last_layer="avg"): super(ResNet9, self).__init__() self.in_planes = in_planes self.bn = bn self.last_layer = last_layer self.conv1 = nn.Conv2d(3, in_planes, kernel_size=3, stride=2, padding=1, bias=not self.bn) if self.bn: self.bn1 = nn.BatchNorm2d(in_planes) self.layer1 = self._make_layer(block, in_planes*2, num_blocks, stride=2, bn=bn, kernel=3) self.layer2 = self._make_layer(block, in_planes*2, num_blocks, stride=2, bn=bn, kernel=3) if self.last_layer == "avg": self.avg2d = nn.AvgPool2d(4) self.linear = nn.Linear(in_planes * 2 * block.expansion, num_classes) elif self.last_layer == "dense": self.linear1 = nn.Linear(in_planes * 2 * block.expansion * 16, 100) self.linear2 = nn.Linear(100, num_classes) else: exit("last_layer type not supported!") def _make_layer(self, block, planes, num_blocks, stride, bn, kernel): strides = [stride] + [1]*(num_blocks-1) layers = [] for stride in strides: layers.append(block(self.in_planes, planes, stride, bn, kernel)) self.in_planes = planes * block.expansion return nn.Sequential(*layers) def forward(self, x): if self.bn: out = F.relu(self.bn1(self.conv1(x))) else: out = F.relu(self.conv1(x)) out = self.layer1(out) out = self.layer2(out) if self.last_layer == "avg": out = self.avg2d(out) out = torch.flatten(out, 1) out = self.linear(out) elif self.last_layer == "dense": out = torch.flatten(out, 1) out = F.relu(self.linear1(out)) out = self.linear2(out) return out def resnet2b(): return ResNet5(BasicBlock, num_blocks=2, in_planes=8, bn=False, last_layer="dense") def resnet4b(): return ResNet9(BasicBlock, num_blocks=2, in_planes=16, bn=False, last_layer="dense") if __name__ == '__main__': print('ResNet-2B:\n', resnet2b()) print('ResNet-4B:\n', resnet4b()) ================================================ FILE: examples/vision/models/wide_resnet_cifar.py ================================================ import torch import torch.nn as nn import torch.nn.init as init import torch.nn.functional as F from torch.autograd import Variable import sys import numpy as np def conv3x3(in_planes, out_planes, stride=1): return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=True) def conv_init(m): classname = m.__class__.__name__ if classname.find('Conv') != -1: init.xavier_uniform_(m.weight, gain=np.sqrt(2)) init.constant_(m.bias, 0) elif classname.find('BatchNorm') != -1: init.constant_(m.weight, 1) init.constant_(m.bias, 0) class wide_basic(nn.Module): def __init__(self, in_planes, planes, dropout_rate, stride=1, use_bn=False): super(wide_basic, self).__init__() self.use_bn = use_bn self.dropout_rate = dropout_rate if use_bn: self.bn1 = nn.BatchNorm2d(in_planes) self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, bias=True) if dropout_rate: self.dropout = nn.Dropout(p=dropout_rate) # self.bn2 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=True) self.shortcut = nn.Sequential() if stride != 1 or in_planes != planes: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=True), ) def 
forward(self, x): # out = self.dropout(self.conv1(F.relu(self.bn1(x)))) if self.use_bn: out = self.conv1(F.relu(self.bn1(x))) else: out = self.conv1(F.relu(x)) if self.dropout_rate: out = self.dropout(out) # out = self.conv2(F.relu(self.bn2(out))) out = self.conv2(F.relu(out)) out += self.shortcut(x) return out class Wide_ResNet(nn.Module): def __init__(self, depth, widen_factor, dropout_rate, num_classes, use_bn=False, use_pooling=True): super(Wide_ResNet, self).__init__() self.in_planes = 16 self.use_bn = use_bn self.use_pooling = use_pooling assert ((depth-4)%6 ==0), 'Wide-resnet depth should be 6n+4' n = (depth-4)/6 k = widen_factor print('| Wide-Resnet %dx%d' %(depth, k)) nStages = [self.in_planes, self.in_planes*2*k, self.in_planes*4*k, self.in_planes*8*k] self.conv1 = conv3x3(3,nStages[0]) self.layer1 = self._wide_layer(wide_basic, nStages[1], n, dropout_rate, stride=1) self.layer2 = self._wide_layer(wide_basic, nStages[2], n, dropout_rate, stride=2) self.layer3 = self._wide_layer(wide_basic, nStages[3], n, dropout_rate, stride=2) # self.bn1 = nn.BatchNorm2d(nStages[3], momentum=0.1) if self.use_pooling: self.linear1 = nn.Linear(nStages[3], 512) else: self.linear1 = nn.Linear(nStages[3]*64, 512) self.linear2 = nn.Linear(512, num_classes) def _wide_layer(self, block, planes, num_blocks, dropout_rate, stride): strides = [stride] + [1]*(int(num_blocks)-1) layers = [] for stride in strides: layers.append(block(self.in_planes, planes, dropout_rate, stride, self.use_bn)) self.in_planes = planes return nn.Sequential(*layers) def forward(self, x): out = self.conv1(x) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) out = F.relu(out) if self.use_pooling: out = F.avg_pool2d(out, 8) out = torch.flatten(out, 1) out = F.relu(self.linear1(out)) out = self.linear2(out) return out def wide_resnet_cifar(in_ch=3, in_dim=32): return Wide_ResNet(16, 4, 0.3, 10) def wide_resnet_cifar_bn(in_ch=3, in_dim=32): return Wide_ResNet(10, 4, None, 10, use_bn=True) def wide_resnet_cifar_bn_wo_pooling(in_ch=3, in_dim=32): # 1113M, 21M return Wide_ResNet(10, 4, None, 10, use_bn=True, use_pooling=False) def wide_resnet_cifar_bn_wo_pooling_dropout(in_ch=3, in_dim=32): # 1113M, 21M return Wide_ResNet(10, 4, 0.3, 10, use_bn=True, use_pooling=False) if __name__ == '__main__': from thop import profile net = wide_resnet_cifar_bn_wo_pooling_dropout() print(net) y = net(torch.randn(1,3,32,32)) macs, params = profile(net, (torch.randn(1, 3, 32, 32),)) print(macs/1000000, params/1000000) # 1096M, 5M print(y.size()) ================================================ FILE: examples/vision/models/wide_resnet_imagenet64.py ================================================ import torch import torch.nn as nn import torch.nn.init as init import torch.nn.functional as F import sys import numpy as np def conv3x3(in_planes, out_planes, stride=1): return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=True) def conv_init(m): classname = m.__class__.__name__ if classname.find('Conv') != -1: init.xavier_uniform_(m.weight, gain=np.sqrt(2)) init.constant_(m.bias, 0) elif classname.find('BatchNorm') != -1: init.constant_(m.weight, 1) init.constant_(m.bias, 0) class wide_basic(nn.Module): def __init__(self, in_planes, planes, dropout_rate, stride=1): super(wide_basic, self).__init__() self.bn1 = nn.BatchNorm2d(in_planes) self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, bias=True) # self.dropout = nn.Dropout(p=dropout_rate) self.bn2 = nn.BatchNorm2d(planes) self.conv2 = 
nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=True) self.shortcut = nn.Sequential() if stride != 1 or in_planes != planes: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=True), ) def forward(self, x): # out = self.dropout(self.conv1(F.relu(self.bn1(x)))) out = self.conv1(F.relu(self.bn1(x))) out = self.conv2(F.relu(self.bn2(out))) out += self.shortcut(x) return out class Wide_ResNet(nn.Module): def __init__(self, depth, widen_factor, dropout_rate, num_classes, in_planes=16, in_dim=56): super(Wide_ResNet, self).__init__() self.in_planes = in_planes assert ((depth-4)%6 ==0), 'Wide-resnet depth should be 6n+4' n = (depth-4)/6 k = widen_factor print('| Wide-Resnet %dx%d' %(depth, k)) nStages = [in_planes, in_planes*k, in_planes*2*k, in_planes*4*k] self.conv1 = conv3x3(3,nStages[0]) self.layer1 = self._wide_layer(wide_basic, nStages[1], n, dropout_rate, stride=1) self.layer2 = self._wide_layer(wide_basic, nStages[2], n, dropout_rate, stride=2) self.layer3 = self._wide_layer(wide_basic, nStages[3], n, dropout_rate, stride=2) self.bn1 = nn.BatchNorm2d(nStages[3], momentum=0.1) self.linear = nn.Linear(nStages[3] * (in_dim//4//7)**2, num_classes) def _wide_layer(self, block, planes, num_blocks, dropout_rate, stride): strides = [stride] + [1]*(int(num_blocks)-1) layers = [] for stride in strides: layers.append(block(self.in_planes, planes, dropout_rate, stride)) self.in_planes = planes return nn.Sequential(*layers) def forward(self, x): out = self.conv1(x) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) out = F.relu(self.bn1(out)) out = F.avg_pool2d(out, 7) out = torch.flatten(out, 1) out = self.linear(out) return out def wide_resnet_imagenet64(in_ch=3, in_dim=56, in_planes=16, widen_factor=10): return Wide_ResNet(10, widen_factor, 0.3, 200, in_dim=in_dim, in_planes=in_planes) def wide_resnet_imagenet64_1000class(in_ch=3, in_dim=56, in_planes=16, widen_factor=10): return Wide_ResNet(10, widen_factor, 0.3, 1000, in_dim=in_dim, in_planes=in_planes) if __name__ == '__main__': from thop import profile net = wide_resnet_imagenet64_1000class() print(net) y = net(torch.randn(1,3,56,56)) macs, params = profile(net, (torch.randn(1, 3, 56, 56),)) print(macs, params) # 5229M, 8M print(y.size()) ================================================ FILE: examples/vision/save_intermediate_bound.py ================================================ """ A simple example for saving intermediate bounds. """ import os import torch import torch.nn as nn import torchvision from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import PerturbationLpNorm from auto_LiRPA.utils import Flatten def mnist_model(): model = nn.Sequential( nn.Conv2d(1, 16, 4, stride=2, padding=1), nn.ReLU(), nn.Conv2d(16, 32, 4, stride=2, padding=1), nn.ReLU(), Flatten(), nn.Linear(32*7*7,100), nn.ReLU(), nn.Linear(100, 10) ) return model model = mnist_model() # Optionally, load the pretrained weights. 
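# (The example also runs without this checkpoint: with randomly initialized weights the
# graph is still traced and the intermediate bounds below are still computed; they are
# just not meaningful for a trained classifier.)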
checkpoint = torch.load(
    os.path.join(os.path.dirname(__file__), 'pretrained/mnist_a_adv.pth'),
    map_location=torch.device('cpu'))
model.load_state_dict(checkpoint)

test_data = torchvision.datasets.MNIST(
    './data', train=False, download=True,
    transform=torchvision.transforms.ToTensor())

# For illustration we only use 2 images from the dataset
N = 2
n_classes = 10
image = test_data.data[:N].view(N, 1, 28, 28)
true_label = test_data.targets[:N]
# Convert to float
image = image.to(torch.float32) / 255.0
if torch.cuda.is_available():
    image = image.cuda()
    model = model.cuda()

lirpa_model = BoundedModule(model, torch.empty_like(image), device=image.device)
print('Running on', image.device)

eps = 0.3
norm = float("inf")
ptb = PerturbationLpNorm(norm=norm, eps=eps)
image = BoundedTensor(image, ptb)

lirpa_model.set_bound_opts({'optimize_bound_args': {'iteration': 20, 'lr_alpha': 0.1}})
lb, ub = lirpa_model.compute_bounds(x=(image,), method='CROWN-Optimized')

# Intermediate layer bounds are returned as a dictionary, and if an argument is given,
# a PyTorch checkpoint will also be saved to disk.
save_dict = lirpa_model.save_intermediate('./mnist_a_adv_bounds.pt')
# To avoid saving the file and get just the bounds, call without any arguments:
# save_dict = lirpa_model.save_intermediate()


================================================
FILE: examples/vision/simple_training.py
================================================
"""
A simple script to train certified defense using the auto_LiRPA library.
We compute output bounds under input perturbations using auto_LiRPA, and use them to
form a "robust loss" for certified defense. Several different bound options are
supported, such as IBP, CROWN, and CROWN-IBP. This is a basic example on MNIST and
CIFAR-10 datasets with Lp (p>=0) norm perturbation.
For faster training, please see our examples with loss fusion such as cifar_training.py and tinyimagenet_training.py """ import time import random import multiprocessing import argparse import torch.optim as optim from torch.nn import CrossEntropyLoss from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * from auto_LiRPA.utils import MultiAverageMeter from auto_LiRPA.eps_scheduler import LinearScheduler, AdaptiveScheduler, SmoothedScheduler, FixedScheduler import models import torchvision.datasets as datasets import torchvision.transforms as transforms parser = argparse.ArgumentParser() parser.add_argument("--verify", action="store_true", help='verification mode, do not train') parser.add_argument("--load", type=str, default="", help='Load pretrained model') parser.add_argument("--device", type=str, default="cuda", choices=["cpu", "cuda"], help='use cpu or cuda') parser.add_argument("--data", type=str, default="MNIST", choices=["MNIST", "CIFAR"], help='dataset') parser.add_argument("--seed", type=int, default=100, help='random seed') parser.add_argument("--eps", type=float, default=0.3, help='Target training epsilon') parser.add_argument("--norm", type=float, default='inf', help='p norm for epsilon perturbation') parser.add_argument("--bound_type", type=str, default="CROWN-IBP", choices=["IBP", "CROWN-IBP", "CROWN", "CROWN-FAST"], help='method of bound analysis') parser.add_argument("--model", type=str, default="resnet", help='model name (mlp_3layer, cnn_4layer, cnn_6layer, cnn_7layer, resnet)') parser.add_argument("--num_epochs", type=int, default=100, help='number of total epochs') parser.add_argument("--batch_size", type=int, default=256, help='batch size') parser.add_argument("--lr", type=float, default=5e-4, help='learning rate') parser.add_argument("--scheduler_name", type=str, default="SmoothedScheduler", choices=["LinearScheduler", "AdaptiveScheduler", "SmoothedScheduler", "FixedScheduler"], help='epsilon scheduler') parser.add_argument("--scheduler_opts", type=str, default="start=3,length=60", help='options for epsilon scheduler') parser.add_argument("--bound_opts", type=str, default=None, choices=["same-slope", "zero-lb", "one-lb"], help='bound options') parser.add_argument("--conv_mode", type=str, choices=["matrix", "patches"], default="patches") parser.add_argument("--save_model", type=str, default='') args = parser.parse_args() def Train(model, t, loader, eps_scheduler, norm, train, opt, bound_type, method='robust'): num_class = 10 meter = MultiAverageMeter() if train: model.train() eps_scheduler.train() eps_scheduler.step_epoch() eps_scheduler.set_epoch_length(int((len(loader.dataset) + loader.batch_size - 1) / loader.batch_size)) else: model.eval() eps_scheduler.eval() for i, (data, labels) in enumerate(loader): start = time.time() eps_scheduler.step_batch() eps = eps_scheduler.get_eps() # For small eps just use natural training, no need to compute LiRPA bounds batch_method = method if eps < 1e-20: batch_method = "natural" if train: opt.zero_grad() # generate specifications c = torch.eye(num_class).type_as(data)[labels].unsqueeze(1) - torch.eye(num_class).type_as(data).unsqueeze(0) # remove specifications to self I = (~(labels.data.unsqueeze(1) == torch.arange(num_class).type_as(labels.data).unsqueeze(0))) c = (c[I].view(data.size(0), num_class - 1, num_class)) # bound input for Linf norm used only if norm == np.inf: data_max = torch.reshape((1. - loader.mean) / loader.std, (1, -1, 1, 1)) data_min = torch.reshape((0. 
- loader.mean) / loader.std, (1, -1, 1, 1)) data_ub = torch.min(data + (eps / loader.std).view(1,-1,1,1), data_max) data_lb = torch.max(data - (eps / loader.std).view(1,-1,1,1), data_min) else: data_ub = data_lb = data if list(model.parameters())[0].is_cuda: data, labels, c = data.cuda(), labels.cuda(), c.cuda() data_lb, data_ub = data_lb.cuda(), data_ub.cuda() # Specify Lp norm perturbation. # When using Linf perturbation, we manually set element-wise bound x_L and x_U. eps is not used for Linf norm. if norm > 0: ptb = PerturbationLpNorm(norm=norm, eps=eps, x_L=data_lb, x_U=data_ub) elif norm == 0: ptb = PerturbationL0Norm(eps = eps_scheduler.get_max_eps(), ratio = eps_scheduler.get_eps()/eps_scheduler.get_max_eps()) x = BoundedTensor(data, ptb) output = model(x) regular_ce = CrossEntropyLoss()(output, labels) # regular CrossEntropyLoss used for warming up meter.update('CE', regular_ce.item(), x.size(0)) meter.update('Err', torch.sum(torch.argmax(output, dim=1) != labels).cpu().detach().numpy() / x.size(0), x.size(0)) if batch_method == "robust": if bound_type == "IBP": lb, ub = model.compute_bounds(IBP=True, C=c, method=None) elif bound_type == "CROWN": lb, ub = model.compute_bounds(IBP=False, C=c, method="backward", bound_upper=False) elif bound_type == "CROWN-IBP": # lb, ub = model.compute_bounds(ptb=ptb, IBP=True, x=data, C=c, method="backward") # pure IBP bound # we use a mixed IBP and CROWN-IBP bounds, leading to better performance (Zhang et al., ICLR 2020) factor = (eps_scheduler.get_max_eps() - eps) / eps_scheduler.get_max_eps() ilb, iub = model.compute_bounds(IBP=True, C=c, method=None) if factor < 1e-5: lb = ilb else: clb, cub = model.compute_bounds(IBP=False, C=c, method="backward", bound_upper=False) lb = clb * factor + ilb * (1 - factor) elif bound_type == "CROWN-FAST": # Similar to CROWN-IBP but no mix between IBP and CROWN bounds. lb, ub = model.compute_bounds(IBP=True, C=c, method=None) lb, ub = model.compute_bounds(IBP=False, C=c, method="backward", bound_upper=False) # Pad zero at the beginning for each example, and use fake label "0" for all examples lb_padded = torch.cat((torch.zeros(size=(lb.size(0),1), dtype=lb.dtype, device=lb.device), lb), dim=1) fake_labels = torch.zeros(size=(lb.size(0),), dtype=torch.int64, device=lb.device) robust_ce = CrossEntropyLoss()(-lb_padded, fake_labels) if batch_method == "robust": loss = robust_ce elif batch_method == "natural": loss = regular_ce if train: loss.backward() eps_scheduler.update_loss(loss.item() - regular_ce.item()) opt.step() meter.update('Loss', loss.item(), data.size(0)) if batch_method != "natural": meter.update('Robust_CE', robust_ce.item(), data.size(0)) # For an example, if lower bounds of margins is >0 for all classes, the output is verifiably correct. 
# If any margin is < 0 this example is counted as an error meter.update('Verified_Err', torch.sum((lb < 0).any(dim=1)).item() / data.size(0), data.size(0)) meter.update('Time', time.time() - start) if i % 50 == 0 and train: print('[{:2d}:{:4d}]: eps={:.8f} {}'.format(t, i, eps, meter)) print('[{:2d}:{:4d}]: eps={:.8f} {}'.format(t, i, eps, meter)) def main(args): torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) random.seed(args.seed) np.random.seed(args.seed) ## Step 1: Initial original model as usual, see model details in models/example_feedforward.py and models/example_resnet.py if args.data == 'MNIST': model_ori = models.Models[args.model](in_ch=1, in_dim=28) else: model_ori = models.Models[args.model](in_ch=3, in_dim=32) if args.load: state_dict = torch.load(args.load)['state_dict'] model_ori.load_state_dict(state_dict) ## Step 2: Prepare dataset as usual if args.data == 'MNIST': dummy_input = torch.randn(2, 1, 28, 28) train_data = datasets.MNIST("./data", train=True, download=True, transform=transforms.ToTensor()) test_data = datasets.MNIST("./data", train=False, download=True, transform=transforms.ToTensor()) elif args.data == 'CIFAR': dummy_input = torch.randn(2, 3, 32, 32) normalize = transforms.Normalize(mean = [0.4914, 0.4822, 0.4465], std = [0.2023, 0.1994, 0.2010]) train_data = datasets.CIFAR10("./data", train=True, download=True, transform=transforms.Compose([ transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, 4), transforms.ToTensor(), normalize])) test_data = datasets.CIFAR10("./data", train=False, download=True, transform=transforms.Compose([transforms.ToTensor(), normalize])) train_data = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=min(multiprocessing.cpu_count(),4)) test_data = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size, pin_memory=True, num_workers=min(multiprocessing.cpu_count(),4)) if args.data == 'MNIST': train_data.mean = test_data.mean = torch.tensor([0.0]) train_data.std = test_data.std = torch.tensor([1.0]) elif args.data == 'CIFAR': train_data.mean = test_data.mean = torch.tensor([0.4914, 0.4822, 0.4465]) train_data.std = test_data.std = torch.tensor([0.2023, 0.1994, 0.2010]) ## Step 3: wrap model with auto_LiRPA # The second parameter dummy_input is for constructing the trace of the computational graph. 
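# Its values are not important: the dummy input is passed through the network once so that
# auto_LiRPA can record the graph, and the actual bounds are later computed from the real
# BoundedTensor inputs built in Train() (see simple_verification.py, which uses
# torch.empty_like(image) for the same purpose).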
model = BoundedModule(model_ori, dummy_input, bound_opts={'activation_bound_option':args.bound_opts, 'conv_mode': args.conv_mode}, device=args.device) ## Step 4 prepare optimizer, epsilon scheduler and learning rate scheduler opt = optim.Adam(model.parameters(), lr=args.lr) norm = float(args.norm) lr_scheduler = optim.lr_scheduler.StepLR(opt, step_size=10, gamma=0.5) eps_scheduler = eval(args.scheduler_name)(args.eps, args.scheduler_opts) print("Model structure: \n", str(model_ori)) ## Step 5: start training if args.verify: eps_scheduler = FixedScheduler(args.eps) with torch.no_grad(): Train(model, 1, test_data, eps_scheduler, norm, False, None, args.bound_type) else: timer = 0.0 for t in range(1, args.num_epochs+1): if eps_scheduler.reached_max_eps(): # Only decay learning rate after reaching the maximum eps lr_scheduler.step() print("Epoch {}, learning rate {}".format(t, lr_scheduler.get_lr())) start_time = time.time() Train(model, t, train_data, eps_scheduler, norm, True, opt, args.bound_type) epoch_time = time.time() - start_time timer += epoch_time print('Epoch time: {:.4f}, Total time: {:.4f}'.format(epoch_time, timer)) print("Evaluating...") with torch.no_grad(): Train(model, t, test_data, eps_scheduler, norm, False, None, args.bound_type) torch.save({'state_dict': model_ori.state_dict(), 'epoch': t}, args.save_model if args.save_model != "" else args.model) if __name__ == "__main__": main(args) ================================================ FILE: examples/vision/simple_verification.py ================================================ """ A simple example for bounding neural network outputs under input perturbations. This example serves as a skeleton for robustness verification of neural networks. """ import os from collections import defaultdict import torch import torch.nn as nn import torchvision from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import PerturbationLpNorm from auto_LiRPA.utils import Flatten ## Step 1: Define computational graph by implementing forward() # This simple model comes from https://github.com/locuslab/convex_adversarial def mnist_model(): model = nn.Sequential( nn.Conv2d(1, 16, 4, stride=2, padding=1), nn.ReLU(), nn.Conv2d(16, 32, 4, stride=2, padding=1), nn.ReLU(), Flatten(), nn.Linear(32*7*7,100), nn.ReLU(), nn.Linear(100, 10) ) return model model = mnist_model() # Optionally, load the pretrained weights. checkpoint = torch.load( os.path.join(os.path.dirname(__file__), 'pretrained/mnist_a_adv.pth'), map_location=torch.device('cpu')) model.load_state_dict(checkpoint) ## Step 2: Prepare dataset as usual test_data = torchvision.datasets.MNIST( './data', train=False, download=True, transform=torchvision.transforms.ToTensor()) # For illustration we only use 2 image from dataset N = 2 n_classes = 10 image = test_data.data[:N].view(N,1,28,28) true_label = test_data.targets[:N] # Convert to float image = image.to(torch.float32) / 255.0 if torch.cuda.is_available(): image = image.cuda() model = model.cuda() ## Step 3: wrap model with auto_LiRPA # The second parameter is for constructing the trace of the computational graph, # and its content is not important. 
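# The remainder of this script runs three demonstrations on the wrapped model:
#   1. computing output bounds with IBP, CROWN-IBP, CROWN and alpha-CROWN and comparing them;
#   2. extracting the linear coefficients (A matrices and bias terms) of the CROWN bounds and
#      concretizing them by hand to reproduce lb and ub;
#   3. bounding class margins directly by passing a specification matrix C to compute_bounds().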
lirpa_model = BoundedModule(model, torch.empty_like(image), device=image.device) print('Running on', image.device) # Visualize the lirpa_model # Visualization file is saved as "bounded_mnist_model.png" or "bounded_mnist_model.dot" lirpa_model.visualize("bounded_mnist_model") print() ## Step 4: Compute bounds using LiRPA given a perturbation eps = 0.3 norm = float("inf") ptb = PerturbationLpNorm(norm = norm, eps = eps) image = BoundedTensor(image, ptb) # Get model prediction as usual pred = lirpa_model(image) label = torch.argmax(pred, dim=1).cpu().detach().numpy() print('Demonstration 1: Bound computation and comparisons of different methods.\n') ## Step 5: Compute bounds for final output for method in [ 'IBP', 'IBP+backward (CROWN-IBP)', 'backward (CROWN)', 'CROWN-Optimized (alpha-CROWN)']: print('Bounding method:', method) if 'Optimized' in method: # For optimized bound, you can change the number of iterations, learning rate, etc here. Also you can increase verbosity to see per-iteration loss values. lirpa_model.set_bound_opts({'optimize_bound_args': {'iteration': 20, 'lr_alpha': 0.1}}) lb, ub = lirpa_model.compute_bounds(x=(image,), method=method.split()[0]) for i in range(N): print(f'Image {i} top-1 prediction {label[i]} ground-truth {true_label[i]}') for j in range(n_classes): indicator = '(ground-truth)' if j == true_label[i] else '' print('f_{j}(x_0): {l:8.3f} <= f_{j}(x_0+delta) <= {u:8.3f} {ind}'.format( j=j, l=lb[i][j].item(), u=ub[i][j].item(), ind=indicator)) print() print('Demonstration 2: Obtaining linear coefficients of the lower and upper bounds.\n') # There are many bound coefficients during CROWN bound calculation; here we are interested in the linear bounds # of the output layer, with respect to the input layer (the image). required_A = defaultdict(set) required_A[lirpa_model.output_name[0]].add(lirpa_model.input_name[0]) # Helper functions to concretize the linear bounds def concretize_bound(A, bias, xL, xU, upper: bool): """ Concretize linear bound. If upper is True: use A_pos * xU + A_neg * xL + bias If upper is False: use A_pos * xL + A_neg * xU + bias """ A_pos = torch.clamp(A, min=0.0) A_neg = torch.clamp(A, max=0.0) if upper: return ( torch.einsum("boijk,boijk->bo", A_pos, xU) + torch.einsum("boijk,boijk->bo", A_neg, xL) + bias ) else: return ( torch.einsum("boijk,boijk->bo", A_pos, xL) + torch.einsum("boijk,boijk->bo", A_neg, xU) + bias ) # Prepare input bounds x_L = (image - eps).unsqueeze(1) x_U = (image + eps).unsqueeze(1) for method in [ 'IBP+backward (CROWN-IBP)', 'backward (CROWN)', 'CROWN', 'CROWN-Optimized (alpha-CROWN)']: print("Bounding method:", method) if 'Optimized' in method: # For optimized bound, you can change the number of iterations, learning rate, etc here. Also you can increase verbosity to see per-iteration loss values. 
lirpa_model.set_bound_opts({'optimize_bound_args': {'iteration': 30, 'lr_alpha': 0.1}}) lb, ub, A_dict = lirpa_model.compute_bounds(x=(image,), method=method.split()[0], return_A=True, needed_A_dict=required_A) lower_A, lower_bias = A_dict[lirpa_model.output_name[0]][lirpa_model.input_name[0]]['lA'], A_dict[lirpa_model.output_name[0]][lirpa_model.input_name[0]]['lbias'] upper_A, upper_bias = A_dict[lirpa_model.output_name[0]][lirpa_model.input_name[0]]['uA'], A_dict[lirpa_model.output_name[0]][lirpa_model.input_name[0]]['ubias'] print(f'lower bound linear coefficients size (batch, output_dim, *input_dims): {list(lower_A.size())}') print(f'lower bound linear coefficients norm (smaller is better): {lower_A.norm()}') print(f'lower bound bias term size (batch, output_dim): {list(lower_bias.size())}') print(f'lower bound bias term sum (larger is better): {lower_bias.sum()}') print(f'upper bound linear coefficients size (batch, output_dim, *input_dims): {list(upper_A.size())}') print(f'upper bound linear coefficients norm (smaller is better): {upper_A.norm()}') print(f'upper bound bias term size (batch, output_dim): {list(upper_bias.size())}') print(f'upper bound bias term sum (smaller is better): {upper_bias.sum()}') print(f'These linear lower and upper bounds are valid everywhere within the perturbation radii.\n') # Validate the concretization of the linear bounds concretized_lb = concretize_bound(lower_A, lower_bias, x_L, x_U, upper=False) concretized_ub = concretize_bound(upper_A, upper_bias, x_L, x_U, upper=True) assert torch.allclose( concretized_lb, lb, rtol=1e-4, atol=1e-5), "Lower bound mismatch! Error: {}".format((concretized_lb - lb).abs().max()) assert torch.allclose( concretized_ub, ub, rtol=1e-4, atol=1e-5), "Upper bound mismatch! Error: {}".format((concretized_ub - ub).abs().max()) ## An example for computing margin bounds. # In compute_bounds() function you can pass in a specification matrix C, which is a final linear matrix applied to the last layer NN output. # For example, if you are interested in the margin between the groundtruth class and another class, you can use C to specify the margin. # This generally yields tighter bounds. # Here we compute the margin between groundtruth class and groundtruth class + 1. # If you have more than 1 specifications per batch element, you can expand the second dimension of C (it is 1 here for demonstration). lirpa_model = BoundedModule(model, torch.empty_like(image), device=image.device) C = torch.zeros(size=(N, 1, n_classes), device=image.device) groundtruth = true_label.to(device=image.device).unsqueeze(1).unsqueeze(1) target_label = (groundtruth + 1) % n_classes C.scatter_(dim=2, index=groundtruth, value=1.0) C.scatter_(dim=2, index=target_label, value=-1.0) print('Demonstration 3: Computing bounds with a specification matrix.\n') print('Specification matrix:\n', C) for method in ['IBP', 'IBP+backward (CROWN-IBP)', 'backward (CROWN)', 'CROWN-Optimized (alpha-CROWN)']: print('Bounding method:', method) if 'Optimized' in method: # For optimized bound, you can change the number of iterations, learning rate, etc here. Also you can increase verbosity to see per-iteration loss values. 
lirpa_model.set_bound_opts({'optimize_bound_args': {'iteration': 20, 'lr_alpha': 0.1, }}) lb, ub = lirpa_model.compute_bounds(x=(image,), method=method.split()[0], C=C) for i in range(N): print('Image {} top-1 prediction {} ground-truth {}'.format(i, label[i], true_label[i])) print('margin bounds: {l:8.3f} <= f_{j}(x_0+delta) - f_{target}(x_0+delta) <= {u:8.3f}'.format( j=true_label[i], target=(true_label[i] + 1) % n_classes, l=lb[i][0].item(), u=ub[i][0].item())) print() ================================================ FILE: examples/vision/tinyimagenet_training.py ================================================ import os import random import time import argparse import multiprocessing import logging import torch.optim as optim from torch.nn import CrossEntropyLoss from auto_LiRPA import BoundedModule, BoundedTensor, BoundDataParallel, CrossEntropyWrapper from auto_LiRPA.bound_ops import BoundExp from auto_LiRPA.perturbations import * from auto_LiRPA.utils import MultiAverageMeter, logger, get_spec_matrix, sync_params import models import torchvision.datasets as datasets import torchvision.transforms as transforms from auto_LiRPA.eps_scheduler import * def get_exp_module(bounded_module): for _, node in bounded_module.named_modules(): # Find the Exp neuron in computational graph if isinstance(node, BoundExp): return node return None parser = argparse.ArgumentParser() parser.add_argument("--verify", action="store_true", help='verification mode, do not train') parser.add_argument("--load", type=str, default="", help='Load pretrained model') parser.add_argument("--device", type=str, default="cuda", choices=["cpu", "cuda"], help='use cpu or cuda') parser.add_argument("--data_dir", type=str, default="data/tinyImageNet/tiny-imagenet-200", help='dir of dataset') parser.add_argument("--seed", type=int, default=100, help='random seed') parser.add_argument("--eps", type=float, default=1. 
/ 255, help='Target training epsilon') parser.add_argument("--norm", type=float, default='inf', help='p norm for epsilon perturbation') parser.add_argument("--bound_type", type=str, default="CROWN-IBP", choices=["IBP", "CROWN-IBP", "CROWN"], help='method of bound analysis') parser.add_argument("--model", type=str, default="wide_resnet_imagenet64", help='model name (cnn_7layer_bn_imagenet, ResNeXt_imagenet64, ResNeXt_imagenet64)') parser.add_argument("--num_epochs", type=int, default=600, help='number of total epochs') parser.add_argument("--batch_size", type=int, default=128, help='batch size') parser.add_argument("--lr", type=float, default=5e-4, help='learning rate') parser.add_argument("--lr_decay_milestones", nargs='+', type=int, default=[600, 700], help='learning rate dacay milestones') parser.add_argument("--scheduler_name", type=str, default="SmoothedScheduler", choices=["LinearScheduler", "AdaptiveScheduler", "SmoothedScheduler"], help='epsilon scheduler') parser.add_argument("--scheduler_opts", type=str, default="start=100,length=400,mid=0.4", help='options for epsilon scheduler') parser.add_argument("--bound_opts", type=str, default=None, choices=["same-slope", "zero-lb", "one-lb"], help='bound options') parser.add_argument('--clip_grad_norm', type=float, default=8.0) parser.add_argument('--in_planes', type=int, default=16) parser.add_argument('--widen_factor', type=int, default=10) args = parser.parse_args() exp_name = args.model + '_b' + str(args.batch_size) + '_' + str(args.bound_type) + '_epoch' + str( args.num_epochs) + '_' + args.scheduler_opts + '_ImageNet_' + str(args.eps)[:6] os.makedirs('saved_models/', exist_ok=True) log_file = f'saved_models/{exp_name}{"_test" if args.verify else ""}.log' file_handler = logging.FileHandler(log_file) logger.addHandler(file_handler) def Train(model, t, loader, eps_scheduler, norm, train, opt, bound_type, method='robust', loss_fusion=True, final_node_name=None): num_class = 200 meter = MultiAverageMeter() if train: model.train() eps_scheduler.train() eps_scheduler.step_epoch() eps_scheduler.set_epoch_length(int((len(loader.dataset) + loader.batch_size - 1) / loader.batch_size)) else: model.eval() eps_scheduler.eval() exp_module = get_exp_module(model) def get_bound_loss(x=None, c=None): if loss_fusion: bound_lower, bound_upper = False, True else: bound_lower, bound_upper = True, False if bound_type == 'IBP': lb, ub = model(method_opt="compute_bounds", x=x, IBP=True, C=c, method=None, final_node_name=final_node_name, no_replicas=True) elif bound_type == 'CROWN': lb, ub = model(method_opt="compute_bounds", x=x, IBP=False, C=c, method='backward', bound_lower=bound_lower, bound_upper=bound_upper) elif bound_type == 'CROWN-IBP': # lb, ub = model.compute_bounds(ptb=ptb, IBP=True, x=data, C=c, method='backward') # pure IBP bound # we use a mixed IBP and CROWN-IBP bounds, leading to better performance (Zhang et al., ICLR 2020) factor = (eps_scheduler.get_max_eps() - eps_scheduler.get_eps()) / eps_scheduler.get_max_eps() ilb, iub = model(method_opt="compute_bounds", x=x, IBP=True, C=c, method=None, final_node_name=final_node_name, no_replicas=True) if factor < 1e-50: lb, ub = ilb, iub else: clb, cub = model(method_opt="compute_bounds", IBP=False, C=c, method='backward', bound_lower=bound_lower, bound_upper=bound_upper, final_node_name=final_node_name, no_replicas=True) if loss_fusion: ub = cub * factor + iub * (1 - factor) else: lb = clb * factor + ilb * (1 - factor) if loss_fusion: if isinstance(model, BoundDataParallel): max_input = 
model(get_property=True, node_class=BoundExp, att_name='max_input') else: max_input = exp_module.max_input return None, torch.mean(torch.log(ub) + max_input) else: # Pad zero at the beginning for each example, and use fake label '0' for all examples lb_padded = torch.cat((torch.zeros(size=(lb.size(0), 1), dtype=lb.dtype, device=lb.device), lb), dim=1) fake_labels = torch.zeros(size=(lb.size(0),), dtype=torch.int64, device=lb.device) robust_ce = CrossEntropyLoss()(-lb_padded, fake_labels) return lb, robust_ce for i, (data, labels) in enumerate(loader): start = time.time() eps_scheduler.step_batch() eps = eps_scheduler.get_eps() # For small eps just use natural training, no need to compute LiRPA bounds batch_method = method if eps < 1e-50: batch_method = "natural" if train: opt.zero_grad() # bound input for Linf norm used only if norm == np.inf: data_max = torch.reshape((1. - loader.mean) / loader.std, (1, -1, 1, 1)) data_min = torch.reshape((0. - loader.mean) / loader.std, (1, -1, 1, 1)) data_ub = torch.min(data + (eps / loader.std).view(1, -1, 1, 1), data_max) data_lb = torch.max(data - (eps / loader.std).view(1, -1, 1, 1), data_min) else: data_ub = data_lb = data if list(model.parameters())[0].is_cuda: data, labels = data.cuda(), labels.cuda() data_lb, data_ub = data_lb.cuda(), data_ub.cuda() ptb = PerturbationLpNorm(norm=norm, eps=eps, x_L=data_lb, x_U=data_ub) x = BoundedTensor(data, ptb) if loss_fusion: if batch_method == 'natural' or not train: output = model(x, labels) regular_ce = torch.mean(torch.log(output)) else: model(x, labels) regular_ce = torch.tensor(0., device=data.device) meter.update('CE', regular_ce.item(), x.size(0)) x = (x, labels) c = None else: c = get_spec_matrix(data, labels, num_class) x = (x, labels) output = model(x, final_node_name=final_node_name) regular_ce = CrossEntropyLoss()(output, labels) # regular CrossEntropyLoss used for warming up meter.update('CE', regular_ce.item(), x[0].size(0)) meter.update('Err', torch.sum(torch.argmax(output, dim=1) != labels).item() / x[0].size(0), x[0].size(0)) if batch_method == 'robust': # print(data.sum()) lb, robust_ce = get_bound_loss(x=x, c=c) loss = robust_ce elif batch_method == 'natural': loss = regular_ce if train: loss.backward() if args.clip_grad_norm: grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad_norm) meter.update('grad_norm', grad_norm) if isinstance(eps_scheduler, AdaptiveScheduler): eps_scheduler.update_loss(loss.item() - regular_ce.item()) opt.step() meter.update('Loss', loss.item(), data.size(0)) if batch_method != 'natural': meter.update('Robust_CE', robust_ce.item(), data.size(0)) if not loss_fusion: # For an example, if lower bounds of margins is >0 for all classes, the output is verifiably correct. 
# If any margin is < 0 this example is counted as an error meter.update('Verified_Err', torch.sum((lb < 0).any(dim=1)).item() / data.size(0), data.size(0)) meter.update('Time', time.time() - start) if (i + 1) % 250 == 0 and train: logger.info('[{:2d}:{:4d}]: eps={:.12f} {}'.format(t, i + 1, eps, meter)) logger.info('[{:2d}:{:4d}]: eps={:.12f} {}'.format(t, i + 1, eps, meter)) return meter def main(args): torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) random.seed(args.seed) np.random.seed(args.seed) ## Step 1: Initial original model as usual, see model details in models/example_feedforward.py and models/example_resnet.py model_ori = models.Models[args.model](in_planes=args.in_planes, widen_factor=args.widen_factor) epoch = 0 if args.load: checkpoint = torch.load(args.load) epoch, state_dict, opt_state = checkpoint['epoch'], checkpoint['state_dict'], checkpoint.get('optimizer') for k, v in state_dict.items(): assert torch.isnan(v).any().cpu().numpy() == 0 and torch.isinf(v).any().cpu().numpy() == 0 model_ori.load_state_dict(state_dict) logger.info('Checkpoint loaded: {}'.format(args.load)) ## Step 2: Prepare dataset as usual dummy_input = torch.randn(2, 3, 56, 56) normalize = transforms.Normalize(mean=[0.4802, 0.4481, 0.3975], std=[0.2302, 0.2265, 0.2262]) train_data = datasets.ImageFolder(args.data_dir + '/train', transform=transforms.Compose([ transforms.RandomHorizontalFlip(), transforms.RandomCrop(56, padding_mode='edge'), transforms.ToTensor(), normalize, ])) test_data = datasets.ImageFolder(args.data_dir + '/val', transform=transforms.Compose([ # transforms.RandomResizedCrop(64, scale=(0.875, 0.875), ratio=(1., 1.)), transforms.CenterCrop(56), transforms.ToTensor(), normalize])) train_data = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=min(multiprocessing.cpu_count(), 4)) test_data = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size // 5, pin_memory=True, num_workers=min(multiprocessing.cpu_count(), 4)) train_data.mean = test_data.mean = torch.tensor([0.4802, 0.4481, 0.3975]) train_data.std = test_data.std = torch.tensor([0.2302, 0.2265, 0.2262]) ## Step 3: wrap model with auto_LiRPA # The second parameter dummy_input is for constructing the trace of the computational graph. 
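# Two bounded models are constructed below: `model` bounds the logits and is used for
# evaluation, while `model_loss` wraps the network together with its cross-entropy loss
# (CrossEntropyWrapper) and uses the 'loss_fusion' bound option, so that an upper bound
# on the loss itself is optimized directly during training; this is the faster training
# mode referred to in simple_training.py.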
model = BoundedModule(model_ori, dummy_input, bound_opts={'activation_bound_option':args.bound_opts}, device=args.device) model_loss = BoundedModule(CrossEntropyWrapper(model_ori), (dummy_input, torch.zeros(1, dtype=torch.long)), bound_opts= { 'activation_bound_option': args.bound_opts, 'loss_fusion': True }, device=args.device) model_loss = BoundDataParallel(model_loss) ## Step 4 prepare optimizer, epsilon scheduler and learning rate scheduler opt = optim.Adam(model_loss.parameters(), lr=args.lr) norm = float(args.norm) lr_scheduler = optim.lr_scheduler.MultiStepLR(opt, milestones=args.lr_decay_milestones, gamma=0.1) eps_scheduler = eval(args.scheduler_name)(args.eps, args.scheduler_opts) logger.info(str(model_ori)) if args.load: if opt_state: opt.load_state_dict(opt_state) logger.info('resume opt_state') # skip epochs if epoch > 0: epoch_length = int((len(train_data.dataset) + train_data.batch_size - 1) / train_data.batch_size) eps_scheduler.set_epoch_length(epoch_length) eps_scheduler.train() for i in range(epoch): lr_scheduler.step() eps_scheduler.step_epoch(verbose=True) for j in range(epoch_length): eps_scheduler.step_batch() logger.info('resume from eps={:.12f}'.format(eps_scheduler.get_eps())) ## Step 5: start training if args.verify: eps_scheduler = FixedScheduler(args.eps) with torch.no_grad(): Train(model, 1, test_data, eps_scheduler, norm, False, None, 'IBP', loss_fusion=False, final_node_name=None) else: timer = 0.0 best_err = 1e10 for t in range(epoch + 1, args.num_epochs + 1): logger.info("Epoch {}, learning rate {}".format(t, lr_scheduler.get_last_lr())) start_time = time.time() Train(model_loss, t, train_data, eps_scheduler, norm, True, opt, args.bound_type, loss_fusion=True) lr_scheduler.step() epoch_time = time.time() - start_time timer += epoch_time logger.info('Epoch time: {:.4f}, Total time: {:.4f}'.format(epoch_time, timer)) logger.info("Evaluating...") torch.cuda.empty_cache() state_dict = sync_params(model_ori, model_loss, loss_fusion=True) with torch.no_grad(): if int(eps_scheduler.params['start']) + int(eps_scheduler.params['length']) > t >= int( eps_scheduler.params['start']): m = Train(model_loss, t, test_data, eps_scheduler, norm, False, None, args.bound_type, loss_fusion=True) else: model_ori.load_state_dict(state_dict) model = BoundedModule(model_ori, dummy_input, bound_opts={'activation_bound_option':args.bound_opts}, device=args.device) model = BoundDataParallel(model) m = Train(model, t, test_data, eps_scheduler, norm, False, None, 'IBP', loss_fusion=False) del model save_dict = {'state_dict': state_dict, 'epoch': t, 'optimizer': opt.state_dict()} if t < int(eps_scheduler.params['start']): torch.save(save_dict, 'saved_models/natural_' + exp_name) elif t > int(eps_scheduler.params['start']) + int(eps_scheduler.params['length']): current_err = m.avg('Verified_Err') if current_err < best_err: best_err = current_err torch.save(save_dict, 'saved_models/' + exp_name + '_best_' + str(best_err)[:6]) else: torch.save(save_dict, 'saved_models/' + exp_name) torch.cuda.empty_cache() if __name__ == "__main__": logger.info(args) main(args) ================================================ FILE: examples/vision/verify_two_node.py ================================================ """ Example for multi-node perturbation. An input image is splited to two parts where each part is perturbed respectively constained by L-inf norm. It is expected to output the same results as running `simple_verification.py` where the whole image is perturbed constained by L-inf norm. 
""" import os import torch.nn as nn import torch.nn.functional as F import torchvision from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * ## Step 1: Define computational graph by implementing forward() class cnn_MNIST(nn.Module): def __init__(self): super(cnn_MNIST, self).__init__() self.conv1 = nn.Conv2d(1, 8, 4, stride=2, padding=1) self.conv2 = nn.Conv2d(8, 16, 4, stride=2, padding=1) self.fc1 = nn.Linear(784, 256) self.fc2 = nn.Linear(256, 10) def forward(self, x, y): x = torch.cat([x, y], dim=2) # concat the two parts of input x = F.relu(self.conv1(x)) x = F.relu(self.conv2(x)) x = x.view(-1, 784) x = F.relu(self.fc1(x)) x = self.fc2(x) return x model = cnn_MNIST() # Load the pretrained weights checkpoint = torch.load(os.path.join(os.path.dirname(__file__),"pretrained/mnist_cnn_small.pth"), map_location=torch.device('cpu')) model.load_state_dict(checkpoint) ## Step 2: Prepare dataset as usual test_data = torchvision.datasets.MNIST( "./data", train=False, download=True, transform=torchvision.transforms.ToTensor()) # For illustration we only use 2 image from dataset N = 2 n_classes = 10 image = test_data.data[:N].view(N,1,28,28) # Convert to float image = image.to(torch.float32) / 255.0 if torch.cuda.is_available(): image = image.cuda() model = model.cuda() ## Step 3: wrap model with auto_LiRPA # The second parameter is for constructing the trace of the computational graph, # and its content is not important. image_1, image_2 = torch.split(torch.empty_like(image), [14, 14], dim=2) model = BoundedModule( model, (image_1, image_2), device=image.device, bound_opts={'conv_mode': 'matrix'} # Patches mode is not supported currently ) ## Step 4: Compute bounds using LiRPA given a perturbation eps = 0.3 norm = np.inf ptb = PerturbationLpNorm(norm=norm, eps=eps) image_1, image_2 = torch.split(image, [14, 14], dim=2) image_1 = BoundedTensor(image_1, ptb) image_2 = BoundedTensor(image_2, ptb) # Get model prediction as usual pred = model(image_1, image_2) label = torch.argmax(pred, dim=1).cpu().numpy() # Compute bounds lb, ub = model.compute_bounds() ## Step 5: Final output pred = pred.detach().cpu().numpy() lb = lb.detach().cpu().numpy() ub = ub.detach().cpu().numpy() for i in range(N): print("Image {} top-1 prediction {}".format(i, label[i])) for j in range(n_classes): print("f_{j}(x_0) = {fx0:8.3f}, {l:8.3f} <= f_{j}(x_0+delta) <= {u:8.3f}".format( j=j, fx0=pred[i][j], l=lb[i][j], u=ub[i][j])) print() ================================================ FILE: examples/vision/weight_perturbation_training.py ================================================ """ A simple example for certified robustness against model weight perturbations. Since our framework works on general computational graphs, where both model weights and model inputs are inputs of the computational graph, our perturbation analysis can naturally be applied to the model weights, allowing analysis for certified model robustness under weight perturbations. This file provides a simple example of certified defense for model weight perturbations. See our paper https://arxiv.org/abs/2002.12920 for more details. 
""" import random import time import os import argparse import logging import torch.optim as optim from torch.nn import CrossEntropyLoss from auto_LiRPA import BoundedModule, CrossEntropyWrapper, BoundDataParallel, BoundedParameter from auto_LiRPA.bound_ops import BoundExp from auto_LiRPA.perturbations import * from auto_LiRPA.utils import MultiAverageMeter, logger, get_spec_matrix from datasets import mnist_loaders import torchvision.datasets as datasets import models from auto_LiRPA.eps_scheduler import LinearScheduler, AdaptiveScheduler, SmoothedScheduler, FixedScheduler def get_exp_module(bounded_module): for _, node in bounded_module.named_modules(): # Find the Exp neuron in computational graph if isinstance(node, BoundExp): return node return None parser = argparse.ArgumentParser() parser.add_argument("--verify", action="store_true", help='verification mode, do not train') parser.add_argument("--load", type=str, default="", help='Load pretrained model') parser.add_argument("--device", type=str, default="cuda", choices=["cpu", "cuda"], help='use cpu or cuda') parser.add_argument("--data", type=str, default="MNIST", choices=["MNIST", "FashionMNIST"], help='dataset') parser.add_argument("--ratio", type=float, default=None, help='percent of training used, None means whole training data') parser.add_argument("--seed", type=int, default=100, help='random seed') parser.add_argument("--eps", type=float, default=0.1, help='Target training epsilon for weight perturbations') parser.add_argument("--norm", type=float, default='inf', help='p norm for epsilon perturbation') parser.add_argument("--bound_type", type=str, default="CROWN-IBP", choices=["IBP", "CROWN-IBP", "CROWN"], help='method of bound analysis') parser.add_argument("--opt", type=str, default='ADAM', choices=["ADAM", "SGD"], help='optimizer') parser.add_argument("--num_epochs", type=int, default=150, help='number of total epochs') parser.add_argument("--batch_size", type=int, default=256, help='batch size') parser.add_argument("--lr", type=float, default=0.001, help='learning rate') parser.add_argument("--lr_decay_milestones", nargs='+', type=int, default=[120, 140], help='learning rate dacay milestones') parser.add_argument("--scheduler_name", type=str, default="LinearScheduler", choices=["LinearScheduler", "AdaptiveScheduler", "SmoothedScheduler"], help='epsilon scheduler') parser.add_argument("--scheduler_opts", type=str, default="start=10,length=100", help='options for epsilon scheduler') parser.add_argument("--bound_opts", type=str, default=None, choices=["same-slope", "zero-lb", "one-lb"], help='bound options') parser.add_argument('--clip_grad_norm', type=float, default=8.0) parser.add_argument('--truncate_data', type=int, help='Truncate the training/test batches in unit test') parser.add_argument('--multigpu', action='store_true', help='MultiGPU training') num_class = 10 args = parser.parse_args() exp_name = 'mlp_MNIST'+'_b'+str(args.batch_size)+'_'+str(args.bound_type)+'_epoch'+str(args.num_epochs)+'_'+args.scheduler_opts+'_'+str(args.eps)[:6] log_file = f'{exp_name}{"_test" if args.verify else ""}.log' file_handler = logging.FileHandler(log_file) logger.addHandler(file_handler) ## Training one epoch. 
def Train(model, t, loader, eps_scheduler, norm, train, opt, bound_type, method='robust', loss_fusion=True, final_node_name=None): meter = MultiAverageMeter() if train: model.train() eps_scheduler.train() eps_scheduler.step_epoch(verbose=False) eps_scheduler.set_epoch_length(int((len(loader.dataset) + loader.batch_size - 1) / loader.batch_size)) else: model.eval() eps_scheduler.eval() # Used for loss-fusion. Get the exp operation in computational graph. exp_module = get_exp_module(model) def get_bound_loss(x=None, c=None): if loss_fusion: # When loss fusion is used, we need the upper bound for the final loss function. bound_lower, bound_upper = False, True else: # When loss fusion is not used, we need the lower bound for the logit layer. bound_lower, bound_upper = True, False if bound_type == 'IBP': lb, ub = model(method_opt="compute_bounds", x=x, C=c, method="IBP", final_node_name=final_node_name, no_replicas=True) elif bound_type == 'CROWN': lb, ub = model(method_opt="compute_bounds", x=x, C=c, method="backward", bound_lower=bound_lower, bound_upper=bound_upper) elif bound_type == 'CROWN-IBP': # we use a mixed IBP and CROWN-IBP bounds, leading to better performance (Zhang et al., ICLR 2020) # factor = (eps_scheduler.get_max_eps() - eps_scheduler.get_eps()) / eps_scheduler.get_max_eps() ilb, iub = model(method_opt="compute_bounds", x=x, C=c, method="IBP", final_node_name=final_node_name, no_replicas=True) lb, ub = model(method_opt="compute_bounds", C=c, method="CROWN-IBP", bound_lower=bound_lower, bound_upper=bound_upper, final_node_name=final_node_name, average_A=True, no_replicas=True) if loss_fusion: # When loss fusion is enabled, we need to get the common factor before softmax. if isinstance(model, BoundDataParallel): max_input = model(get_property=True, node_class=BoundExp, att_name='max_input') else: max_input = exp_module.max_input return None, torch.mean(torch.log(ub) + max_input) else: # Pad zero at the beginning for each example, and use fake label '0' for all examples lb_padded = torch.cat((torch.zeros(size=(lb.size(0), 1), dtype=lb.dtype, device=lb.device), lb), dim=1) fake_labels = torch.zeros(size=(lb.size(0),), dtype=torch.int64, device=lb.device) robust_ce = CrossEntropyLoss()(-lb_padded, fake_labels) return lb, robust_ce for i, (data, labels) in enumerate(loader): # For unit test. We only use a small number of batches if args.truncate_data: if i >= args.truncate_data: break start = time.time() eps_scheduler.step_batch() eps = eps_scheduler.get_eps() # For small eps just use natural training, no need to compute LiRPA bounds batch_method = method if eps < 1e-50: batch_method = "natural" if train: opt.zero_grad() if list(model.parameters())[0].is_cuda: data, labels = data.cuda(), labels.cuda() model.ptb.eps = eps x = data if loss_fusion: if batch_method == 'natural' or not train: output = model(x, labels) # , disable_multi_gpu=True regular_ce = torch.mean(torch.log(output)) else: model(x, labels) regular_ce = torch.tensor(0., device=data.device) meter.update('CE', regular_ce.item(), x.size(0)) x = (x, labels) c = None else: # Generate speicification matrix (when loss fusion is not used). 
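# Standalone illustration of the specification matrix and the robust cross-entropy trick used in
# get_bound_loss above (values are made up; it assumes get_spec_matrix builds the usual
# one-vs-rest margin specification, one row per wrong class with +1 on the true label and -1 on
# that class). With such a C, lb contains lower bounds on the margins f_label - f_j; prepending a
# zero column and taking cross-entropy of -lb against the fake label 0 gives a loss that is small
# exactly when all margin lower bounds are large:
import torch
from torch.nn import CrossEntropyLoss
lb_example = torch.tensor([[2.0, 0.5]])            # margin lower bounds for one example
lb_padded_example = torch.cat(
    (torch.zeros(1, 1), lb_example), dim=1)        # [[0.0, 2.0, 0.5]]
robust_ce_example = CrossEntropyLoss()(
    -lb_padded_example, torch.zeros(1, dtype=torch.long))
# robust_ce_example equals log(1 + exp(-2.0) + exp(-0.5)), the verified cross-entropy surrogate
# used when loss fusion is disabled.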
c = get_spec_matrix(data, labels, num_class) x = (x, labels) output = model(x, final_node_name=final_node_name) regular_ce = CrossEntropyLoss()(output, labels) # regular CrossEntropyLoss used for warming up meter.update('CE', regular_ce.item(), x[0].size(0)) meter.update('Err', torch.sum(torch.argmax(output, dim=1) != labels).item() / x[0].size(0), x[0].size(0)) if batch_method == 'robust': lb, robust_ce = get_bound_loss(x=x, c=c) loss = robust_ce elif batch_method == 'natural': loss = regular_ce if train: loss.backward() if args.clip_grad_norm: grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad_norm) meter.update('grad_norm', grad_norm) if isinstance(eps_scheduler, AdaptiveScheduler): eps_scheduler.update_loss(loss.item() - regular_ce.item()) opt.step() meter.update('Loss', loss.item(), data.size(0)) if batch_method != 'natural': meter.update('Robust_CE', robust_ce.item(), data.size(0)) if not loss_fusion: # For an example, if lower bounds of margins is >0 for all classes, the output is verifiably correct. # If any margin is < 0 this example is counted as an error meter.update('Verified_Err', torch.sum((lb < 0).any(dim=1)).item() / data.size(0), data.size(0)) meter.update('Time', time.time() - start) if (i + 1) % 50 == 0 and train: logger.info('[{:2d}:{:4d}]: eps={:.12f} {}'.format(t, i + 1, eps, meter)) logger.info('[{:2d}:{:4d}]: eps={:.12f} {}'.format(t, i + 1, eps, meter)) return meter def main(args): torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) random.seed(args.seed) np.random.seed(args.seed) ## Load the model with BoundedParameter for weight perturbation. model_ori = models.Models['mlp_3layer_weight_perturb']() epoch = 0 ## Load a checkpoint, if requested. if args.load: checkpoint = torch.load(args.load) epoch, state_dict = checkpoint['epoch'], checkpoint['state_dict'] opt_state = None try: opt_state = checkpoint['optimizer'] except KeyError: print('no opt_state found') for k, v in state_dict.items(): assert torch.isnan(v).any().cpu().numpy() == 0 and torch.isinf(v).any().cpu().numpy() == 0 model_ori.load_state_dict(state_dict) logger.info('Checkpoint loaded: {}'.format(args.load)) ## Step 2: Prepare dataset as usual dummy_input = torch.randn(2, 1, 28, 28) train_data, test_data = mnist_loaders(datasets.MNIST, batch_size=args.batch_size, ratio=args.ratio) train_data.mean = test_data.mean = torch.tensor([0.0]) train_data.std = test_data.std = torch.tensor([1.0]) ## Step 3: wrap model with auto_LiRPA # The second parameter dummy_input is for constructing the trace of the computational graph. 
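# Standalone illustration (hypothetical logits) of the identity behind loss fusion, which the
# CrossEntropyWrapper model built below relies on: since CE(z, y) = log(sum_j exp(z_j - z_y)), an
# upper bound on the fused network's sum-of-exponentials output directly upper-bounds the training
# loss. The max_input term added in get_bound_loss above appears to compensate for the numerical
# stability shift applied inside the exponential node.
import torch
import torch.nn.functional as F
z_example = torch.tensor([[1.0, 2.0, 0.5]])
y_example = torch.tensor([1])
ce = F.cross_entropy(z_example, y_example)
fused = torch.log(torch.exp(z_example - z_example[0, y_example]).sum(dim=1))
assert torch.allclose(ce, fused)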
model = BoundedModule(model_ori, dummy_input, device=args.device, bound_opts={ 'activation_bound_option':args.bound_opts, 'sparse_intermediate_bounds': False, 'sparse_conv_intermediate_bounds': False, 'sparse_intermediate_bounds_with_ibp': False}) final_name1 = model.final_name model_loss = BoundedModule(CrossEntropyWrapper(model_ori), (dummy_input, torch.zeros(1, dtype=torch.long)), device=args.device, bound_opts= {'activation_bound_option': args.bound_opts, 'loss_fusion': True, 'sparse_intermediate_bounds': False, 'sparse_conv_intermediate_bounds': False, 'sparse_intermediate_bounds_with_ibp': False}) # after CrossEntropyWrapper, the final name will change because of one more input node in CrossEntropyWrapper final_name2 = model_loss._modules[final_name1].output_name[0] assert type(model._modules[final_name1]) == type(model_loss._modules[final_name2]) if args.multigpu: model_loss = BoundDataParallel(model_loss) model_loss.ptb = model.ptb = model_ori.ptb # Perturbation on the parameters ## Step 4 prepare optimizer, epsilon scheduler and learning rate scheduler if args.opt == 'ADAM': opt = optim.Adam(model_loss.parameters(), lr=args.lr, weight_decay=0.01) elif args.opt == 'SGD': opt = optim.SGD(model_loss.parameters(), lr=args.lr, weight_decay=0.01) norm = float(args.norm) lr_scheduler = optim.lr_scheduler.MultiStepLR(opt, milestones=args.lr_decay_milestones, gamma=0.1) eps_scheduler = eval(args.scheduler_name)(args.eps, args.scheduler_opts) logger.info(str(model_ori)) # Skip epochs if we continue training from a checkpoint. if epoch > 0: epoch_length = int((len(train_data.dataset) + train_data.batch_size - 1) / train_data.batch_size) eps_scheduler.set_epoch_length(epoch_length) eps_scheduler.train() for i in range(epoch): lr_scheduler.step() eps_scheduler.step_epoch(verbose=True) for j in range(epoch_length): eps_scheduler.step_batch() logger.info('resume from eps={:.12f}'.format(eps_scheduler.get_eps())) if args.load: if opt_state: opt.load_state_dict(opt_state) logger.info('resume opt_state') ## Step 5: start training. if args.verify: eps_scheduler = FixedScheduler(args.eps) with torch.no_grad(): Train(model_loss, 1, test_data, eps_scheduler, norm, False, None, args.bound_type, loss_fusion=False, final_node_name=final_name2) else: timer = 0.0 best_loss = 1e10 # Main training loop for t in range(epoch + 1, args.num_epochs+1): logger.info("Epoch {}, learning rate {}".format(t, lr_scheduler.get_last_lr())) start_time = time.time() # Training one epoch Train(model_loss, t, train_data, eps_scheduler, norm, True, opt, args.bound_type, loss_fusion=True) lr_scheduler.step() epoch_time = time.time() - start_time timer += epoch_time logger.info('Epoch time: {:.4f}, Total time: {:.4f}'.format(epoch_time, timer)) logger.info("Evaluating...") torch.cuda.empty_cache() state_dict = model_loss.state_dict() # Test one epoch. with torch.no_grad(): m = Train(model, t, test_data, eps_scheduler, norm, False, None, args.bound_type, loss_fusion=False, final_node_name=final_name1) # Save checkpoints. 
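# Note on the checkpoint logic below (based on the scheduler parameters set via --scheduler_opts,
# e.g. the default "start=10,length=100"): epochs before `start` effectively train with eps = 0,
# so those checkpoints are saved with a "natural_" prefix; while eps ramps up over `length`
# epochs, only the latest checkpoint is kept; once eps has reached its target, the best model by
# test loss is additionally saved under a separate name.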
save_dict = {'state_dict': state_dict, 'epoch': t, 'optimizer': opt.state_dict()} if not os.path.exists('saved_models'): os.mkdir('saved_models') if t < int(eps_scheduler.params['start']): torch.save(save_dict, 'saved_models/natural_' + exp_name) elif t > int(eps_scheduler.params['start']) + int(eps_scheduler.params['length']): current_loss = m.avg('Loss') if current_loss < best_loss: best_loss = current_loss torch.save(save_dict, 'saved_models/' + exp_name + '_best_' + str(best_loss)[:6]) else: torch.save(save_dict, 'saved_models/' + exp_name) else: torch.save(save_dict, 'saved_models/' + exp_name) torch.cuda.empty_cache() if __name__ == "__main__": main(args) ================================================ FILE: setup.py ================================================ from setuptools import setup, find_packages from pathlib import Path # Check PyTorch version pytorch_version_l = '2.0.0' pytorch_version_u = '2.9.0' # excluded torchvision_version_l = '0.12.0' torchvision_version_u = '0.24.0' # excluded msg_install_pytorch = (f'It is recommended to manually install PyTorch ' f'(>={pytorch_version_l},<{pytorch_version_u}) suitable ' 'for your system ahead: https://pytorch.org/get-started.\n') try: import torch if torch.__version__ < pytorch_version_l: print(f'PyTorch version {torch.__version__} is too low. ' + msg_install_pytorch) if torch.__version__ >= pytorch_version_u: print(f'PyTorch version {torch.__version__} is too high. ' + msg_install_pytorch) except ModuleNotFoundError: print(f'PyTorch is not installed. {msg_install_pytorch}') with open('auto_LiRPA/__init__.py') as file: for line in file.readlines(): if '__version__' in line: version = eval(line.strip().split()[-1]) this_directory = Path(__file__).parent long_description = (this_directory / 'README.md').read_text() print(f'Installing auto_LiRPA {version}') setup( name='auto_LiRPA', version=version, description='A library for Automatic Linear Relaxation based Perturbation Analysis (LiRPA) on general computational graphs, with a focus on adversarial robustness verification and certification of deep neural networks.', long_description=long_description, long_description_content_type='text/markdown', url='https://github.com/Verified-Intelligence/auto_LiRPA', author='α,β-CROWN Team', author_email='huan@huan-zhang.com, xiangru4@illinois.edu', packages=find_packages(), install_requires=[ f'torch>={pytorch_version_l},<{pytorch_version_u}', f'torchvision>={torchvision_version_l},<{torchvision_version_u}', 'numpy>=1.20', 'packaging>=20.0', 'pytest==8.1.1', 'pylint>=2.15', 'pytest-order>=1.0.0', 'pytest-mock>=3.14', 'appdirs>=1.4', 'pyyaml>=5.0', 'ninja>=1.10', 'tqdm>=4.64', 'graphviz>=0.20.3' ], platforms=['any'], license='BSD', ) ================================================ FILE: tests/.gitignore ================================================ .cache ================================================ FILE: tests/data/.gitignore ================================================ cifar-10-python.tar.gz cifar-10-batches-py MNIST ================================================ FILE: tests/test_1d_activation.py ================================================ """Test one dimensional activation functions (e.g., ReLU, tanh, exp, sin, etc)""" import functools import pytest import torch import torch.nn as nn from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * from auto_LiRPA.utils import logger from auto_LiRPA.operators.s_shaped import TanhGradOp, SigmoidGradOp from testcase import TestCase, DEFAULT_DEVICE, 
DEFAULT_DTYPE # Wrap the computation with a nn.Module class test_model(nn.Module): def __init__(self, act_func): super().__init__() self.act_func = act_func def forward(self, x): return self.act_func(x) def pow_2(x): return torch.pow(x, 2) def pow_3(x): return torch.pow(x, 3) class GELUOp(torch.autograd.Function): @staticmethod def symbolic(g, x): return g.op('custom::Gelu', x) @staticmethod def forward(ctx, x): return torch.nn.functional.gelu(x) def GELU(x): return GELUOp.apply(x) def gen_hardtanh(min_val, max_val): return functools.partial(torch.nn.functional.hardtanh, min_val=min_val, max_val=max_val) # The original tanhgrad and sigmoidgrad also take in the gradient from the following layer # and multiply it. Here we only implement the part that computes the local gradient. def tanhgrad(x): return TanhGradOp.apply(x) def sigmoidgrad(x): return SigmoidGradOp.apply(x) class Test1DActivation(TestCase): def __init__(self, methodName='runTest', device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, device=device, dtype=dtype) def create_test(self, act_func, low, high, ntests=1000, nsamples=1000, method='IBP', activation_bound_option='adaptive', input_lb=None, input_ub=None): print(f'Testing activation {act_func} (method {method}, activation_bound_option {activation_bound_option})') model = test_model(act_func) image = torch.zeros(1, ntests) bounded_model = BoundedModule( model, image, bound_opts={ 'optimize_bound_args': {'iteration': 2}, 'activation_bound_option': activation_bound_option }, device=self.default_device) if input_lb is None or input_ub is None: # Generate randomly bounded inputs. p = torch.rand(1, ntests) * (high - low) + low q = torch.rand(1, ntests) * (high - low) + low input_lb = torch.min(p, q) input_ub = torch.max(p, q) else: low, high = torch.min(input_lb), torch.max(input_ub) input_center = (input_lb + input_ub) / 2.0 ptb = PerturbationLpNorm(norm=float("inf"), eps=None, x_L=input_lb, x_U=input_ub) ptb_data = BoundedTensor(input_center, ptb) # Generate reference results. table = act_func(torch.linspace(start=low, end=high, steps=nsamples+1)) def lookup(l, u): assert torch.all(u <= high) assert torch.all(l >= low) shape = l.size() l = l.squeeze() u = u.squeeze() # select all sample points between l and u. low_index = torch.ceil((l - low) / (high - low) * nsamples).int() # Make sure we do not have index 0. high_index = torch.floor((u - low) / (high - low) * nsamples).int() real_lb = torch.empty_like(l) real_ub = torch.empty_like(u) for i, (li, hi) in enumerate(zip(low_index, high_index)): if li == hi + 1: # Not enough precision. l and u are too close so we cannot tell. real_lb[i] = float("inf") real_ub[i] = float("-inf") else: selected = table[li : hi+1] real_lb[i] = torch.min(selected) real_ub[i] = torch.max(selected) real_lb = real_lb.view(*shape) real_ub = real_ub.view(*shape) return real_lb, real_ub # These are reference results. IBP results should be very close to these. # Linear bound results can be looser than these. ref_forward = model(input_center) ref_output_lb, ref_output_ub = lookup(input_lb, input_ub) # Get bounding results. forward = bounded_model(ptb_data) output_lb, output_ub = bounded_model.compute_bounds( x=(ptb_data,), method=method) bounded_model.set_bound_opts({ 'optimize_bound_args': {'iteration': 2, 'init_alpha': True}, }) # Compare. 
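# A worked instance of the lookup-table indexing above (assumed values): with low = -10,
# high = 10, nsamples = 1000, an interval [l, u] = [-0.25, 0.31] gives
# low_index = ceil((l + 10) / 20 * 1000) = 488 and high_index = floor((u + 10) / 20 * 1000) = 515,
# so table[488:516] samples the activation on a grid lying strictly inside [l, u]; its min/max are
# therefore valid (possibly slightly loose) reference bounds. The comparison below then checks
# soundness: the computed bounds must enclose this reference range, i.e.
# output_lb <= ref_output_lb and output_ub >= ref_output_ub up to a 1e-5 tolerance.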
assert torch.allclose(forward, ref_forward) for i in range(ntests): show = False if output_ub[0,i] < ref_output_ub[0,i] - 1e-5: logger.warning(f'upper bound is wrong {ref_output_ub[0,i] - output_ub[0,i]}') show = True if output_lb[0,i] > ref_output_lb[0,i] + 1e-5: logger.warning(f'lower bound is wrong {output_lb[0,i] - ref_output_lb[0,i]}') show = True if show: logger.warning(f'input_lb={input_lb[0,i]:8.3f}, input_ub={input_ub[0,i]:8.3f}, lb={output_lb[0,i]:8.3f}, ref_lb={ref_output_lb[0,i]:8.3f}, ub={output_ub[0,i]:8.3f}, ref_ub={ref_output_ub[0,i]:8.3f}') assert torch.all(output_ub + 1e-5 >= ref_output_ub) assert torch.all(output_lb - 1e-5 <= ref_output_lb) @pytest.mark.skip(reason="Known issue: https://github.com/Verified-Intelligence/Verifier_Development/issues/164") def test_tan(self): # Test tan(x) in different periods. for i in range(-5, 5): self.create_test( act_func=torch.tan, low=-0.5*torch.pi + i*torch.pi + 1e-20, high=0.5*torch.pi + i*torch.pi - 1e-20, method='IBP') self.create_test( act_func=torch.tan, low=-0.5*torch.pi + i*torch.pi + 1e-20, high=0.5*torch.pi + i*torch.pi - 1e-20, method='CROWN') def test_acts(self): for act_func in [torch.nn.functional.relu, torch.sin, torch.cos, torch.tanh, torch.sigmoid, torch.arctan, torch.exp, pow_2, pow_3, torch.sign, GELU, gen_hardtanh(-1,1),gen_hardtanh(-0.25,0.25),gen_hardtanh(1,10),gen_hardtanh(-5,2), tanhgrad, sigmoidgrad]: low, high = -10, 10 if act_func == torch.reciprocal: # So far only positive values are supported. low = 0.01 self.create_test(act_func=act_func, low=low, high=high, method='IBP') self.create_test(act_func=act_func, low=low, high=high, method='CROWN') if act_func not in [torch.exp, torch.sign, torch.sin, torch.cos, tanhgrad, sigmoidgrad]: # Use optimized bounds self.create_test(act_func=act_func, low=low, high=high, method='CROWN-Optimized') if act_func in [torch.sin, torch.cos]: test_samples = 10 for _ in range(test_samples): self.create_test(act_func=act_func, low=low, high=high, method='CROWN-Optimized') if act_func in [torch.nn.functional.relu]: self.create_test(act_func=act_func, low=low, high=high, method='Dynamic-Forward') if act_func in [torch.nn.functional.relu, torch.tanh]: self.create_test(act_func=act_func, low=low, high=high, method='CROWN', activation_bound_option='same-slope') print('Testing activations with large input range') for act_func in [torch.sin, torch.tanh, pow_3, GELU]: low, high = -600, 600 self.create_test(act_func=act_func, low=low, high=high, method='CROWN') if __name__ == '__main__': testcase = Test1DActivation() testcase.test_acts() ================================================ FILE: tests/test_2d_activation.py ================================================ """Test two dimensional activation functions (e.g., min, max, etc)""" import tqdm import torch import torch.nn as nn from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * from auto_LiRPA.utils import logger from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE # Wrap the computation with a nn.Module class test_model(nn.Module): def __init__(self, act_func): super().__init__() self.act_func = act_func def forward(self, x, y): return self.act_func(x, y) def mul(x, y): return x * y class Test2DActivation(TestCase): def __init__(self, methodName='runTest', device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, device=device, dtype=dtype) def create_test(self, act_func, low_x, high_x, low_y, high_y, ntests=10000, nsamples=1000, method='IBP'): print(f'Testing 
activation {act_func}') model = test_model(act_func) image = torch.zeros(2, ntests) bounded_model = BoundedModule(model, (image[0], image[1]), device=self.default_device) # Generate randomly bounded inputs. p_x = torch.rand(1, ntests) * (high_x - low_x) + low_x q_x = torch.rand(1, ntests) * (high_x - low_x) + low_x input_lb_x = torch.min(p_x, q_x) input_ub_x = torch.max(p_x, q_x) input_center_x = (input_lb_x + input_ub_x) / 2.0 ptb_x = PerturbationLpNorm(x_L=input_lb_x, x_U=input_ub_x) ptb_data_x = BoundedTensor(input_center_x, ptb_x) p_y = torch.rand(1, ntests) * (high_y - low_y) + low_y q_y = torch.rand(1, ntests) * (high_y - low_y) + low_y input_lb_y = torch.min(p_y, q_y) input_ub_y = torch.max(p_y, q_y) input_center_y = (input_lb_y + input_ub_y) / 2.0 ptb_y = PerturbationLpNorm(x_L=input_lb_y, x_U=input_ub_y) ptb_data_y = BoundedTensor(input_center_y, ptb_y) # Generate reference results. range_xy = torch.linspace(start=low_x, end=high_x, steps=nsamples+1) table = torch.empty([range_xy.shape[0], range_xy.shape[0]]) for i in range(range_xy.shape[0]): x = range_xy[i] table_y = act_func(x, torch.linspace(start=low_y, end=high_y, steps=nsamples+1)) table[i] = table_y def lookup(l_x, u_x, l_y, u_y): assert torch.all(u_x <= high_x) assert torch.all(l_x >= low_x) assert torch.all(u_y <= high_y) assert torch.all(l_y >= low_y) shape = l_x.size() l_x = l_x.squeeze() u_x = u_x.squeeze() l_y = l_y.squeeze() u_y = u_y.squeeze() # select all sample points between l and u. low_index_x = torch.ceil((l_x - low_x) / (high_x - low_x) * nsamples).int() # Make sure we do not have index 0. high_index_x = torch.floor((u_x - low_x) / (high_x - low_x) * nsamples).int() low_index_y = torch.ceil((l_y - low_y) / (high_y - low_y) * nsamples).int() # Make sure we do not have index 0. high_index_y = torch.floor((u_y - low_y) / (high_y - low_y) * nsamples).int() real_lb = torch.empty_like(l_x) real_ub = torch.empty_like(u_x) for i, (li_x, hi_x) in enumerate(zip(low_index_x, high_index_x)): li_y = low_index_y[i] hi_y = high_index_y[i] if li_x == hi_x + 1 or li_y == hi_y + 1: # Not enough precision. l and u are too close so we cannot tell. real_lb[i] = float("inf") real_ub[i] = float("-inf") else: selected = table[li_x : hi_x+1, li_y : hi_y+1].reshape(-1) real_lb[i] = torch.min(selected) real_ub[i] = torch.max(selected) real_lb = real_lb.view(*shape) real_ub = real_ub.view(*shape) return real_lb, real_ub # These are reference results. IBP results should be very close to these. Linear bound results can be looser than these. ref_forward = model(input_center_x, input_center_y) ref_output_lb, ref_output_ub = lookup(input_lb_x, input_ub_x, input_lb_y, input_ub_y) # Get bounding results. forward = bounded_model(ptb_data_x, ptb_data_y) output_lb, output_ub = bounded_model.compute_bounds(x=(ptb_data_x, ptb_data_y), method = method) # Compare. 
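# Aside on what a tight reference range looks like for the bivariate ops tested here: for mul with
# x in [-2, 3] and y in [-1, 4], the extrema of x*y over the box are attained at corners, so the
# four corner products {2, -8, -3, 12} bracket the exact range [-8, 12]. The sampled 2D table
# above approximates this kind of range, and the asserts below check that the computed bounds
# enclose it.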
assert torch.allclose(forward, ref_forward) for i in tqdm.tqdm(range(ntests)): show = False if output_ub[0,i] < ref_output_ub[0,i] - 1e-5: logger.warning(f'upper bound is wrong {ref_output_ub[0,i] - output_ub[0,i]}') show = True if output_lb[0,i] > ref_output_lb[0,i] + 1e-5: logger.warning(f'lower bound is wrong {output_lb[0,i] - ref_output_lb[0,i]}') show = True if show: logger.warning(f'input_lb_x={input_lb_x[0,i]:8.3f}, input_ub_x={input_ub_x[0,i]:8.3f},input_lb_y={input_lb_y[0,i]:8.3f}, input_ub_y={input_ub_y[0,i]:8.3f}, lb={output_lb[0,i]:8.3f}, ref_lb={ref_output_lb[0,i]:8.3f}, ub={output_ub[0,i]:8.3f}, ref_ub={ref_output_ub[0,i]:8.3f}') assert torch.all(output_ub + 1e-5 >= ref_output_ub) assert torch.all(output_lb - 1e-5 <= ref_output_lb) def test_max(self): self.create_test(act_func=torch.max, low_x=-10, high_x=5, low_y=-1, high_y=10, method='IBP') self.create_test(act_func=torch.max, low_x=-10, high_x=5, low_y=-1, high_y=10, method='CROWN') def test_min(self): self.create_test(act_func=torch.min, low_x=-10, high_x=5, low_y=-1, high_y=10, method='IBP') self.create_test(act_func=torch.min, low_x=-10, high_x=5, low_y=-1, high_y=10, method='CROWN') def test_mul(self): self.create_test(act_func=mul, low_x=-10, high_x=5, low_y=-1, high_y=10, method='IBP') self.create_test(act_func=mul, low_x=-10, high_x=5, low_y=-1, high_y=10, method='CROWN') if __name__ == '__main__': testcase = Test2DActivation() testcase.test_max() testcase.test_min() testcase.test_mul() ================================================ FILE: tests/test_avgpool.py ================================================ import torch import torch.nn as nn import numpy as np from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE def ff(num_conv=2, num_mlp_only=None, pooling=False, activation="ReLU", hidden_size=256, input_ch=1, input_dim=28, num_classes=10, pool_kernel=3, pool_stride=1, pool_padding=1): activation = eval(f"nn.{activation}()") layers = [] if num_conv: layers.append(nn.Conv2d(input_ch, 4, 3, stride=1, padding=1)) layers.append(activation) num_channels = 4 if pooling: layers.append(nn.AvgPool2d(kernel_size=pool_kernel, stride=pool_stride, padding=pool_padding)) if num_conv >= 2: layers.append(nn.Conv2d(4, 8, 3, stride=1, padding=1)) layers.append(nn.ReLU()) if pooling: layers.append(nn.AvgPool2d(kernel_size=pool_kernel, stride=pool_stride, padding=pool_padding)) num_channels = 8 for _ in range(num_conv - 2): layers.append(nn.Conv2d(8, 8, 3, stride=1, padding=1)) layers.append(nn.ReLU()) if pooling: layers.append(nn.AvgPool2d(kernel_size=pool_kernel, stride=pool_stride, padding=pool_padding)) layers.append(nn.Flatten(1)) # Calculate output size after pooling operations if pooling and num_conv > 0: pooled_dim = input_dim for _ in range(num_conv): pooled_dim = (pooled_dim + 2 * pool_padding - pool_kernel) // pool_stride + 1 linear_input_size = num_channels * (pooled_dim ** 2) else: linear_input_size = num_channels * (input_dim ** 2) layers.append(nn.Linear(linear_input_size, hidden_size)) layers.append(nn.ReLU()) layers.append(nn.Linear(hidden_size, num_classes)) else: layers.append(nn.Flatten(1)) cur = input_ch * (input_dim ** 2) for _ in range(num_mlp_only - 1): layers.append(nn.Linear(cur, hidden_size)) layers.append(activation) cur = hidden_size layers.append(nn.Linear(hidden_size, num_classes)) return nn.Sequential(*layers) def synthetic_net(input_ch, input_dim, **kwargs): return ff(input_ch=input_ch, 
input_dim=input_dim, num_classes=2, **kwargs) def synthetic_4c2f_pool(input_ch, input_dim, **kwargs): return synthetic_net(input_ch, input_dim, num_conv=4, pooling=True, **kwargs) class TestAvgPool(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed=1234, ref_name='avgpool_test_data', generate=generate, device=device, dtype=dtype) def test(self): test_configs = [ {'input_ch': 1, 'input_dim': 5, 'hidden_size': 8, 'pool_kernel': 3, 'pool_stride': 1, 'pool_padding': 1}, {'input_ch': 1, 'input_dim': 32, 'hidden_size': 16, 'pool_kernel': 2, 'pool_stride': 2, 'pool_padding': 0} ] self.result = [] for config in test_configs: print(f"Testing config: {config}") model_ori = synthetic_4c2f_pool(**config) model_ori = model_ori.eval().to(self.default_device).to(self.default_dtype) x = torch.randn(8, config['input_ch'], config['input_dim'], config['input_dim']) ptb = PerturbationLpNorm(norm=np.inf, eps=100) x_bounded = BoundedTensor(x, ptb) print(f" Testing with default conv_mode (patches)") model = BoundedModule(model_ori, x, device=self.default_device) lb_patches, ub_patches = model.compute_bounds(x=(x_bounded,), method='backward') print(f" Patches mode - LB: {lb_patches}") print(f" Patches mode - UB: {ub_patches}") self.result += [lb_patches, ub_patches] print(f" Testing with conv_mode='matrix'") model_matrix = BoundedModule(model_ori, x, bound_opts={'conv_mode': 'matrix'}) lb_matrix, ub_matrix = model_matrix.compute_bounds(x=(x_bounded,), method='backward') print(f" Matrix mode - LB: {lb_matrix}") print(f" Matrix mode - UB: {ub_matrix}") self.result += [lb_matrix, ub_matrix] lb_diff = torch.abs(lb_patches - lb_matrix).max().item() ub_diff = torch.abs(ub_patches - ub_matrix).max().item() print(f" Max difference in LB between patches and matrix: {lb_diff}") print(f" Max difference in UB between patches and matrix: {ub_diff}") assert torch.allclose(lb_patches, lb_matrix, atol=1e-6), f"Lower bounds not equivalent between patches and matrix modes" assert torch.allclose(ub_patches, ub_matrix, atol=1e-6), f"Upper bounds not equivalent between patches and matrix modes" print(f" Matrix and patches modes produce equivalent results") print() self.check() if __name__ == '__main__': testcase = TestAvgPool(generate=False) testcase.test() ================================================ FILE: tests/test_bound_ops.py ================================================ """Test classes for bound operators""" import torch from auto_LiRPA.bound_ops import * from auto_LiRPA.linear_bound import LinearBound from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE class Dummy: """Dummy node for testing""" def __init__(self, lower, upper=None, perturbed=False): self.lower = lower self.upper = upper if upper is not None else lower self.perturbed = perturbed self.output_shape = lower.shape class TestBoundOp(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed=1, ref_name='bound_ops_data', generate=generate, device=device, dtype=dtype) def test(self): device = self.default_device dtype = self.default_dtype batch_size = 5 dim_final = 7 dim_output = 9 dim_input = 11 # multiplication of [batch_size, dim_input] and [dim_output, dim_input]^T weight = torch.randn(dim_output, dim_input, device=device) bias = torch.randn(dim_output, device=device) data_in = torch.randn(batch_size, dim_input, device=device) data_in_delta = torch.randn(batch_size, 
dim_input, device=device) dummy_in = Dummy( data_in - torch.abs(data_in_delta), data_in + torch.abs(data_in_delta), True) dummy_weight = Dummy(weight) dummy_bias = Dummy(bias) op = BoundLinear( attr={'transB': 1}, inputs=[dummy_in, dummy_weight, dummy_bias], output_index=0, options={}) op.batch_dim = 0 # test `forward` data_out = op(data_in, weight, bias) self.assertEqual(data_out, data_in.matmul(weight.t()) + bias) # test `bound_backward` # The `transpose` here to make the randomization consistent with the previous reference. # It can be removed once a new reference is generated. last_lA = torch.randn(batch_size, dim_final, dim_output, device=device).transpose(0, 1) last_uA = torch.randn(batch_size, dim_final, dim_output, device=device).transpose(0, 1) A, lbias, ubias = op.bound_backward(last_lA, last_uA, *op.inputs) self.assertEqual(A[0][0], last_lA.matmul(weight)) self.assertEqual(A[0][1], last_uA.matmul(weight)) self.assertEqual(lbias, last_lA.matmul(bias)) self.assertEqual(ubias, last_uA.matmul(bias)) # test `bound_forward` # note that the upper bound may be actually smaller than the lower bound # in these dummy linear bounds bound_in = LinearBound( lw=torch.randn(batch_size, dim_final, dim_input, device=device), lb=torch.randn(batch_size, dim_input, device=device), uw=torch.randn(batch_size, dim_final, dim_input, device=device), ub=torch.randn(batch_size, dim_input, device=device), lower=None, upper=None) bound_weight = LinearBound(None, None, None, None, dummy_weight.lower, dummy_weight.upper) bound_bias = LinearBound(None, None, None, None, dummy_bias.lower, dummy_bias.upper) bound_out = op.bound_forward(dim_final, bound_in, bound_weight, bound_bias) self.assertEqual( bound_out.lw, bound_in.lw.matmul(weight.t().clamp(min=0)) + bound_in.uw.matmul(weight.t().clamp(max=0))) self.assertEqual( bound_out.uw, bound_in.uw.matmul(weight.t().clamp(min=0)) + bound_in.lw.matmul(weight.t().clamp(max=0))) self.assertEqual( bound_out.lb, bound_in.lb.matmul(weight.t().clamp(min=0)) + bound_in.ub.matmul(weight.t().clamp(max=0)) + bias) self.assertEqual( bound_out.ub, bound_in.ub.matmul(weight.t().clamp(min=0)) + bound_in.lb.matmul(weight.t().clamp(max=0)) + bias) # test `interval_propagate` bound_in = ( torch.randn(*data_in.shape, device=device), torch.randn(*data_in.shape, device=device)) bound_weight = (bound_weight.lower, bound_weight.upper) bound_bias = (bound_bias.lower, bound_bias.upper) bound_out = op.interval_propagate(bound_in, bound_weight, bound_bias) self.assertEqual(bound_out[0], bound_in[0].matmul(weight.t().clamp(min=0)) + bound_in[1].matmul(weight.t().clamp(max=0)) + bias) self.assertEqual(bound_out[1], bound_in[1].matmul(weight.t().clamp(min=0)) + bound_in[0].matmul(weight.t().clamp(max=0)) + bias) # test weight perturbation # `bound_backward` ptb_weight = torch.randn(weight.shape) op.inputs[1].upper += ptb_weight op.inputs[1].perturbed = True op.inputs[2].perturbation = None # no perturbation on bias A, lbias, ubias = op.bound_backward(last_lA, last_uA, *op.inputs) # `interval_propagate` bound_weight = (op.inputs[1].lower, op.inputs[1].upper) bound_out = op.interval_propagate(bound_in, bound_weight, bound_bias) self.result = (A, lbias, ubias, bound_out) if self.generate: self.save() self.reference = self.result A_ref, lbias_ref, ubias_ref, bound_out_ref = self.reference for i in range(3): for j in range(2): if A_ref[i][j] is not None: ref = A_ref[i][j].to(device=device, dtype=dtype) self.assertEqual(A[i][j], ref) lbias_ref = lbias_ref.to(device=device, dtype=dtype) ubias_ref = 
ubias_ref.to(device=device, dtype=dtype) bound_out_ref = ( bound_out_ref[0].to(device=device, dtype=dtype), bound_out_ref[1].to(device=device, dtype=dtype) ) self.assertEqual(lbias, lbias_ref) self.assertEqual(ubias, ubias_ref) self.assertEqual(bound_out[0], bound_out_ref[0]) self.assertEqual(bound_out[1], bound_out_ref[1]) if __name__ == '__main__': # Change to generate=True when genearting reference results testcase = TestBoundOp(generate=False) testcase.setUp() testcase.test() ================================================ FILE: tests/test_branching_heuristics.py ================================================ import sys import torch from types import SimpleNamespace sys.path.insert(0, '../complete_verifier') from heuristics.base import RandomNeuronBranching from testcase import DEFAULT_DEVICE, DEFAULT_DTYPE, set_default_dtype_device def test_branching_heuristics(): device = DEFAULT_DEVICE dtype = DEFAULT_DTYPE set_default_dtype_device(dtype, device) import random import numpy as np seed = 123 torch.manual_seed(seed) random.seed(seed) np.random.seed(seed) net = SimpleNamespace() branching_heuristic = RandomNeuronBranching(net) for _ in range(10000): batch_size = random.randint(1, 5) # Number of layers, and we will split the total_layers into this # many of layers. n_layers = random.randint(1, 5) total_len = random.randint(n_layers, 100) net.split_nodes = [] net.split_activations = {} for i in range(n_layers): layer = SimpleNamespace() layer.name = i activation = SimpleNamespace() activation.name = f'{i}_activation' net.split_nodes.append(layer) net.split_activations[layer.name] = [(activation, 0)] # Total number of neurons in all layers. topk = random.randint(1, total_len) # Generate random and unique scores. # scores = torch.argsort(torch.rand(batch_size, total_len)) + 1 scores = torch.rand(batch_size, total_len) + 1e-8 # Generate random mask. Mask = 1 means this neuron can be split. masks = (torch.rand(batch_size, total_len) > 0.75).float() # Generate random split locations. split_position = torch.randint( low=0, high=total_len, size=(n_layers - 1,)).sort().values print(f'testing batch={batch_size}, n_layers={n_layers}, ' f'total_len={total_len}, topk={topk}, split={split_position}') segment_lengths = (torch.cat( [split_position, torch.full(size=(1,), fill_value=total_len, device=split_position.device)]) - torch.cat([torch.zeros((1,), device=split_position.device), split_position])) segment_lengths = segment_lengths.int().tolist() # Cap to the minimum number of valid neurons in each batch. min_k = int(masks.sum(dim=1).min().item()) # Find the topk scores and indices across all layers. topk_scores, topk_indices = (scores * masks).topk(k=min(min_k, topk)) # Map the indices to groundtruth layer number. topk_layers = torch.searchsorted( split_position, topk_indices, right=True) # Map the indices to groundtruth neuron number. topk_neurons = topk_indices - torch.cat( [torch.zeros(1, device=split_position.device, dtype=torch.int64), split_position] ).view(1, -1).repeat(batch_size, 1).gather( dim=1, index=topk_layers) # Split into a list of scores for testing. 
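# A small worked mapping for the ground-truth construction above (assumed values): with
# split_position = [3, 7] and total_len = 10, the three layers own the global index ranges
# [0, 3), [3, 7) and [7, 10). A top-k global index of 5 is mapped by searchsorted(right=True) to
# layer 1, and subtracting that layer's offset (3) gives neuron 2 within the layer; this is what
# find_topk_scores is expected to reproduce from the per-layer score dictionaries built below.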
all_layer_scores = scores.split(segment_lengths, dim=1) all_layer_masks = masks.split(segment_lengths, dim=1) all_layer_scores = {i: item for i, item in enumerate(all_layer_scores)} all_layer_masks = {i: item for i, item in enumerate(all_layer_masks)} branching_heuristic.update_batch_size_and_device(all_layer_scores) (calculated_layers, calculated_neurons, calculated_scores) = branching_heuristic.find_topk_scores( all_layer_scores, all_layer_masks, k=topk, return_scores=True) torch.testing.assert_close(calculated_layers, topk_layers) torch.testing.assert_close(calculated_neurons, topk_neurons) torch.testing.assert_close(calculated_scores, topk_scores) if __name__ == "__main__": test_branching_heuristics() ================================================ FILE: tests/test_clip_domains.py ================================================ """ Tests clip_domains To run tests: py.test test_clip_domains.py or: python -m pytest test_clip_domains.py Verbose (-v): py.test -v test_clip_domains.py or: python -m pytest -v test_clip_domains.py """ import torch from torch import Tensor from random import randint from typing import Union, Tuple import sys sys.path.append('../complete_verifier') # importing clip_domains from CROWN from input_split.clip import clip_domains from testcase import DEFAULT_DEVICE, DEFAULT_DTYPE, set_default_dtype_device batches = 2 # Do not use large batch sizes when running on CI device = DEFAULT_DEVICE # CI is not equipped with CUDA dtype = DEFAULT_DTYPE set_default_dtype_device(dtype, device) atol = 1e-4 # my references are defined at this level of tolerance def setup_module(module): """ Displays global information about the test run @param module: @return: """ print() print("setup_module module:%s" % module.__name__) print(f"Using device: {device}") print(f"Using dtype: {dtype}") print(f"Using atol: {atol}") print(f"Using number of batches (batch copies): {batches}") print() def setup_function(function): """ Adds spacing between tests @param function: @return: """ print(f"\nRunning test case: {function.__name__}") def _tensor(x): return torch.tensor(x, device=device, dtype=dtype) def test_case_one_one(): print() # Define the base 2D tensors A_bar_base = _tensor([[4 / 5, -7 / 20], [3 / 10, -3 / 7]]) x_L_base = _tensor([-3, -2]) x_U_base = _tensor([3, 2]) c_bar_base = _tensor([[1 / 10], [3 / 10]]) target_base = _tensor([[0], [0]]) # Expand the base tensors along the batch dimension lA, x_L, x_U, c_bar, thresholds, dm_lb = setup_test_matrices(A_bar_base, x_L_base, x_U_base, c_bar_base, target_base, batches) # In this suite, we have a reference for x_L/U ref_x_L = _tensor([-3., -1.4]).unsqueeze(0).expand(batches, -1) ref_x_U = _tensor([0.75, 2.0000]).unsqueeze(0).expand(batches, -1) old_x_L = x_L.clone() old_x_U = x_U.clone() ret = clip_domains(x_L, x_U, thresholds, lA, None, dm_lb) new_x_L, new_x_U = ret assert (new_x_L.shape == old_x_L.shape) and (new_x_U.shape == old_x_U.shape), "x_L(U) should have the same shape as before" # check the returned x_L/U matches the expected x_L/U values x_L_eq = torch.allclose(new_x_L, ref_x_L, atol=atol) x_U_eq = torch.allclose(new_x_U, ref_x_U, atol=atol) assert x_L_eq, "x_L is not correct" assert x_U_eq, "x_U is not correct" def test_case_one_two(): print() # Define the base 2D tensors A_bar_base = _tensor([[3 / 10, -3 / 7]]) x_L_base = _tensor([-3, -2]) x_U_base = _tensor([3, 2]) c_bar_base = _tensor([[3 / 10]]) target_base = _tensor([[0]]) # Expand the base tensors along the batch dimension lA, x_L, x_U, c_bar, thresholds, dm_lb = 
setup_test_matrices(A_bar_base, x_L_base, x_U_base, c_bar_base, target_base, batches) # In this suite, we have a reference for x_L/U ref_x_L = _tensor([-3., -1.4]).unsqueeze(0).expand(batches, -1) ref_x_U = _tensor([1.8571, 2.0000]).unsqueeze(0).expand(batches, -1) old_x_L = x_L.clone() old_x_U = x_U.clone() ret = clip_domains(x_L, x_U, thresholds, lA, None, dm_lb) new_x_L, new_x_U = ret assert (new_x_L.shape == old_x_L.shape) and (new_x_U.shape == old_x_U.shape), "x_L(U) should have the same shape as before" # check the returned x_L/U matches the expected x_L/U values x_L_eq = torch.allclose(new_x_L, ref_x_L, atol=atol) x_U_eq = torch.allclose(new_x_U, ref_x_U, atol=atol) assert x_L_eq, "x_L is not correct" assert x_U_eq, "x_U is not correct" def test_case_one_three(): print() # Define the base 2D tensors A_bar_base = _tensor([[3 / 10, -3 / 7], [3 / 10, -3 / 7]]) x_L_base = _tensor([-3, -2]) x_U_base = _tensor([3, 2]) c_bar_base = _tensor([[3 / 10], [3 / 10]]) target_base = _tensor([[0], [0]]) # Expand the base tensors along the batch dimension lA, x_L, x_U, c_bar, thresholds, dm_lb = setup_test_matrices(A_bar_base, x_L_base, x_U_base, c_bar_base, target_base, batches) # In this suite, we have a reference for x_L/U ref_x_L = _tensor([-3., -1.4]).unsqueeze(0).expand(batches, -1) ref_x_U = _tensor([1.8571, 2.0000]).unsqueeze(0).expand(batches, -1) old_x_L = x_L.clone() old_x_U = x_U.clone() ret = clip_domains(x_L, x_U, thresholds, lA, None, dm_lb) new_x_L, new_x_U = ret assert (new_x_L.shape == old_x_L.shape) and (new_x_U.shape == old_x_U.shape), "x_L(U) should have the same shape as before" # check the returned x_L/U matches the expected x_L/U values x_L_eq = torch.allclose(new_x_L, ref_x_L, atol=atol) x_U_eq = torch.allclose(new_x_U, ref_x_U, atol=atol) assert x_L_eq, "x_L is not correct" assert x_U_eq, "x_U is not correct" def test_case_one_four(): print() # Define the base 2D tensors A_bar_base = _tensor([[4 / 5, -7 / 20, 0.1], [3 / 10, -3 / 7, 0.1]]) x_L_base = _tensor([-3, -2, -1]) x_U_base = _tensor([3, 2, 1]) c_bar_base = _tensor([[1 / 10], [3 / 10]]) target_base = _tensor([[0], [0]]) # Expand the base tensors along the batch dimension lA, x_L, x_U, c_bar, thresholds, dm_lb = setup_test_matrices(A_bar_base, x_L_base, x_U_base, c_bar_base, target_base, batches) old_x_L = x_L.clone() old_x_U = x_U.clone() ret = clip_domains(x_L, x_U, thresholds, lA, None, dm_lb) new_x_L, new_x_U = ret assert (new_x_L.shape == old_x_L.shape) and (new_x_U.shape == old_x_U.shape), "x_L(U) should have the same shape as before" def test_case_two_one(): """ Visualize this test case at https://www.desmos.com/3d/fz6e11ovm3 @return: """ print() # Define the base 2D tensors A_bar_base = _tensor([[5/5, 1/5], [2/5, 1/5], [10/35, 1/5]]) x_L_base = _tensor([0, 0]) x_U_base = _tensor([1, 1]) c_bar_base = _tensor([[-1/5], [-1/5], [-1/5]]) target_base = _tensor([[0], [0], [0]]) # Expand the base tensors along the batch dimension lA, x_L, x_U, c_bar, thresholds, dm_lb = setup_test_matrices(A_bar_base, x_L_base, x_U_base, c_bar_base, target_base, batches) # In this suite, we have a reference for x_L/U ref_x_L = _tensor([0., 0.]).unsqueeze(0).expand(batches, -1) ref_x_U = _tensor([0.2000, 1.0000]).unsqueeze(0).expand(batches, -1) old_x_L = x_L.clone() old_x_U = x_U.clone() ret = clip_domains(x_L, x_U, thresholds, lA, None, dm_lb) new_x_L, new_x_U = ret assert (new_x_L.shape == old_x_L.shape) and (new_x_U.shape == old_x_U.shape), "x_L(U) should have the same shape as before" # check the returned x_L/U matches the expected 
x_L/U values x_L_eq = torch.allclose(new_x_L, ref_x_L, atol=atol) x_U_eq = torch.allclose(new_x_U, ref_x_U, atol=atol) assert x_L_eq, "x_L is not correct" assert x_U_eq, "x_U is not correct" def test_case_two_two(): """ Visualize this test case at https://www.desmos.com/3d/ruty3i54wu @return: """ print() # Define the base 2D tensors A_bar_base = -1. * _tensor([[5 / 5, 1 / 5], [2 / 5, 1 / 5], [10 / 35, 1 / 5]]) x_L_base = _tensor([0, 0]) x_U_base = _tensor([1, 1]) c_bar_base = -1. * _tensor([[-1 / 5], [-1 / 5], [-1 / 5]]) target_base = _tensor([[0], [0], [0]]) # Expand the base tensors along the batch dimension lA, x_L, x_U, c_bar, thresholds, dm_lb = setup_test_matrices(A_bar_base, x_L_base, x_U_base, c_bar_base, target_base, batches) # In this suite, we have a reference for x_L/U ref_x_L = x_L.clone() ref_x_U = x_U.clone() old_x_L = x_L.clone() old_x_U = x_U.clone() ret = clip_domains(x_L, x_U, thresholds, lA, None, dm_lb) new_x_L, new_x_U = ret assert (new_x_L.shape == old_x_L.shape) and (new_x_U.shape == old_x_U.shape), "x_L(U) should have the same shape as before" # check the returned x_L/U matches the expected x_L/U values x_L_eq = torch.allclose(new_x_L, ref_x_L, atol=atol) x_U_eq = torch.allclose(new_x_U, ref_x_U, atol=atol) assert x_L_eq, "x_L is not correct" assert x_U_eq, "x_U is not correct" def test_case_two_three(): """ Visualize this test case at https://www.desmos.com/3d/vogsjthmav @return: """ print() # Define the base 2D tensors A_bar_base = _tensor([[-5 / 5, -1 / 5], [2 / 5, 1 / 5], [10 / 35, 1 / 5]]) x_L_base = _tensor([0, 0]) x_U_base = _tensor([1, 1]) c_bar_base = _tensor([[1 / 5], [-1 / 5], [-1 / 5]]) target_base = _tensor([[0], [0], [0]]) # Expand the base tensors along the batch dimension lA, x_L, x_U, c_bar, thresholds, dm_lb = setup_test_matrices(A_bar_base, x_L_base, x_U_base, c_bar_base, target_base, batches) # In this suite, we have a reference for x_L/U ref_x_L = x_L.clone() ref_x_U = torch.zeros_like(x_U) ref_x_U[:] = _tensor([0.5, 1.0]) old_x_L = x_L.clone() old_x_U = x_U.clone() ret = clip_domains(x_L, x_U, thresholds, lA, None, dm_lb) new_x_L, new_x_U = ret assert (new_x_L.shape == old_x_L.shape) and (new_x_U.shape == old_x_U.shape), "x_L(U) should have the same shape as before" # check the returned x_L/U matches the expected x_L/U values x_L_eq = torch.allclose(new_x_L, ref_x_L, atol=atol) x_U_eq = torch.allclose(new_x_U, ref_x_U, atol=atol) assert x_L_eq, "x_L is not correct" assert x_U_eq, "x_U is not correct" # Rest of file are helper functions def concretize_bounds( x_hat: torch.Tensor, x_eps: torch.Tensor, lA: torch.Tensor, lbias: Union[torch.Tensor, int], C: Union[torch.Tensor, None] = None, lower: bool = True): """ Takes batches and concretizes them @param x_hat: shape (batch, input_dim) The origin position of the input domain @param x_eps: shape (batch, input_dim) The epsilon disturbance from the origin of the input domain @param lA: shape (batch, spec_dim/lA rows, input_dim) The lA matrix calculated by CROWN; When C is None, we refer to the second dimension as spec_dim. 
When C is given, this is denoted as lA rows @param lbias: shape (batch, spec_dim) The bias vector calculated by CROWN @param lower: Whether the lower or upper bound should be concretized @param C: shape (batch, spec_dim, lA rows) When not None, is transposed and distributed to lA and lbias to produce the specification of interest @return: The lower/upper bound of the batches """ lA = lA.view(lA.shape[0], lA.shape[1], -1) batches, spec_dim, input_dim = lA.shape if isinstance(lbias, int): lbias = _tensor([lbias]).expand(batches, spec_dim) lbias = lbias.unsqueeze(-1) # change lbiases to be column vectors if C is not None: # Let C act like the new last linear layer of the network and distribute it to lA and lbias # Update shapes C = C.reshape(batches, spec_dim, -1) C = C.transpose(1, 2) lA = C.bmm(lA) lbias = C.bmm(lbias) batches, spec_dim, input_dim = lA.shape # lA shape: (batch, spec_dim, # inputs) # dom_lb shape: (batch, spec_dim) # thresholds shape: (batch, spec_dim) # lbias shape: (batch, spec_dim, 1) sign = -1 if lower else 1 x_hat = x_hat.unsqueeze(-1) x_eps = x_eps.unsqueeze(-1) ret = lA.bmm(x_hat) + sign * lA.abs().bmm(x_eps) + lbias return ret.squeeze(2) def setup_test_matrices( A_bar_base: Tensor, x_L_base: Tensor, x_U_base: Tensor, l_bias_base: Tensor, target_base: Tensor, batches: int ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: """ Creates batch copies of base Tensors and formats them in the same format that they would be in CROWN. @param A_bar_base: shape (spec_dim, input_dim) The lA matrix of the instance @param x_L_base: shape (input_dim,) The lower bound on the input domain @param x_U_base: shape (input_dim,) The upper bound on the input domain @param l_bias_base: shape (spec_dim,) The bias vector of the instance @param target_base: shape (spec_dim,) The threshold/specification to verify @param batches: The number of batch copies to produce of the instance @return: Returns same instance in batch form """ # create the copies lA, x_L, x_U, c_bar, thresholds = create_batch_copies(A_bar_base, x_L_base, x_U_base, l_bias_base, target_base, batches) # This is how x_L, x_U, lbias will be received in CROWN # x_L/U shape: (batch, # inputs) # lA shape: (batch, spec_dim, # inputs) # dom_lb shape: (batch, spec_dim) # thresholds shape: (batch, spec_dim) x_L = x_L.flatten(1) x_U = x_U.flatten(1) c_bar = c_bar.squeeze(-1) thresholds = thresholds.squeeze(-1) # get the global lb x_hat = (x_U + x_L) / 2 x_eps = (x_U - x_L) / 2 dm_lb = concretize_bounds(x_hat, x_eps, lA, c_bar) return lA, x_L, x_U, c_bar, thresholds, dm_lb def create_batch_copies( A_bar_base: Tensor, x_L_base: Tensor, x_U_base: Tensor, l_bias_base: Tensor, target_base: Tensor, batches: int ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: """ Takes a problem not in batch form and turns them into batches. If batches = 1, we only solve the initial problem in batch form, and if batches > 1, we are solving the same problem but in multiple batches. 
@param A_bar_base: @param x_L_base: @param x_U_base: @param l_bias_base: @param target_base: @param batches: @return: """ A_bar = A_bar_base.unsqueeze(0).repeat(batches, 1, 1) x_L = x_L_base.unsqueeze(0).repeat(batches, 1) x_U = x_U_base.unsqueeze(0).repeat(batches, 1) l_bias = l_bias_base.unsqueeze(0).repeat(batches, 1, 1) target = target_base.unsqueeze(0).repeat(batches, 1, 1) return A_bar, x_L, x_U, l_bias, target def random_setup_generator( randint_range=(1, 10), ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict]: """ Creates random problem set-ups to test out if our new heuristic is compatible with various dimensions @param randint_range: A range where batches, spec_dim, and input_dim will exist in @return: """ batches, spec_dim, input_dim = randint(*randint_range), randint(*randint_range), randint(*randint_range) lA = torch.rand((batches, spec_dim, input_dim)) lbias = torch.rand((batches, spec_dim, 1)) thresholds = torch.rand((batches, spec_dim, 1)) parameters = { "batches": batches, "spec_dim": spec_dim, "input_dim": input_dim } return lA, lbias, thresholds, parameters ================================================ FILE: tests/test_constant.py ================================================ """Test BoundConstant""" import torch import os import torch.nn as nn import torch.nn.functional as F import torchvision from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE class cnn_MNIST(nn.Module): def __init__(self): super(cnn_MNIST, self).__init__() self.conv1 = nn.Conv2d(1, 8, 4, stride=2, padding=1) self.conv2 = nn.Conv2d(8, 16, 4, stride=2, padding=1) self.fc1 = nn.Linear(784, 256) self.fc2 = nn.Linear(256, 10) def forward(self, x): x = F.relu(self.conv1(x)) x = F.relu(self.conv2(x)) x = x.view(-1, 784) x = 2.0 * x x = F.relu(self.fc1(x)) x = self.fc2(x) return 0.5 * x class TestConstant(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed=1, ref_name='constant_test_data', generate=generate, device=device, dtype=dtype) def test(self): model = cnn_MNIST() checkpoint = torch.load("../examples/vision/pretrained/mnist_cnn_small.pth", map_location=self.default_device) model.load_state_dict(checkpoint) N = 2 n_classes = 10 image = torch.randn(N, 1, 28, 28) image = image.to(device=self.default_device, dtype=self.default_dtype) / 255.0 model = BoundedModule(model, torch.empty_like(image), device=self.default_device) eps = 0.3 norm = np.inf ptb = PerturbationLpNorm(norm=norm, eps=eps) image = BoundedTensor(image, ptb) pred = model(image) lb, ub = model.compute_bounds() assert lb.shape == ub.shape == torch.Size((2, 10)) self.result = (lb, ub) if self.reference: self.reference = ( self.reference[0].to( device=self.default_device, dtype=self.default_dtype), self.reference[1].to( device=self.default_device, dtype=self.default_dtype) ) self.rtol = 5e-4 self.check() if __name__ == '__main__': # Change to generate=True when genearting reference results testcase = TestConstant(generate=False) testcase.setUp() testcase.test() ================================================ FILE: tests/test_constrained_concretize.py ================================================ """Test optimized bounds in simple_verification.""" import torch import numpy as np from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import PerturbationLpNorm from testcase import TestCase, DEFAULT_DEVICE, 
DEFAULT_DTYPE class ConstrainedConcretizeModel(torch.nn.Module): def __init__(self): super().__init__() self.w1 = torch.tensor([[1., -1.], [2., -1.]]) self.w2 = torch.tensor([[1., -1.]]) def forward(self, x): z1 = x.matmul(self.w1.t()) hz1 = torch.nn.functional.relu(z1) z2 = hz1.matmul(self.w2.t()) return z2 class TestConstrainedConcretize(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, 1, "test_constrained_concretize", generate, device=device, dtype=dtype) def test(self): model = ConstrainedConcretizeModel().to(self.default_device).to(self.default_dtype) # Input x. x = torch.tensor([[1., 1.]], dtype=self.default_dtype, device=self.default_device) # Lower and upper bounds of x. lower = torch.tensor([[-1., -2.]], dtype=self.default_dtype, device=self.default_device) upper = torch.tensor([[2., 1.]], dtype=self.default_dtype, device=self.default_device) # Wrap model with auto_LiRPA for bound computation. # The second parameter is for constructing the trace of the computational graph, # and its content is not important. lirpa_model = BoundedModule(model, torch.empty_like(x)) pred = lirpa_model(x) print(f'Model prediction: {pred.item()}') # Compute bounds using LiRPA using the given lower and upper bounds. norm = float("inf") ptb = PerturbationLpNorm(norm = norm, x_L=lower, x_U=upper) bounded_x = BoundedTensor(x, ptb) # Compute bounds. lb, ub = lirpa_model.compute_bounds(x=(bounded_x,), method='CROWN') print(f'CROWN bounds: lower={lb.item()}, upper={ub.item()}') # Add a new constraint of : # 1*x_0 + 1*x_1 + 2 <= 0 constraint_a = torch.tensor([[[1.0, 1.0]]], dtype=self.default_dtype, device=self.default_device) constraint_b = torch.tensor([[2.0]], dtype=self.default_dtype, device=self.default_device) constraints = (constraint_a, constraint_b) norm = float("inf") ptb = PerturbationLpNorm(norm = norm, x_L=lower, x_U=upper, constraints=constraints) bounded_x = BoundedTensor(x, ptb) # Compute bounds. 
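# Hand check of the toy model above: with w1 = [[1, -1], [2, -1]] and w2 = [[1, -1]], the network
# computes f(x) = relu(x0 - x1) - relu(2*x0 - x1), so at x = (1, 1) the prediction is
# relu(0) - relu(1) = -1. The added input constraint x0 + x1 + 2 <= 0 only shrinks the feasible
# region inside the box [-1, 2] x [-2, 1], so the constrained CROWN bounds computed below should
# be no looser than the unconstrained bounds obtained above.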
constrained_lb, constrained_ub = lirpa_model.compute_bounds(x=(bounded_x,), method='CROWN') print(f'CROWN bounds (with constraints): lower={constrained_lb.item()}, upper={constrained_ub.item()}') self.result = (lb, ub, constrained_lb, constrained_ub) self.check() if __name__ == '__main__': testcase = TestConstrainedConcretize(generate=True) testcase.setUp() testcase.test() ================================================ FILE: tests/test_conv.py ================================================ import torch import torch.nn as nn from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE class cnn_model(nn.Module): def __init__(self, layers, padding, stride, linear): super(cnn_model, self).__init__() self.module_list = [] channel = 1 length = 28 for i in range(layers): self.module_list.append(nn.Conv2d(channel, 3, 4, stride = stride, padding = padding)) channel = 3 length = (length + 2 * padding - 4)//stride + 1 assert length > 0 self.module_list.append(nn.ReLU()) self.module_list.append(nn.Flatten()) if linear: self.module_list.append(nn.Linear(3 * length * length, 256)) self.module_list.append(nn.Linear(256, 10)) self.model = nn.Sequential(*self.module_list) def forward(self, x): x = self.model(x) return x class TestConv(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed=1, ref_name=None, generate=generate, device=device, dtype=dtype) def test(self): models = [1, 2, 3] paddings = [1, 2] strides = [1, 3] N = 2 n_classes = 10 image = torch.randn(N, 1, 28, 28, dtype=self.default_dtype, device=self.default_device) image = image / 255.0 for layer_num in models: for padding in paddings: for stride in strides: for linear in [True, False]: model_ori = cnn_model(layer_num, padding, stride, linear) print('Model:', model_ori) model_ori = model_ori.to( device=self.default_device, dtype=self.default_dtype) model = BoundedModule(model_ori, image, device=self.default_device, bound_opts={"conv_mode": "patches"}) eps = 0.3 ptb = PerturbationLpNorm(x_L=image-eps, x_U=image+eps) image = BoundedTensor(image, ptb) pred = model(image) lb, ub = model.compute_bounds() model = BoundedModule(model_ori, image, device=self.default_device, bound_opts={"conv_mode": "matrix"}) pred = model(image) lb_ref, ub_ref = model.compute_bounds() if linear: assert lb.shape == ub.shape == torch.Size((N, n_classes)) self.assertEqual(lb, lb_ref) self.assertEqual(ub, ub_ref) if not linear and layer_num == 1: pred = model(image) lb_forward, ub_forward = model.compute_bounds(method='forward') self.assertEqual(lb, lb_forward) self.assertEqual(ub, ub_forward) pred = model(image) lb_forward, ub_forward = model.compute_bounds(method='dynamic-forward+backward') self.assertEqual(lb, lb_forward) self.assertEqual(ub, ub_forward) if __name__ == '__main__': testcase = TestConv() testcase.test() ================================================ FILE: tests/test_conv1d.py ================================================ """Test Conv1d.""" import torch import torch.nn as nn import torch.nn.functional as F from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE class Model(nn.Module): def __init__(self, kernel_size=2, stride=1, padding=0, in_features=1,out_features=1): super(Model, self).__init__() self.n_n_conv1d_1 = nn.Conv1d(**{'groups': 1, 'dilation': 1, 
'out_channels': 1, 'padding': padding, 'kernel_size': kernel_size, 'stride': stride, 'in_channels': 1, 'bias': True}) self.n_n_conv1d_2 = nn.Conv1d(**{'groups': 1, 'dilation': 1, 'out_channels': 1, 'padding': padding, 'kernel_size': kernel_size, 'stride': stride, 'in_channels': 1, 'bias': True}) self.relu_2 = nn.ReLU() self.n_n_conv1d_3 = nn.Conv1d(**{'groups': 1, 'dilation': 1, 'out_channels': 1, 'padding': padding, 'kernel_size': kernel_size, 'stride': stride, 'in_channels': 1, 'bias': True}) self.relu_3 = nn.ReLU() self.n_n_activation_Flatten = nn.Flatten(**{'start_dim': 1}) L_in,dialation = in_features,1 L_out_1 = math.floor((L_in+2*padding-dialation*(kernel_size-1)-1)/stride+1) L_out_2 = math.floor((L_out_1+2*padding-dialation*(kernel_size-1)-1)/stride+1) L_out_3 = math.floor((L_out_2+2*padding-dialation*(kernel_size-1)-1)/stride+1) self.n_n_linear = nn.Linear(**{'in_features':L_out_3, 'out_features':out_features,'bias':True}) def forward(self, *inputs,debug=False): t_ImageInputLayer, = inputs t_conv1d_1 = self.n_n_conv1d_1(t_ImageInputLayer) if debug: print("t_ImageInputLayer",t_ImageInputLayer.shape) if debug: print("t_conv1d_1",t_conv1d_1.shape) t_conv1d_relu_1 = F.relu(t_conv1d_1) t_conv1d_2 = self.n_n_conv1d_2(t_conv1d_relu_1) if debug: print("t_conv1d_2",t_conv1d_2.shape) t_conv1d_relu_2 = F.relu(t_conv1d_2) t_conv1d_3 = self.n_n_conv1d_3(t_conv1d_relu_2) if debug: print("t_conv1d_3",t_conv1d_3.shape) t_conv1d_relu_3 = F.relu(t_conv1d_3) t_flatten = self.n_n_activation_Flatten(t_conv1d_relu_3) if debug: print("t_flatten",t_flatten.shape) t_linear = self.n_n_linear(t_flatten) if debug: print("t_linear",t_linear.shape) return t_linear class TestConv1D(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed=1, ref_name=None, generate=generate, device=device, dtype=dtype) def test(self): if self.default_dtype == torch.float64: data_path = 'data_64/' else: data_path = 'data/' N = 3 C = 1 M = 173 n_classes = 2 for kernel_size in [3,4]: for padding in [0,1]: for stride in [2,3]: print(kernel_size, padding, stride) model_ori = Model(kernel_size=kernel_size, padding=padding, stride=stride, in_features=M,out_features=n_classes) model_ori = model_ori.to(dtype=self.default_dtype, device=self.default_device) if not self.generate: data = torch.load(data_path + 'conv1d_test_data_{}-{}-{}'.format(kernel_size, padding, stride), weights_only=False) image = data['input'].to(dtype=self.default_dtype, device=self.default_device) model_ori(image) model_ori.load_state_dict(data['model']) else: image = torch.rand([N, C, M], dtype=self.default_dtype, device=self.default_device) model_ori(image) conv_mode = "matrix" model = BoundedModule(model_ori, image, device=self.default_device, bound_opts={"conv_mode": conv_mode}) eps = 0.3 norm = np.inf ptb = PerturbationLpNorm(norm=norm, eps=eps) image = BoundedTensor(image, ptb) lb, ub, A = model.compute_bounds((image,), return_A=True, needed_A_dict={model.output_name[0]:model.input_name[0]},) ''' # 1. testing if lb == ub == pred when eps = 0 assert (lb == ub).all() and torch.allclose(lb,pred,rtol=1e-5) and torch.allclose(ub,pred,rtol=1e-5) # 2. 
test if A matrix equals to gradient of the input # get output's grad with respect to the input without iterating through torch.autograd.grad: # https://stackoverflow.com/questions/64988010/getting-the-outputs-grad-with-respect-to-the-input uA = A[model.output_name[0]][model.input_name[0]]['uA'] lA = A[model.output_name[0]][model.input_name[0]]['lA'] assert (uA==lA).all() assert (torch.autograd.functional.jacobian(model_ori,image_clean).sum(dim=2)==uA).all() assert (torch.autograd.functional.jacobian(model_ori,image_clean).sum(dim=2)==lA).all() # double check input_grads = torch.zeros(uA.shape) for i in range(N): for j in range(n_classes): input_grads[i][j]=torch.autograd.grad(outputs=output_clean[i,j], inputs=image_clean, retain_graph=True)[0].sum(dim=0) assert (input_grads==uA).all() assert (input_grads==lA).all() ''' # 3. test when eps = 0.3 (uncommented) if self.generate: torch.save( {'model': model_ori.state_dict(), 'input': image, 'lb': lb, 'ub': ub}, data_path + '/conv1d_test_data_{}-{}-{}'.format(kernel_size, padding, stride) ) if not self.generate: lb_ref = data['lb'] ub_ref = data['ub'] assert torch.allclose(lb, lb_ref, 1e-3) assert torch.allclose(ub, ub_ref, 1e-3) if __name__ == '__main__': testcase = TestConv1D(generate=False) testcase.test() ================================================ FILE: tests/test_distinct_patches.py ================================================ import torch import random import numpy as np import torch.nn as nn import torch.nn.functional as F import torchvision from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import PerturbationLpNorm import sys sys.path.append('../examples/vision') from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE def reset_seed(seed=1234): torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) random.seed(seed) np.random.seed(seed) class cnn_4layer_b(nn.Module): def __init__(self, paddingA, paddingB): super().__init__() self.padA = nn.ZeroPad2d(paddingA) self.padB = nn.ZeroPad2d(paddingB) self.conv1 = nn.Conv2d(3, 32, (5,5), stride=2, padding=0) self.conv2 = nn.Conv2d(32, 128, (4,4), stride=2, padding=1) self.linear = None self.fc = nn.Linear(250, 10) def forward(self, x): x = self.padA(x) x = self.conv1(x) x = self.conv2(self.padB(F.relu(x))) x = F.relu(x) x = x.view(x.size(0), -1) if self.linear is None: self.linear = nn.Linear(x.size(1), 250) x = self.linear(x) return self.fc(F.relu(x)) class TestDistinctPatches(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed=1234, ref_name='distinct_patches_test_data', generate=generate, device=device, dtype=dtype) self.cases = [(2,1,2,1), (0,0,0,0), (1,3,3,1), (2,2,3,1)] normalize = torchvision.transforms.Normalize( mean = [0.4914, 0.4822, 0.4465], std = [0.2023, 0.1994, 0.2010] ) test_data = torchvision.datasets.CIFAR10( "./data", train=False, download=True, transform=torchvision.transforms.Compose([ torchvision.transforms.ToTensor(), normalize ]) ) imgs = torch.from_numpy(test_data.data[:1]).reshape(1,3,32,32).float() / 255.0 self.single_img = imgs.to(dtype=self.default_dtype, device=self.default_device) def run_conv_mode(self, model, img, conv_mode): model(img) # dummy run to initialize shapes model_lirpa = BoundedModule( model, img, device=self.default_device, bound_opts={"conv_mode": conv_mode} ) ptb = PerturbationLpNorm(norm = np.inf, eps = 0.03) img_perturbed = BoundedTensor(img, ptb) lb, ub = model_lirpa.compute_bounds( x=(img_perturbed,), 
IBP=False, C=None, method='backward' ) return lb, ub def test(self): self.result = [] for paddingA in self.cases: for paddingB in self.cases: print("Testing", paddingA, paddingB) reset_seed() model_ori = cnn_4layer_b(paddingA, paddingB).to( device=self.default_device, dtype=self.default_dtype ) lb_patch, ub_patch = self.run_conv_mode( model_ori, self.single_img, conv_mode='patches' ) self.result.append((lb_patch, ub_patch)) if self.generate: # We only compare with matrix mode when generating reference results lb_matrix, ub_matrix = self.run_conv_mode( model_ori, self.single_img, conv_mode='matrix' ) # Check equality assert torch.allclose(lb_patch, lb_matrix), "Lower bounds differ!" assert torch.allclose(ub_patch, ub_matrix), "Upper bounds differ!" self.check() if __name__ == '__main__': # Change to generate=True when genearting reference results testcase = TestDistinctPatches(generate=False) testcase.test() ================================================ FILE: tests/test_examples.py ================================================ """Test all the examples before release. This script is expected be manually run and is not used in automatic tests.""" import pytest import subprocess import os import sys import shlex pytest_skip = pytest.mark.skip( reason="It should be tested on a GPU server and excluded from CI") if not 'CACHE_DIR' in os.environ: cache_dir = os.path.join(os.getcwd(), '.cache') else: cache_dir = os.environ['CACHE_DIR'] if not os.path.exists(cache_dir): os.makedirs(cache_dir) def download_data_language(): url = "http://download.huan-zhang.com/datasets/language/data_language.tar.gz" if not os.path.exists('../examples/language/data/sst'): subprocess.run(shlex.split(f"wget {url}"), cwd="../examples/language") subprocess.run(shlex.split(f"tar xvf data_language.tar.gz"), cwd="../examples/language") @pytest_skip def test_transformer(): cmd = f"""python train.py --dir {cache_dir} --robust --method IBP+backward_train --train --num_epochs 2 --num_epochs_all_nodes 2 --eps_start 2 --eps_length 1 --eps 0.1""" print(cmd, file=sys.stderr) download_data_language() subprocess.run(shlex.split(cmd), cwd='../examples/language') @pytest_skip def test_lstm(): cmd = f"""python train.py --dir {cache_dir} --model lstm --lr 1e-3 --dropout 0.5 --robust --method IBP+backward_train --train --num_epochs 2 --num_epochs_all_nodes 2 --eps_start 2 --eps_length 1 --eps 0.1 --hidden_size 2 --embedding_size 2 --intermediate_size 2 --max_sent_length 4""" print(cmd, file=sys.stderr) download_data_language() subprocess.run(shlex.split(cmd), cwd='../examples/language') @pytest_skip def test_lstm_seq(): cmd = f"""python train.py --dir {cache_dir} --hidden_size 2 --num_epochs 2 --num_slices 4""" print(cmd, file=sys.stderr) subprocess.run(shlex.split(cmd), cwd='../examples/sequence') @pytest_skip def test_simple_verification(): cmd = "python simple_verification.py" print(cmd, file=sys.stderr) subprocess.run(shlex.split(cmd), cwd='../examples/vision') @pytest_skip def test_custom_op(): cmd = "python custom_op.py" print(cmd, file=sys.stderr) subprocess.run(shlex.split(cmd), cwd='../examples/vision') @pytest_skip def test_efficient_convolution(): cmd = "python efficient_convolution.py" print(cmd, file=sys.stderr) subprocess.run(shlex.split(cmd), cwd='../examples/vision') @pytest_skip def test_two_node(): cmd = "python verify_two_node.py" print(cmd, file=sys.stderr) subprocess.run(shlex.split(cmd), cwd='../examples/vision') @pytest_skip def test_simple_training(): cmd = """python simple_training.py --num_epochs 5 
--scheduler_opts start=2,length=2""" print(cmd, file=sys.stderr) subprocess.run(shlex.split(cmd), cwd='../examples/vision') @pytest_skip def test_cifar_training(): cmd = """python cifar_training.py --batch_size 64 --model ResNeXt_cifar --num_epochs 5 --scheduler_opts start=2,length=2""" print(cmd, file=sys.stderr) subprocess.run(shlex.split(cmd), cwd='../examples/vision') @pytest_skip def test_weight_perturbation(): cmd = """python weight_perturbation_training.py --norm 2 --bound_type CROWN-IBP --num_epochs 3 --scheduler_opts start=2,length=1 --eps 0.01""" print(cmd, file=sys.stderr) subprocess.run(shlex.split(cmd), cwd='../examples/vision') @pytest_skip def test_tinyimagenet(): cmd = f"""python tinyimagenet_training.py --batch_size 32 --model wide_resnet_imagenet64 --num_epochs 3 --scheduler_opts start=2,length=1 --eps {0.1/255} --in_planes 2 --widen_factor 2""" print(cmd, file=sys.stderr) if not os.path.exists('../examples/vision/data/tinyImageNet/tiny-imagenet-200'): subprocess.run(shlex.split("bash tinyimagenet_download.sh"), cwd="../examples/vision/data/tinyImageNet") subprocess.run(shlex.split(cmd), cwd='../examples/vision') @pytest_skip def test_imagenet(): cmd = f"""python imagenet_training.py --batch_size 32 --model wide_resnet_imagenet64_1000class --num_epochs 3 --scheduler_opts start=2,length=1 --eps {0.1/255} --in_planes 2 --widen_factor 2""" print(cmd) if (not os.path.exists('../examples/vision/data/ImageNet64/train') or not os.path.exists('../examples/vision/data/ImageNet64/test')): print('Error: ImageNet64 dataset is not ready.') return -1 subprocess.run(shlex.split(cmd), cwd='../examples/vision') def test_release(): """Run all tests that don't require a GPU server.""" test_simple_verification() test_custom_op() test_efficient_convolution() test_two_node() if __name__ == '__main__': test_release() ================================================ FILE: tests/test_examples_ci.py ================================================ import subprocess import traceback import test_examples original_subprocess_run = subprocess.run def custom_run(*args, **kwargs): kwargs.setdefault('check', True) return original_subprocess_run(*args, **kwargs) subprocess.run = custom_run def run_tests(): # get all func start with test in test_examples other than 'test_release' # and 'test_cifar_training'(cannot run on GPU with memory lower than 32GB) test_functions = [ getattr(test_examples, func) for func in dir(test_examples) if callable(getattr(test_examples, func)) and func.startswith('test') and func not in ['test_release'] ] try: for test_func in test_functions: test_func() print(f"{test_func.__name__} executed successfully.") except Exception as e: print(f"Exception in {test_func.__name__}: {e}") traceback.print_exc() # Print detailed exception information print("Examples Test Result:") print("\nFailed tests:") print(test_func.__name__) raise print("Examples Test Result:") print("\nAll tests passed successfully.") if __name__ == '__main__': run_tests() ================================================ FILE: tests/test_general_nonlinear.py ================================================ import sys import pytest import torch.nn as nn sys.path.insert(0, '../complete_verifier') import arguments from beta_CROWN_solver import LiRPANet from bab import general_bab from auto_LiRPA import BoundedTensor from auto_LiRPA.perturbations import * from testcase import DEFAULT_DEVICE, DEFAULT_DTYPE class Sin(nn.Module): def forward(self, x): return torch.sin(x) def cifar_model_wide(): # cifar wide model = 
nn.Sequential( nn.Conv2d(3, 16, 4, stride=2, padding=1), Sin(), nn.Conv2d(16, 32, 4, stride=2, padding=1), Sin(), nn.Flatten(), nn.Linear(32 * 8 * 8, 100), Sin(), nn.Linear(100, 10) ) return model def bab(model_ori, data, target, norm, eps, data_max=None, data_min=None, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): data = data.to(device=device, dtype=dtype) eps = eps.to(device=device, dtype=dtype) if norm == np.inf: if data_max is None: data_ub = data + eps data_lb = data - eps else: data_max = data_max.to(device=device, dtype=dtype) data_min = data_min.to(device=device, dtype=dtype) data_ub = torch.min(data + eps, data_max) data_lb = torch.max(data - eps, data_min) else: data_ub = data_lb = data pred = torch.argmax(model_ori(data), dim=1) c = torch.zeros((1, 1, 10), device=device, dtype=dtype) # we only support c with shape of (1, 1, n) c[0, 0, pred] = 1 c[0, 0, target] = -1 rhs = torch.tensor(arguments.Config["bab"]["decision_thresh"], dtype=dtype, device=device).view(c.shape[:2]) arguments.Config.parse_config(args={}) arguments.Config['general']['device'] = 'cpu' arguments.Config["solver"]["batch_size"] = 200 arguments.Config["bab"]["decision_thresh"] = np.float64(10) # naive float obj has no max() function, np.inf will lead infeasible domain arguments.Config["solver"]["beta-crown"]["iteration"] = 20 arguments.Config["bab"]["timeout"] = 60 #300 arguments.Config["solver"]["alpha-crown"]["lr_alpha"] = 0.1 arguments.Config["solver"]["beta-crown"]["lr_beta"] = 0.1 arguments.Config["bab"]["branching"]["method"] = 'nonlinear' arguments.Config["bab"]["branching"]["candidates"] = 2 arguments.Config["general"]["enable_incomplete_verification"] = False arguments.Config["data"]["dataset"] = 'cifar' # LiRPA wrapper model = LiRPANet(model_ori, device=device, in_size=(1, 3, 32, 32)) ptb = PerturbationLpNorm(norm=norm, eps=eps, x_L=data_lb, x_U=data_ub) x = BoundedTensor(data, ptb) forward = model_ori(x) min_lb = general_bab(model, x, c, rhs)[0] if isinstance(min_lb, torch.Tensor): min_lb = min_lb.item() min_lb += arguments.Config["bab"]["decision_thresh"] print(min_lb) assert min_lb < torch.min(forward) # This test takes long time so it is set as the last test case. 
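# For reference: the specification matrix `c` built in bab() has one row per example
# with +1 at the predicted class and -1 at the attacked class, so C @ f(x) is the
# margin f_pred(x) - f_target(x); general_bab() then computes a verified lower bound
# on this margin. A minimal sketch of that construction (hypothetical helper, not
# used by this test):
def _margin_spec(pred, target, n_classes=10, device='cpu', dtype=torch.float32):
    # One output row: +1 on the predicted class, -1 on the attacked class.
    c = torch.zeros((1, 1, n_classes), device=device, dtype=dtype)
    c[0, 0, pred] = 1.
    c[0, 0, target] = -1.
    return c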
@pytest.mark.skip(reason="The test is failing now after removing index clamping.") # @pytest.mark.order(-1) def test(device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): model_ori = cifar_model_wide() data = torch.load('data/beta_crown_test_data') model_ori.load_state_dict(data['state_dict']) model_ori = model_ori.to(device=device, dtype=dtype) x = data['x'] pidx = data['pidx'] eps_temp = data['eps_temp'] data_max = data['data_max'] data_min = data['data_min'] bab(model_ori, x, pidx, float('inf'), eps_temp, data_max=data_max, data_min=data_min, device=device, dtype=dtype) if __name__ == "__main__": test() ================================================ FILE: tests/test_general_shape.py ================================================ """ Test inputs of general shapes (especially for matmul)""" import torch import torch.nn as nn import numpy as np from auto_LiRPA import BoundedModule, BoundedTensor, PerturbationLpNorm from auto_LiRPA.operators import BoundMatMul from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE BATCH_SIZE = 2 class GeneralShapeModel(nn.Module): def __init__(self): super().__init__() self.weight_1 = nn.Parameter(torch.randn(3, 4)) self.weight_2 = nn.Parameter(torch.randn(4, 3)) self.weight_3 = nn.Parameter(torch.randn(3, 4)) self.weight_4 = nn.Parameter(torch.randn(4, 4, 3)) self.weight_5 = nn.Parameter(torch.randn(6, 3, 4)) self.weight_6 = nn.Parameter(torch.randn(3, 5)) self.relu = nn.ReLU() def forward(self, x, w): # Basic MatMul (B, 3) @ (3, 4) -> (B, 4) y1 = x.matmul(self.weight_1) # BoundUnsqueeze and BoundTile y2 = self.relu(y1) y2 = y2.unsqueeze(1).repeat(1, 5, 1) # (B, 5, 4) y2 = y2.matmul(self.weight_2) # (B, 5, 4) @ (4, 3) -> (B, 5, 3) # More dimensions on x y3 = self.relu(y2) y3 = y3.unsqueeze(1).repeat(1, 4, 1, 1) # (B, 4, 5, 3) y3 = y3.matmul(self.weight_3) # (B, 4, 5, 3) @ (3, 4) -> (B, 4, 5, 4) # More dimensions on weight y4 = self.relu(y3) y4 = y4.matmul(self.weight_4) # (B, 4, 5, 4) @ (4, 4, 3) -> (B, 4, 5, 3) # Automatically broadcast x y5 = self.relu(y4) y5 = y5.unsqueeze(2) # (B, 4, 1, 5, 3) y5 = y5.matmul(self.weight_5) # (B, 4, 1, 5, 3) @ (6, 3, 4) -> (B, 4, 6, 5, 4) # Multiply with a weight with batch dimension y6 = self.relu(y5) y6 = y6.matmul(w) # (B, 4, 6, 5, 4) @ (B, 4, 6, 4, 3) -> (B, 4, 6, 5, 3) # Swap x and weight y7 = self.relu(y6) y7 = self.weight_6.matmul(y7) # (3, 5) @ (B, 4, 6, 5, 3) -> (B, 4, 6, 3, 3) return y7 class TestGeneralShape(TestCase): def __init__(self, methodName='runTest', seed=1, generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed, 'test_general_shape_data', generate, device=device, dtype=dtype) self.rtol = 1e-4 def test(self): model = GeneralShapeModel().to(device=self.default_device, dtype=self.default_dtype) input = torch.randn( (BATCH_SIZE, 3), device=self.default_device, dtype=self.default_dtype) eps = 100 ptb = PerturbationLpNorm(norm=np.inf, eps=eps) x = BoundedTensor(input, ptb) # w is an unperturbed input, but still have batch dimension w = torch.randn((BATCH_SIZE, 4, 6, 4, 3), device=self.default_device, dtype=self.default_dtype) lirpa_model = BoundedModule(model, (x, w), device=self.default_device) lb, ub = lirpa_model.compute_bounds((x, w), method="backward") # # Test by sampling # sample_ptb = torch.rand(BATCH_SIZE, *input.shape[1:]) * 2 * eps - eps # sample_inputs = input[0] + sample_ptb # sample_output = model(sample_inputs, w) # assert (sample_output <= ub).all() # assert (sample_output >= lb).all() self.result = [] for node in lirpa_model.nodes(): if type(node) 
== BoundMatMul: self.result.append((node.lower, node.upper)) self.result.append((lb, ub)) self.check() if __name__ == '__main__': testcase = TestGeneralShape(generate=False) testcase.setUp() testcase.test() ================================================ FILE: tests/test_identity.py ================================================ """Test a model with an nn.Identity layer only""" import torch import torch.nn as nn from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE class TestIdentity(TestCase): def __init__(self, methodName='runTest', device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, device=device, dtype=dtype) def test(self): model = nn.Sequential(nn.Identity()) x = torch.randn(2, 10, device=self.default_device, dtype=self.default_dtype) y = model(x) eps = 0.1 ptb = PerturbationLpNorm(norm=np.inf, eps=eps) x = BoundedTensor(x, ptb) model = BoundedModule(model, x, device=self.default_device) y_l, y_u = model.compute_bounds() self.assertEqual(torch.Tensor(x), y) self.assertEqual(y_l, x - eps) self.assertEqual(y_u, x + eps) if __name__ == '__main__': testcase = TestIdentity() testcase.test() ================================================ FILE: tests/test_invprop.py ================================================ """Test INVPROP.""" import sys sys.path.append('../complete_verifier') from complete_verifier.load_model import unzip_and_optimize_onnx import torch import torch.nn as nn from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE class SimpleExampleModel(nn.Module): def __init__(self): super().__init__() # Weights of linear layers. self.w1 = torch.tensor([[1., -1.], [2., -1.]]) self.w2 = torch.tensor([[1., -1.]]) def forward(self, x): # Linear layer. z1 = x.matmul(self.w1.t()) # Relu layer. hz1 = torch.nn.functional.relu(z1) # Linear layer. 
z2 = hz1.matmul(self.w2.t()) return z2 class TestInvpropSimpleExample(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed=1, ref_name=None, generate=generate, device=device, dtype=dtype) def test(self): np.random.seed(123) model_ori = SimpleExampleModel().to( device=self.default_device, dtype=self.default_dtype) apply_output_constraints_to = ['BoundMatMul', 'BoundInput'] x = torch.tensor([[1., 1.]], device=self.default_device, dtype=self.default_dtype) model = BoundedModule(model_ori, torch.empty_like(x), bound_opts={ 'optimize_bound_args': { 'apply_output_constraints_to': apply_output_constraints_to, 'tighten_input_bounds': True, 'best_of_oc_and_no_oc': False, 'directly_optimize': [], 'oc_lr': 0.1, 'share_gammas': False, 'iteration': 1000, } }, device=self.default_device ) model.constraints = torch.ones( 1, 1, 1, device=self.default_device, dtype=self.default_dtype) model.thresholds = torch.tensor( [-1.], device=self.default_device, dtype=self.default_dtype) norm = float("inf") lower = torch.tensor( [[-1., -2.]], device=self.default_device, dtype=self.default_dtype) upper = torch.tensor( [[2., 1.]], device=self.default_device, dtype=self.default_dtype) ptb = PerturbationLpNorm(norm = norm, x_L=lower, x_U=upper) bounded_x = BoundedTensor(x, ptb) lb, ub = model.compute_bounds(x=(bounded_x,), method='alpha-CROWN') if '/0' in model._modules: tightened_ptb = model['/0'].perturbation else: tightened_ptb = model['/x'].perturbation if self.default_dtype == torch.float64: data_path = 'data_64/' else: data_path = 'data/' if self.generate: torch.save({ 'lb': lb, 'ub': ub, 'x_L': tightened_ptb.x_L, 'x_U': tightened_ptb.x_U }, data_path + 'invprop/simple_reference') else: data = torch.load(data_path + 'invprop/simple_reference') lb_ref = data['lb'] ub_ref = data['ub'] x_L_ref = data['x_L'] x_U_ref = data['x_U'] assert torch.allclose(lb, lb_ref, 1e-4) assert torch.allclose(ub, ub_ref, 1e-4) assert torch.allclose(tightened_ptb.x_L, x_L_ref, 1e-4) assert torch.allclose(tightened_ptb.x_U, x_U_ref, 1e-4) class TestInvpropOODExample(TestCase): # Based on https://github.com/kothasuhas/verify-input/tree/main/examples/ood def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed=1, ref_name=None, generate=generate, device=device, dtype=dtype) def test(self): np.random.seed(123) import onnx2pytorch model_ori = onnx2pytorch.ConvertModel(unzip_and_optimize_onnx('data/invprop/ood.onnx')).eval() model_ori = model_ori.to( device=self.default_device, dtype=self.default_dtype) x = torch.tensor([[1., 1.]], device=self.default_device, dtype=self.default_dtype) model = BoundedModule(model_ori, torch.empty_like(x), bound_opts={ 'optimize_bound_args': { 'apply_output_constraints_to': ['BoundInput', "/input", "/input-3", "/21"], 'tighten_input_bounds': True, 'best_of_oc_and_no_oc': True, 'directly_optimize': ['/input'], 'oc_lr': 0.01, 'iteration': 1000, 'share_gammas': False, 'lr_decay': 0.99, 'early_stop_patience': 1000, 'init_alpha': False, 'lr_alpha': 0.4, 'start_save_best': -1, } }, device=self.default_device ) model.constraints = torch.tensor( [[[-1., 0., 1.]], [[0., -1., 1.]]], device=self.default_device, dtype=self.default_dtype) model.thresholds = torch.tensor( [0., 0.], device=self.default_device, dtype=self.default_dtype) norm = float("inf") lower = torch.tensor( [[-2., -2.], [-2., -2.]], device=self.default_device, dtype=self.default_dtype) 
upper = torch.tensor( [[0., 0.], [0., 0.]], device=self.default_device, dtype=self.default_dtype) ptb = PerturbationLpNorm(norm = norm, x_L=lower, x_U=upper) x_expand = BoundedTensor(torch.tensor( [[-1., -1.], [-1., -1.]], device=self.default_device, dtype=self.default_dtype), ptb) c = torch.tensor([[[-1., 0., 1.]], [[0., -1., 1.]]], device=self.default_device, dtype=self.default_dtype) # Init manually, to set bound_upper=False model.init_alpha( (x_expand,), share_alphas=False, c=c, bound_upper=False) model.compute_bounds(x=(x_expand,), C=c, method='CROWN-Optimized') if self.default_dtype == torch.float64: data_path = 'data_64/' else: data_path = 'data/' if self.generate: torch.save({ 'lower': model['/input'].lower, 'upper': model['/input'].upper, }, data_path + 'invprop/ood_reference') else: data = torch.load(data_path + 'invprop/ood_reference') lower_ref = data['lower'] upper_ref = data['upper'] lower_diff = model['/input'].lower[0] - lower_ref[0] assert torch.allclose(model['/input'].lower[0], lower_ref[0], atol=1e-3), (lower_diff, lower_diff.abs().max()) assert torch.all(torch.isposinf(lower_ref[1])) assert torch.all(torch.isposinf(model['/input'].lower[1])) upper_diff = model['/input'].upper[0] - upper_ref[0] assert torch.allclose(model['/input'].upper[0], upper_ref[0], atol=1e-3), (upper_diff, upper_diff.abs().max()) assert torch.all(torch.isneginf(upper_ref[1])) assert torch.all(torch.isneginf(model['/input'].upper[1])) if __name__ == '__main__': testcase = TestInvpropSimpleExample(generate=False) testcase.test() testcase = TestInvpropOODExample(generate=False) testcase.test() ================================================ FILE: tests/test_jacobian.py ================================================ # pylint: disable=wrong-import-position """Test Jacobian bounds.""" import sys import torch import torch.nn as nn sys.path.append('../examples/vision') from jacobian import compute_jacobians from auto_LiRPA import BoundedModule from auto_LiRPA.utils import Flatten from auto_LiRPA.jacobian import JacobianOP from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE class TestJacobian(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__( methodName, seed=1, ref_name='jacobian_test_data', generate=generate, device=device, dtype=dtype) def test(self): in_dim, linear_size = 8, 100 model = nn.Sequential( Flatten(), nn.Linear(3*in_dim**2, linear_size), nn.ReLU(), nn.Linear(linear_size, linear_size), nn.Tanh(), nn.Linear(linear_size, linear_size), nn.Sigmoid(), nn.Linear(linear_size, 10), ) model = model.to(device=self.default_device, dtype=self.default_dtype) x0 = torch.randn(1, 3, in_dim, in_dim, device=self.default_device, dtype=self.default_dtype) self.result = compute_jacobians(model, x0) self.check() def test_concat_jacobian(self): ''' Test JacobianOP with Concat operation. This needs some special handling in auto_LiRPA to make it work properly. (See parse_graph.py for details.) 
''' class ConcatModule(nn.Module): def forward(self, x): return JacobianOP.apply(torch.cat([x, x], dim=1), x) concatmodel = ConcatModule().to(device=self.default_device, dtype=self.default_dtype) x0 = torch.randn(1, 5, device=self.default_device, dtype=self.default_dtype) BoundedModule(concatmodel, x0) print('Concat JacobianOP test passed.') if __name__ == '__main__': # Change to generate=True when genearting reference results testcase = TestJacobian(generate=False) testcase.setUp() testcase.test() ================================================ FILE: tests/test_language_models.py ================================================ """Test classes for Transformer and LSTM on language tasks""" import os import argparse import pickle import torch from auto_LiRPA.utils import logger parser = argparse.ArgumentParser() parser.add_argument('--gen_ref', action='store_true', help='generate reference results') parser.add_argument('--train', action='store_true', help='pre-train the models') parser.add_argument('--keep_results', action='store_true', help='keep intermediate results.') parser.add_argument('--load_results', action='store_true', help='load intermediate results without reruning.') args, unknown = parser.parse_known_args() def prepare_data(): os.system('cd ../examples/language;\ wget http://download.huan-zhang.com/datasets/language/data_language.tar.gz;\ tar xvf data_language.tar.gz') cmd_transformer_train = 'cd ../examples/language; \ DIR=model_transformer_test; \ python train.py --hidden_size=16 --embedding_size=16 --intermediate_size=16 --max_sent_length=16 \ --dir=$DIR --robust --method=IBP+backward_train \ --num_epochs=2 --num_epochs_all_nodes=1 --eps_start=2 --train' cmd_transformer_test = 'cd ../examples/language; \ python train.py --hidden_size=16 --embedding_size=16 --intermediate_size=16 --max_sent_length=16 \ --robust --method=IBP+backward --budget=1 --auto_test --eps=0.2 --load=../../tests/data/ckpt_transformer \ --device=cpu' cmd_lstm_train = 'cd ../examples/language; \ DIR=model_lstm_test; \ python train.py --hidden_size=16 --embedding_size=16 --max_sent_length=16 \ --dir=$DIR --model=lstm --lr=1e-3 --robust --method=IBP+backward_train --dropout=0.5 \ --num_epochs=2 --num_epochs_all_nodes=1 --eps_start=2 --train' cmd_lstm_test = 'cd ../examples/language; \ python train.py --model=lstm --hidden_size=16 --embedding_size=16 --max_sent_length=16 \ --robust --method=IBP+backward --budget=1 --auto_test --eps=0.2 --load=../../tests/data/ckpt_lstm \ --device=cpu' res_path = '../examples/language/res_test.pkl' """Pre-train a simple Transformer and LSTM respectively""" def train(): if os.path.exists("../examples/language/model_transformer_test"): os.system("rm -rf ../examples/language/model_transformer_test") if os.path.exists("../examples/language/model_lstm_test"): os.system("rm -rf ../examples/language/model_lstm_test") logger.info("\nTraining a Transformer") print(cmd_transformer_train) print() os.system(cmd_transformer_train) os.system("cp ../examples/language/model_transformer_test/ckpt_2 data/ckpt_transformer") logger.info("\nTraining an LSTM") print(cmd_lstm_train) print() os.system(cmd_lstm_train) os.system("cp ../examples/language/model_lstm_test/ckpt_2 data/ckpt_lstm") def read_res(): with open(res_path, 'rb') as file: return pickle.load(file) def evaluate(): if args.load_results: print("loading intermediate results...") with open("./tmp_language_results.pkl", "rb") as file: return pickle.load(file) logger.info('\nEvaluating the trained LSTM') print(cmd_lstm_test) print() 
os.system(cmd_lstm_test) res_lstm = read_res() logger.info('\nEvaluating the trained Transformer') print(cmd_transformer_test) print() os.system(cmd_transformer_test) res_transformer = read_res() os.system("rm {}".format(res_path)) if args.keep_results: with open("./tmp_language_results.pkl", "wb") as file: pickle.dump((res_transformer, res_lstm), file) print("intermediate results saved.") return res_transformer, res_lstm def gen_ref(): if args.train: train() res_transformer, res_lstm = evaluate() with open('data/language_test_data', 'wb') as file: pickle.dump((res_transformer, res_lstm), file) logger.info('Reference results saved') def check(): with open('data/language_test_data', 'rb') as file: res_transformer_ref, res_lstm_ref = pickle.load(file) res_transformer, res_lstm = evaluate() for res, res_ref in zip([res_transformer, res_lstm], [res_transformer_ref, res_lstm_ref]): for a, b in zip(res, res_ref): ta, tb = torch.tensor(a), torch.tensor(b) diff = torch.max(torch.abs(ta - tb)) assert diff < 1e-5, diff assert (torch.tensor(a) - torch.tensor(b)).pow(2).sum() < 1e-9 def test(): if not os.path.exists('../examples/language/data'): prepare_data() if args.gen_ref: gen_ref() else: check() logger.info("test_Language done") if __name__ == '__main__': test() ================================================ FILE: tests/test_linear_cnn_model.py ================================================ """Test bounds on a 1 layer CNN network.""" import torch.nn as nn from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * from test_linear_model import TestLinearModel from testcase import DEFAULT_DEVICE, DEFAULT_DTYPE input_dim = 8 out_channel = 2 N = 10 class LinearCNNModel(nn.Module): def __init__(self): super().__init__() self.conv = nn.Conv2d(1, out_channel, 3, stride=2, padding=1) def forward(self, x): x = self.conv(x) x = x.view(-1, input_dim //2 * input_dim // 2 * out_channel) return x class TestLinearCNNModel(TestLinearModel): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, device=device, dtype=dtype) self.original_model = LinearCNNModel().to(device=device, dtype=dtype) def compute_and_compare_bounds(self, eps, norm, IBP, method): input_data = torch.randn((N, 1, input_dim, input_dim)) model = BoundedModule(self.original_model, torch.empty_like(input_data), device=self.default_device) ptb = PerturbationLpNorm(norm=norm, eps=eps) ptb_data = BoundedTensor(input_data, ptb) pred = model(ptb_data) label = torch.argmax(pred, dim=1).cpu().detach().numpy() # Compute bounds. lb, ub = model.compute_bounds(IBP=IBP, method=method) # Compute reference. conv_weight, conv_bias = list(model.parameters()) conv_bias = conv_bias.view(1, out_channel, 1, 1) matrix_eye = torch.eye(input_dim * input_dim).view(input_dim * input_dim, 1, input_dim, input_dim) # Obtain equivalent weight and bias for convolution. weight = self.original_model.conv(matrix_eye) - conv_bias # Output is (batch, channel, weight, height). weight = weight.view(input_dim * input_dim, -1) # Dimension is (flattened_input, flattened_output). bias = conv_bias.repeat(1, 1, input_dim //2, input_dim //2).view(-1) flattend_data = input_data.view(N, -1) # Compute dual norm. if norm == 1: q = np.inf elif norm == np.inf: q = 1.0 else: q = 1.0 / (1.0 - (1.0 / norm)) # Manually compute bounds. 
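        # For a purely linear map f(x) = W x + b and an Lp ball of radius eps around x,
        # Hölder's inequality gives max_{||d||_p <= eps} w_i . d = eps * ||w_i||_q with
        # 1/p + 1/q = 1, so the exact bounds are f(x)_i +/- eps * ||w_i||_q. The code
        # below evaluates this closed form, with the convolution rewritten as an
        # equivalent dense weight matrix.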
norm = weight.t().norm(p=q, dim=1) expected_pred = flattend_data.matmul(weight) + bias expected_ub = eps * norm + expected_pred expected_lb = -eps * norm + expected_pred # Check equivalence. if method == 'backward' or method == 'forward': self.rtol = 1e-4 self.assertEqual(expected_pred, pred) self.assertEqual(expected_ub, ub) self.assertEqual(expected_lb, lb) ================================================ FILE: tests/test_linear_model.py ================================================ """Test bounds on a 1 layer linear network.""" import torch.nn as nn from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE n_classes = 3 N = 10 class LinearModel(nn.Module): def __init__(self): super().__init__() self.fc = nn.Linear(256, n_classes) def forward(self, x): x = self.fc(x) return x class TestLinearModel(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed=0, device=device, dtype=dtype) self.original_model = LinearModel().to(device=device, dtype=dtype) def compute_and_compare_bounds(self, eps, norm, IBP, method): input_data = torch.randn( (N, 256), device=self.default_device, dtype=self.default_dtype) model = BoundedModule(self.original_model, torch.empty_like(input_data), device=self.default_device) ptb = PerturbationLpNorm(norm=norm, eps=eps) ptb_data = BoundedTensor(input_data, ptb) pred = model(ptb_data) label = torch.argmax(pred, dim=1).cpu().detach().numpy() # Compute bounds. lb, ub = model.compute_bounds(IBP=IBP, method=method) # Compute dual norm. if norm == 1: q = np.inf elif norm == np.inf: q = 1.0 else: q = 1.0 / (1.0 - (1.0 / norm)) # Compute reference manually. weight, bias = list(model.parameters()) norm = weight.norm(p=q, dim=1) expected_pred = input_data.matmul(weight.t()) + bias expected_ub = eps * norm + expected_pred expected_lb = -eps * norm + expected_pred # Check equivalence. 
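        # For a single linear layer, the CROWN (backward), forward, and IBP bounds should
        # all coincide with the exact dual-norm bound computed above, which is why the
        # comparison below only allows a small relative tolerance.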
self.rtol = 1e-4 self.assertEqual(expected_pred, pred) self.assertEqual(expected_ub, ub) self.assertEqual(expected_lb, lb) def test_Linf_forward(self): with np.errstate(divide='ignore'): self.compute_and_compare_bounds(eps=0.3, norm=np.inf, IBP=False, method='forward') def test_Linf_backward(self): with np.errstate(divide='ignore'): self.compute_and_compare_bounds(eps=0.3, norm=np.inf, IBP=False, method='backward') def test_Linf_IBP(self): with np.errstate(divide='ignore'): self.compute_and_compare_bounds(eps=0.3, norm=np.inf, IBP=True, method=None) def test_Linf_backward_IBP(self): with np.errstate(divide='ignore'): self.compute_and_compare_bounds(eps=0.3, norm=np.inf, IBP=True, method='backward') def test_L2_forward(self): with np.errstate(divide='ignore'): self.compute_and_compare_bounds(eps=1.0, norm=2, IBP=False, method='forward') def test_L2_backward(self): with np.errstate(divide='ignore'): self.compute_and_compare_bounds(eps=1.0, norm=2, IBP=False, method='backward') def test_L2_IBP(self): with np.errstate(divide='ignore'): self.compute_and_compare_bounds(eps=1.0, norm=2, IBP=True, method=None) def test_L2_backward_IBP(self): with np.errstate(divide='ignore'): self.compute_and_compare_bounds(eps=1.0, norm=2, IBP=True, method='backward') def test_L1_forward(self): with np.errstate(divide='ignore'): self.compute_and_compare_bounds(eps=3.0, norm=1, IBP=False, method='forward') def test_L1_backward(self): with np.errstate(divide='ignore'): self.compute_and_compare_bounds(eps=3.0, norm=1, IBP=False, method='backward') def test_L1_IBP(self): with np.errstate(divide='ignore'): self.compute_and_compare_bounds(eps=3.0, norm=1, IBP=True, method=None) def test_L1_backward_IBP(self): with np.errstate(divide='ignore'): self.compute_and_compare_bounds(eps=3.0, norm=1, IBP=True, method='backward') ================================================ FILE: tests/test_maxpool.py ================================================ """Test max pooling.""" import torch import torch.nn as nn import torch.nn.functional as F from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE class Model(nn.Module): def __init__(self, kernel_size=4, stride=4, padding=0, conv_padding=0): super(Model, self).__init__() self.n_n_conv2d = nn.Conv2d(**{'groups': 1, 'dilation': [1, 1], 'out_channels': 1, 'padding': conv_padding, 'kernel_size': (2, 2), 'stride': [1, 1], 'in_channels': 1, 'bias': True}) self.n_n_maxpool = nn.MaxPool2d(**{'kernel_size': [kernel_size, kernel_size], 'ceil_mode': False, 'stride': [stride, stride], 'padding': [padding, padding]}) self.n_n_conv2d_2 = nn.Conv2d(**{'groups': 1, 'dilation': [1, 1], 'out_channels': 1, 'padding': [conv_padding, conv_padding], 'kernel_size': (2, 2), 'stride': [1, 1], 'in_channels': 1, 'bias': True}) self.n_n_maxpool_2 = nn.MaxPool2d(**{'kernel_size': [kernel_size, kernel_size], 'ceil_mode': False, 'stride': [stride, stride], 'padding': [padding, padding]}) self.n_n_flatten_Flatten = nn.Flatten(**{'start_dim': 1}) self.n_n_dense = None self.n_n_activation_Flatten = nn.Flatten(**{'start_dim': 1}) def forward(self, *inputs): t_ImageInputLayer, = inputs t_conv2d = self.n_n_conv2d(t_ImageInputLayer) t_conv2d_relu = F.relu(t_conv2d) t_maxpool = self.n_n_maxpool(t_conv2d_relu)[:, :, :, :] t_conv2d_max = self.n_n_conv2d_2(t_maxpool) t_conv2d_max = F.relu(t_conv2d_max) # t_maxpool_2 = self.n_n_maxpool_2(t_conv2d_max) t_flatten_Transpose = t_conv2d_max.permute(*[0, 2, 3, 1]) t_flatten_Flatten 
= self.n_n_flatten_Flatten(t_flatten_Transpose) t_flatten_Unsqueeze = torch.unsqueeze(t_flatten_Flatten, 2) t_flatten_Unsqueeze = torch.unsqueeze(t_flatten_Unsqueeze, 3) if self.n_n_dense is None: self.n_n_dense = nn.Conv2d(**{'groups': 1, 'dilation': [1, 1], 'out_channels': 2, 'padding': [0, 0], 'kernel_size': (1, 1), 'stride': [1, 1], 'in_channels': t_flatten_Unsqueeze.shape[1], 'bias': True}) t_dense = self.n_n_dense(t_flatten_Unsqueeze) t_activation_Flatten = self.n_n_activation_Flatten(t_dense) return t_activation_Flatten class TestMaxPool(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed=1, ref_name=None, generate=generate, device=device, dtype=dtype) def test(self): if self.default_dtype == torch.float64: data_path = 'data_64/' else: data_path = 'data/' N = 2 for kernel_size in [3,4]: for padding in [0,1]: for conv_padding in [0,1]: print(kernel_size, padding, kernel_size, conv_padding) model_ori = Model(kernel_size=kernel_size, padding=padding, stride=kernel_size, conv_padding=conv_padding).to( device=self.default_device, dtype=self.default_dtype) if not self.generate: data = torch.load(data_path + 'maxpool_test_data_{}-{}-{}-{}'.format(kernel_size, padding, kernel_size, conv_padding), weights_only=False) image = data['input'] model_ori(image) model_ori.load_state_dict(data['model']) else: image = torch.rand([N, 1, 28, 28]) model_ori(image) if self.generate: conv_mode = "matrix" else: conv_mode = "patches" model = BoundedModule(model_ori, image, device=self.default_device, bound_opts={"conv_mode": conv_mode}) eps = 0.3 norm = np.inf ptb = PerturbationLpNorm(norm=norm, eps=eps) image = BoundedTensor(image, ptb) lb, ub = model.compute_bounds((image,)) if self.generate: torch.save( {'model': model_ori.state_dict(), 'input': image, 'lb': lb, 'ub': ub}, data_path + 'maxpool_test_data_{}-{}-{}-{}'.format(kernel_size, padding, kernel_size, conv_padding) ) if not self.generate: lb_ref = data['lb'] ub_ref = data['ub'] assert torch.allclose(lb, lb_ref, 1e-4) assert torch.allclose(ub, ub_ref, 1e-4) if __name__ == '__main__': testcase = TestMaxPool(generate=False) testcase.test() ================================================ FILE: tests/test_min_max.py ================================================ import os import torch import torch.nn as nn import torchvision from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import PerturbationLpNorm from auto_LiRPA.utils import * from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE class Test_Model(nn.Module): def __init__(self): super(Test_Model, self).__init__() self.seq1 = nn.Sequential( nn.Conv2d(1, 16, 4, stride=2, padding=1), nn.ReLU(), nn.Conv2d(16, 32, 4, stride=2, padding=1) ) self.seq2 = nn.Sequential( nn.Conv2d(1, 16, 4, stride=2, padding=1), nn.ReLU(), nn.Conv2d(16, 32, 4, stride=2, padding=1) ) self.seq3 = nn.Sequential( nn.Conv2d(32, 8, 2, stride=2, padding=1), nn.ReLU(), Flatten(), nn.Linear(8*4*4,100), nn.ReLU(), nn.Linear(100, 10) ) def forward(self, x): return self.seq3(torch.max(self.seq1(x), self.seq2(x))) class TestMinMax(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed=1, ref_name='min_max_test_data', generate=generate, device=device, dtype=dtype) def test(self): self.result = [] for conv_mode in ['patches', 'matrix']: for use_shared_alpha in [True, False]: model = 
Test_Model().to(device=self.default_device, dtype=self.default_dtype) checkpoint = torch.load( os.path.join(os.path.dirname(__file__), '../examples/vision/pretrained/test_min_max.pth'), map_location=self.default_device) model.load_state_dict(checkpoint) test_data = torchvision.datasets.MNIST( './data', train=False, download=True, transform=torchvision.transforms.ToTensor()) N = 2 image = test_data.data[:N].view(N,1,28,28) image = image.to(device=self.default_device, dtype=self.default_dtype) / 255.0 lirpa_model = BoundedModule(model, torch.empty_like(image), device=image.device, bound_opts={"conv_mode": conv_mode}) eps = 0.3 ptb = PerturbationLpNorm(eps = eps) image = BoundedTensor(image, ptb) lirpa_model.set_bound_opts({ 'optimize_bound_args': { 'iteration': 5, 'lr_alpha': 0.1, 'use_shared_alpha': use_shared_alpha, } }) lb, ub = lirpa_model.compute_bounds(x=(image,), method='CROWN-Optimized') print(lb, ub) self.result.append((lb, ub)) self.setUp() self.rtol = 1e-4 self.check() if __name__ == "__main__": testcase = TestMinMax(generate=False) testcase.test() ================================================ FILE: tests/test_perturbation.py ================================================ """ Test different Perturbation classes""" import torch import torch.nn as nn import numpy as np from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import PerturbationLpNorm, PerturbationLinear from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE BATCH = 2 IN_DIM = 3 OUT_DIM = 4 class ToyModel(nn.Module): """Small model with two MatMuls and ReLU.""" def __init__(self): super().__init__() self.fc1 = nn.Linear(OUT_DIM, 8) self.fc2 = nn.Linear(8, OUT_DIM) self.relu = nn.ReLU() def forward(self, x): x = self.fc1(x) x = self.relu(x) x = self.fc2(x) return x class TestPerturbation(TestCase): """ Tests for: - PerturbationLinear - PerturbationLpNorm """ def __init__(self, methodName='runTest', seed=1, generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed, 'test_perturbation_data', generate, device=device, dtype=dtype) def test(self): device = self.default_device dtype = self.default_dtype model = ToyModel().to(device=device, dtype=dtype) # Prepare base input interval input_lb = torch.rand(BATCH, IN_DIM, device=device, dtype=dtype) input_ub = input_lb + torch.rand_like(input_lb) # ensure ub > lb self.result = [] # ================================================================= # Test PerturbationLinear # ================================================================= # Build A matrices lower_A = torch.randn(BATCH, OUT_DIM, IN_DIM, device=device, dtype=dtype) upper_A = lower_A + torch.rand_like(lower_A) # biases lower_b = torch.randn(BATCH, OUT_DIM, device=device, dtype=dtype) upper_b = lower_b + torch.rand_like(lower_b) # Manual concretization mid = ((input_lb + input_ub) / 2.0).unsqueeze(-1) # (B, IN_DIM, 1) diff = ((input_ub - input_lb) / 2.0).unsqueeze(-1) # (B, IN_DIM, 1) manual_L = (lower_A @ mid - torch.abs(lower_A) @ diff).squeeze(-1) + lower_b manual_U = (upper_A @ mid + torch.abs(upper_A) @ diff).squeeze(-1) + upper_b assert (manual_L < manual_U).all(), "Invalid manual bounds construction." 
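        # The manual concretization above uses the standard interval form: for linear
        # relaxations A_l x + b_l <= f(x) <= A_u x + b_u with x in [input_lb, input_ub],
        #   min_x A_l x + b_l = A_l @ mid - |A_l| @ diff + b_l,
        #   max_x A_u x + b_u = A_u @ mid + |A_u| @ diff + b_u,
        # where mid = (input_lb + input_ub) / 2 and diff = (input_ub - input_lb) / 2.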
ptb_linear = PerturbationLinear( lower_A=lower_A, upper_A=upper_A, lower_b=lower_b, upper_b=upper_b, input_lb=input_lb, input_ub=input_ub, x_L=manual_L, x_U=manual_U ) bounded_x = BoundedTensor((manual_L + manual_U) / 2, ptb_linear) lirpa_model = BoundedModule(model, bounded_x) lb_linear, ub_linear = lirpa_model.compute_bounds(bounded_x, method='backward') assert (lb_linear <= ub_linear).all(), "Invalid bounds from PerturbationLinear." self.result.append((lb_linear, ub_linear)) # ================================================================= # Test PerturbationLpNorm # ================================================================= # We directly use manual concretization here for testing ptb_linf = PerturbationLpNorm(x_L=manual_L, x_U=manual_U) bounded_x = BoundedTensor((manual_L + manual_U) / 2, ptb_linf) lirpa_model = BoundedModule(model, bounded_x) lb_linf, ub_linf = lirpa_model.compute_bounds(bounded_x, method='backward') assert (lb_linf <= ub_linf).all(), "Invalid bounds from PerturbationLpNorm." self.result.append((lb_linf, ub_linf)) # Notice that with the same x_L and x_U, PerturbationLinear should give # tighter bounds than PerturbationLpNorm. This is because # PerturbationLinear uses additional information (A matrices and biases). assert (lb_linear >= lb_linf).all() and (ub_linear <= ub_linf).all( ), "PerturbationLinear should give tighter bounds than PerturbationLpNorm." self.check() if __name__ == '__main__': testcase = TestPerturbation(generate=False) testcase.test() ================================================ FILE: tests/test_rectangle_patches.py ================================================ import sys import torch import numpy as np import torch.nn as nn import torch.nn.functional as F import torchvision from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * sys.path.append('../examples/vision') from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE class cnn_4layer_resnet(nn.Module): def __init__(self): super(cnn_4layer_resnet, self).__init__() self.conv1 = nn.Conv2d(3, 3, 4, stride=2, padding=1) self.bn = nn.BatchNorm2d(3) self.shortcut = nn.Conv2d(3, 3, 4, stride=2, padding=1) self.conv2 = nn.Conv2d(3, 3, 4, stride=2, padding=1) self.fc1 = nn.Linear(168, 10) def forward(self, x): x_ = x x = F.relu(self.conv1(self.bn(x))) x += self.shortcut(x_) x = F.relu(self.conv2(x)) x = x.view(x.size(0), -1) print(x.size()) x = self.fc1(x) return x class TestResnetPatches(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed=1234, ref_name='rectangle_patches_test_data', generate=generate, device=device, dtype=dtype) def test(self): model_oris = [ cnn_4layer_resnet(), ] self.result = [] if not self.generate: self.reference = torch.load( self.ref_path, map_location=self.default_device) for model_ori in model_oris: conv_mode = 'patches' # conv_mode can be set as 'matrix' or 'patches' normalize = torchvision.transforms.Normalize(mean = [0.4914, 0.4822, 0.4465], std = [0.2023, 0.1994, 0.2010]) test_data = torchvision.datasets.CIFAR10("./data", train=False, download=True, transform=torchvision.transforms.Compose([torchvision.transforms.ToTensor(), normalize])) N = 1 n_classes = 10 image = torch.Tensor(test_data.data[:N]).reshape(N,3,32,32) image = image[:, :, :28, :] image = image.to(device=self.default_device, dtype=self.default_dtype) / 255.0 model_ori = model_ori.to( device=self.default_device, dtype=self.default_dtype) model = 
BoundedModule(model_ori, image, bound_opts={ "conv_mode": conv_mode}, device=self.default_device) ptb = PerturbationLpNorm(norm = np.inf, eps = 0.03) image = BoundedTensor(image, ptb) pred = model(image) lb, ub = model.compute_bounds(IBP=False, C=None, method='backward') self.result += [lb, ub] self.check() if __name__ == '__main__': # Change to generate=True when genearting reference results testcase = TestResnetPatches(generate=False) testcase.test() ================================================ FILE: tests/test_resnet_patches.py ================================================ import sys import torch import numpy as np import torchvision import models from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE sys.path.append('../examples/vision') class TestResnetPatches(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed=1234, ref_name='resnet_patches_test_data', generate=generate, device=device, dtype=dtype) def test(self): model_oris = [ models.model_resnet(width=1, mult=2), models.ResNet18(in_planes=2) ] self.result = [] for model_ori in model_oris: conv_mode = 'patches' # conv_mode can be set as 'matrix' or 'patches' normalize = torchvision.transforms.Normalize(mean = [0.4914, 0.4822, 0.4465], std = [0.2023, 0.1994, 0.2010]) test_data = torchvision.datasets.CIFAR10("./data", train=False, download=True, transform=torchvision.transforms.Compose([torchvision.transforms.ToTensor(), normalize])) N = 1 n_classes = 10 image = torch.Tensor(test_data.data[:N]).reshape(N,3,32,32) image = image.to(device=self.default_device, dtype=self.default_dtype) / 255.0 model_ori = model_ori.to( device=self.default_device, dtype=self.default_dtype) model = BoundedModule(model_ori, image, bound_opts={"conv_mode": conv_mode}, device=self.default_device) ptb = PerturbationLpNorm(norm = np.inf, eps = 0.03) image = BoundedTensor(image, ptb) pred = model(image) lb, ub = model.compute_bounds(IBP=False, C=None, method='backward') self.result += [lb, ub] self.check() if __name__ == '__main__': # Change to generate=True when genearting reference results testcase = TestResnetPatches(generate=False) testcase.test() ================================================ FILE: tests/test_s_shaped.py ================================================ # pylint: disable=wrong-import-position """Test S-shaped activation functions.""" import torch import torch.nn as nn from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import PerturbationLpNorm from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE class test_model(nn.Module): def __init__(self, act_func): super().__init__() self.act_func = act_func def forward(self, x): return self.act_func(x) def sigmoid(x): return torch.sigmoid(x) def sin(x): return torch.sin(x) def verify_bounds(model, input_lb, input_ub, lb, ub): """ Empirically verify that the model's output bounds are correct given input bounds. Args: model: The neural network model. input_lb: Lower bound of the input. input_ub: Upper bound of the input. lb: Computed lower bound of the output. ub: Computed upper bound of the output. 
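    Note:
        This is a Monte-Carlo check. It samples inputs uniformly from
        [input_lb, input_ub] and asserts that every sampled output lies within
        [lb, ub] up to a small tolerance; it can expose unsound bounds but does
        not by itself prove soundness.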
""" n_samples = 100000 atol = 1e-5 inputs = torch.rand(n_samples, *input_lb.shape[1:]) * (input_ub - input_lb) + input_lb outputs = model(inputs) empirical_lb = outputs.min(dim=0).values empirical_ub = outputs.max(dim=0).values if not (empirical_lb - lb >= -atol).all(): max_violation = (lb - empirical_lb).max().item() raise AssertionError(f"Lower bound violated. Max violation: {max_violation}") if not (empirical_ub - ub <= atol).all(): max_violation = (empirical_ub - ub).max().item() raise AssertionError(f"Upper bound violated. Max violation: {max_violation}") print("Bounds verified successfully.") class TestSShaped(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__( methodName, seed=1, ref_name='s_shape_test_data', generate=generate, device=device, dtype=dtype) def _run_bound_test(self, model, input_lb, input_ub, methods): """Helper to compute and verify bounds.""" model = model.to(device=self.default_device, dtype=self.default_dtype) lirpa_model = BoundedModule(model, torch.empty_like(input_lb), device=self.default_device) ptb = PerturbationLpNorm(x_L=input_lb, x_U=input_ub) ptb_data = BoundedTensor(input_lb, ptb) for method in methods: lb, ub = lirpa_model.compute_bounds(x=(ptb_data,), method=method) verify_bounds(model, input_lb, input_ub, lb, ub) self.result.append((lb, ub)) def test(self): self.result = [] methods = ['CROWN', 'CROWN-OPTIMIZED'] # ----- Test BoundSin ----- model_sin = test_model(sin) start, end = -10, 10 n_intervals = end - start - 1 # Inputs as multiples of pi input_lb = torch.linspace(start, end - 1, n_intervals) * torch.pi input_ub = torch.linspace(start + 1, end, n_intervals) * torch.pi input_lb, input_ub = input_lb.unsqueeze(0), input_ub.unsqueeze(0) self._run_bound_test(model_sin, input_lb, input_ub, methods) # Inputs as multiples of pi / 2 self._run_bound_test(model_sin, input_lb / 2, input_ub / 2, methods) # ----- Test BoundSigmoid ----- model_sigmoid = test_model(sigmoid) input_lb = torch.tensor([[-2., -0.1]], device=self.default_device, dtype=self.default_dtype) input_ub = torch.tensor([[0.1, 2.]], device=self.default_device, dtype=self.default_dtype) self._run_bound_test(model_sigmoid, input_lb, input_ub, methods) # Check reference results self.check() if __name__ == '__main__': # Change to generate=True when generating reference results testcase = TestSShaped(generate=False) testcase.setUp() testcase.test() ================================================ FILE: tests/test_save_intermediate.py ================================================ import torch import torch.nn as nn from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * from testcase import _to, TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE class test_model(nn.Module): def __init__(self): super(test_model, self).__init__() self.model = nn.Sequential( nn.Flatten(), nn.Linear(3 * 32 * 32, 1000), nn.Sigmoid(), nn.Linear(1000, 500), nn.Linear(500, 200), nn.Linear(200, 100), nn.ReLU(), nn.Linear(100, 10) ) def forward(self, x): x = self.model(x) return x class TestSave(TestCase): def __init__(self, methodName='runTest', device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, device=device, dtype=dtype) def test(self, gen_ref=False): image = torch.randn(1, 3, 32, 32) image = image.to(device=self.default_device, dtype=self.default_dtype) / 255.0 model = test_model().to(device=self.default_device, dtype=self.default_dtype) bounded_model = BoundedModule( model, image, bound_opts={ 
'optimize_bound_args': {'iteration': 2}, }, device=self.default_device) ptb = PerturbationLpNorm(eps=3/255) x = BoundedTensor(image, ptb) bounded_model.compute_bounds(x=(x,), method='CROWN-Optimized') if self.default_dtype == torch.float32: data_path = 'data/' elif self.default_dtype == torch.float64: data_path = 'data_64/' data_path += 'test_save_data' save_dict = bounded_model.save_intermediate( save_path=data_path if gen_ref else None) if gen_ref: torch.save(save_dict, data_path) return ref_dict = torch.load(data_path) ref_dict = _to( ref_dict, device=self.default_device, dtype=self.default_dtype) for node in ref_dict.keys(): assert torch.allclose(ref_dict[node][0], save_dict[node][0], atol=1e-5) assert torch.allclose(ref_dict[node][1], save_dict[node][1], atol=1e-5) if __name__ == '__main__': testcase = TestSave() testcase.test() ================================================ FILE: tests/test_simple_verification.py ================================================ """Test optimized bounds in simple_verification.""" import torch import torch.nn as nn import torchvision from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import PerturbationLpNorm from auto_LiRPA.utils import Flatten from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE # This simple model comes from https://github.com/locuslab/convex_adversarial def mnist_model(): model = nn.Sequential( nn.Conv2d(1, 16, 4, stride=2, padding=1), nn.ReLU(), nn.Conv2d(16, 32, 4, stride=2, padding=1), nn.ReLU(), Flatten(), nn.Linear(32*7*7,100), nn.ReLU(), nn.Linear(100, 10) ) return model class TestSimpleVerification(TestCase): def __init__(self, methodName='runTest', device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, device=device, dtype=dtype) def test(self): model = mnist_model() checkpoint = torch.load( '../examples/vision/pretrained/mnist_a_adv.pth', map_location=torch.device('cpu')) model.load_state_dict(checkpoint) model = model.to(device=self.default_device, dtype=self.default_dtype) test_data = torchvision.datasets.MNIST( './data', train=False, download=True, transform=torchvision.transforms.ToTensor()) N = 2 image = test_data.data[:N].view(N,1,28,28) image = image.to(device=self.default_device, dtype=self.default_dtype) / 255.0 lirpa_model = BoundedModule(model, torch.empty_like(image), device=self.default_device) ptb = PerturbationLpNorm(0.3) image = BoundedTensor(image, ptb) method = 'CROWN-Optimized (alpha-CROWN)' lirpa_model.set_bound_opts({'optimize_bound_args': {'iteration': 20, 'lr_alpha': 0.1}}) _, ub = lirpa_model.compute_bounds(x=(image,), method=method.split()[0]) self.assertEqual(ub[0][7], torch.tensor(12.5080)) if __name__ == '__main__': testcase = TestSimpleVerification() testcase.test() ================================================ FILE: tests/test_state_dict_name.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from auto_LiRPA import BoundedModule from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE class FeatureExtraction(nn.Module): def __init__(self): super().__init__() self.conv1 = nn.Conv2d(1, 8, 4, stride=2, padding=1) self.conv2 = nn.Conv2d(8, 16, 4, stride=2, padding=1) self.fc1 = nn.Linear(784, 256) def forward(self, x): x = F.relu(self.conv1(x)) x = F.relu(self.conv2(x)) x = x.view(-1, 784) x = F.relu(self.fc1(x)) return x class cnn_MNIST(nn.Module): def __init__(self): super().__init__() self.features = BoundedModule(FeatureExtraction(), torch.empty((1, 1, 28, 
28))) self.fc = nn.Linear(256, 10) def forward(self, x): x = self.features(x) return self.fc(x) class TestStateDictName(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, device=device, dtype=dtype) def test(self): model = cnn_MNIST().to(device=self.default_device, dtype=self.default_dtype) state_dict = model.state_dict() dummy = torch.randn((1, 1, 28, 28)) ret1 = model(dummy) # create a second model and load the state_dict to test whether load_state_dict() works properly model = cnn_MNIST().to(device=self.default_device, dtype=self.default_dtype) model.load_state_dict(state_dict, strict=True) ret2 = model(dummy) self.assertEqual(ret1, ret2) if __name__ == '__main__': # Change to generate=True when generating reference results testcase = TestStateDictName(generate=False) testcase.test() ================================================ FILE: tests/test_tensor_storage.py ================================================ import random import torch from complete_verifier.tensor_storage import StackTensorStorage, QueueTensorStorage from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE class TestTensorStorage(TestCase): def __init__(self, methodName='runTest', device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, device=device, dtype=dtype) def test_content(self, seed=123): self.set_seed(seed) storage_classes_and_pop_behavior = [ ( StackTensorStorage, lambda tensor_list, num_pop: (tensor_list[-num_pop:], tensor_list[:-num_pop]) ), ( QueueTensorStorage, lambda tensor_list, num_pop: (tensor_list[:num_pop], tensor_list[num_pop:]) ) ] for storage_class, pop_behavior in storage_classes_and_pop_behavior: for concat_dim in [0, 1, 2]: # The call to `.size()` has side effects for `QueueTensorStorage`, because it will # cause a call to `.tensor()` which may change the internal storage. for check_size in [True, False]: stored_tensors = [] shape = [2,3,4] def make_random_tensor(): random_size = random.randint(1, 100) tensors = [] for _ in range(random_size): random_tensor = torch.randn( shape[:concat_dim] + shape[concat_dim+1:], device=self.default_device, dtype=self.default_dtype).unsqueeze(concat_dim) tensors.append(random_tensor) return torch.cat(tensors, dim=concat_dim), tensors s = storage_class(full_shape=shape, initial_size=16, switching_size=65536, concat_dim=concat_dim) for _ in range(1000): random_tensor, tensors = make_random_tensor() s.append(random_tensor) stored_tensors.extend(tensors) if check_size: assert s.size(concat_dim) == len(stored_tensors) num_pop = random.randint(1, 100) popped_tensors, stored_tensors = pop_behavior(stored_tensors, num_pop) popped_tensor = s.pop(num_pop) assert torch.allclose(popped_tensor, torch.cat(popped_tensors, dim=concat_dim)) if check_size: assert s.size(concat_dim) == len(stored_tensors) def test_tensor_call(self, seed=123): # The call to `.tensor()` has side effects for `QueueTensorStorage`, because it will # cause a call to `.size()` which may change the internal storage.
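# A minimal illustration (not executed) of the queue behaviour this test relies on;
# only the constructor arguments and methods already exercised in this file are assumed:
#   s = QueueTensorStorage(full_shape=[2, 3, 4], initial_size=16,
#                          switching_size=65536, concat_dim=0)
#   s.append(torch.randn(5, 3, 4))  # enqueue 5 items along concat_dim
#   s.append(torch.randn(2, 3, 4))  # enqueue 2 more items
#   oldest = s.pop(3)               # a queue pops the 3 oldest items first
#   remaining = s.tensor()          # contiguous view of the 4 remaining items; once the
#                                   # usage window wraps past the end of the backing
#                                   # storage, this call may reorganize the internal buffer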
self.set_seed(seed) pop_behavior = lambda tensor_list, num_pop: (tensor_list[:num_pop], tensor_list[num_pop:]) for concat_dim in [0, 1, 2]: stored_tensors = [] shape = [2,3,4] def make_random_tensor(): random_size = random.randint(1, 100) tensors = [] for _ in range(random_size): random_tensor = torch.randn(shape[:concat_dim] + shape[concat_dim+1:], dtype=self.default_dtype).unsqueeze(concat_dim) tensors.append(random_tensor) return torch.cat(tensors, dim=concat_dim), tensors s = QueueTensorStorage(full_shape=shape, initial_size=16, switching_size=16, concat_dim=concat_dim) for _ in range(1000): random_tensor, tensors = make_random_tensor() s.append(random_tensor) stored_tensors.extend(tensors) num_pop = random.randint(1, 10) _, stored_tensors = pop_behavior(stored_tensors, num_pop) _ = s.pop(num_pop) if s._usage_start + s.num_used > s._storage.size(concat_dim): storage_content = s.tensor() assert torch.allclose(storage_content, torch.cat(stored_tensors, dim=concat_dim)) def test_size_queue(self): for concat_dim in [0, 1, 2]: shape = [1,1,1] shape[concat_dim] = -1 # does no matter. zero_shape = shape.copy() zero_shape[concat_dim] = 0 def make_tensor(x): return torch.arange( 1, x+1, device=self.default_device, dtype=self.default_dtype).view(*shape) s = QueueTensorStorage(full_shape=shape, initial_size=16, switching_size=65536, concat_dim=concat_dim) s.append(make_tensor(1)) assert s.sum() == 1, s.tensor() s.append(make_tensor(3)) assert s.sum() == 1 + 6, s.tensor() s.append(make_tensor(5)) assert s.sum() == 1 + 6 + 15, s.tensor() t = s.pop(5) assert torch.allclose(t.squeeze(), torch.tensor( [1, 1, 2, 3, 1], device=self.default_device, dtype=self.default_dtype)) t = s.pop(0) assert t.shape == torch.Size(zero_shape) t = s.pop(-1) assert t.shape == torch.Size(zero_shape) s.append(make_tensor(100)) expected_sum = 1 + sum(range(1,4)) + sum(range(1,6)) - (1 + 1 + 2 + 3 + 1) + sum(range(1,101)) assert s.sum() == expected_sum, (s.sum(), expected_sum) t = s.pop(5) assert torch.allclose(t.squeeze(), torch.tensor( [2, 3, 4, 5, 1], device=self.default_device, dtype=self.default_dtype)), print(t) assert s.size(concat_dim) == 99, print(s.size()) assert s._storage.size(concat_dim) == 104, print(s._storage.size()) s.append(make_tensor(10)) assert s.size(concat_dim) == 109, print(s.size()) assert s._storage.size(concat_dim) == 208, print(s._storage.size()) s.append(make_tensor(32768)) assert s.size(concat_dim) == 32877, print(s.size()) assert s._storage.size(concat_dim) == 32877, print(s._storage.size()) s.pop(1) s.append(make_tensor(2)) assert s.size(concat_dim) == 32878, print(s.size()) assert s._storage.size(concat_dim) == 32877*2, print(s._storage.size()) s.append(make_tensor(32800)) s.append(make_tensor(100)) assert s._storage.size(concat_dim) == 32877*2+100*32, print(s._storage.size()) s.pop(100000) assert s._storage.size(concat_dim) == 32877*2+100*32, print(s._storage.size()) assert s.size(concat_dim) == 0, print(s.size()) t = s.pop(1) assert t.shape == torch.Size(zero_shape) t = s.pop(0) assert t.shape == torch.Size(zero_shape) t = s.pop(-1) assert t.shape == torch.Size(zero_shape) def test_size_stack(self): for concat_dim in [0, 1, 2]: shape = [1,1,1] shape[concat_dim] = -1 # does no matter. 
zero_shape = shape.copy() zero_shape[concat_dim] = 0 make_tensor = lambda x: torch.arange(1,x+1, dtype=self.default_dtype).view(*shape) s = StackTensorStorage(full_shape=shape, initial_size=16, switching_size=65536, concat_dim=concat_dim) s.append(make_tensor(1)) assert s.sum() == 1, print(s) s.append(make_tensor(3)) assert s.sum() == 1 + 6, print(s) s.append(make_tensor(5)) assert s.sum() == 1 + 6 + 15, print(s) t = s.pop(5) assert torch.allclose(t.squeeze(), torch.tensor( [1, 2, 3, 4, 5], device=self.default_device, dtype=self.default_dtype)), print(t) t = s.pop(0) assert t.shape == torch.Size(zero_shape) t = s.pop(-1) assert t.shape == torch.Size(zero_shape) s.append(make_tensor(100)) assert s.sum() == 1 + 6 + 50*101 t = s.pop(5) assert torch.allclose(t.squeeze(), torch.tensor( [96, 97, 98, 99, 100], device=self.default_device, dtype=self.default_dtype)), print(t) assert s.size(concat_dim) == 99, print(s.size()) assert s._storage.size(concat_dim) == 104, print(s._storage.size()) s.append(make_tensor(10)) assert s.size(concat_dim) == 109, print(s.size()) assert s._storage.size(concat_dim) == 208, print(s._storage.size()) s.append(make_tensor(32768)) assert s.size(concat_dim) == 32877, print(s.size()) assert s._storage.size(concat_dim) == 32877, print(s._storage.size()) s.pop(1) s.append(make_tensor(2)) assert s.size(concat_dim) == 32878, print(s.size()) assert s._storage.size(concat_dim) == 32877*2, print(s._storage.size()) s.append(make_tensor(32800)) s.append(make_tensor(100)) assert s._storage.size(concat_dim) == 32877*2+100*32, print(s._storage.size()) s.pop(100000) assert s._storage.size(concat_dim) == 32877*2+100*32, print(s._storage.size()) assert s.size(concat_dim) == 0, print(s.size()) t = s.pop(1) assert t.shape == torch.Size(zero_shape) t = s.pop(0) assert t.shape == torch.Size(zero_shape) t = s.pop(-1) assert t.shape == torch.Size(zero_shape) if __name__ == "__main__": testcase = TestTensorStorage() testcase.test_tensor_call() testcase.test_size_stack() testcase.test_size_queue() testcase.test_content() ================================================ FILE: tests/test_upsample.py ================================================ from collections import defaultdict from torch import nn from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE class Model(nn.Module): def __init__(self, input_dim=5, image_size=4, scale_factor=2, conv_kernel_size=3, stride=1, padding=1, conv_in_channels=16, conv_out_channels=4): super(Model, self).__init__() self.conv_in_channels = conv_in_channels self.input_dim = input_dim self.image_size = image_size self.fc1 = nn.Linear(input_dim, conv_in_channels * image_size * image_size) self.upsample = nn.Upsample(scale_factor=(scale_factor, scale_factor), mode='nearest') # H = W = 4 * scale_factor now self.conv1 = nn.Conv2d(in_channels=conv_in_channels, out_channels=conv_out_channels, kernel_size=(conv_kernel_size, conv_kernel_size), stride=(stride, stride), padding=padding) # H = W = (4 * scale + 2 * pad - ker + s) // s size_after_conv = (4 * scale_factor + 2 * padding - conv_kernel_size + stride) // stride assert size_after_conv > 0, "0 size after convolution, please use more padding, more scale_factor," \ "smaller kernel, or smaller stride" self.relu = nn.ReLU() self.flatten = nn.Flatten() self.fc2 = nn.Linear(size_after_conv * size_after_conv * conv_out_channels, 1) # self.sigmoid = nn.Sigmoid() def forward(self, input_z): f1 = self.fc1(input_z) d1 = 
f1.reshape(-1, self.conv_in_channels, self.image_size, self.image_size) d2 = self.upsample(d1) d3 = self.conv1(d2) d4 = self.relu(d3) f2 = self.flatten(d4) f3 = self.fc2(f2) # out = self.sigmoid(f3) return f3 class ModelReducedCGAN(nn.Module): def __init__(self): """ The network has the same architecture as the merged-BN CGAN upsampling model, except with reduced channel counts. """ super(ModelReducedCGAN, self).__init__() self.fc1 = nn.Linear(5, 32) self.up1 = nn.Upsample(scale_factor=2, mode='nearest') self.conv1 = nn.Conv2d(in_channels=2, out_channels=2, kernel_size=3, stride=1, padding=1) self.relu1 = nn.ReLU() self.up2 = nn.Upsample(scale_factor=2, mode='nearest') self.conv2 = nn.Conv2d(in_channels=2, out_channels=3, kernel_size=3, stride=1, padding=1) self.relu2 = nn.ReLU() self.up3 = nn.Upsample(scale_factor=2, mode='nearest') self.conv3 = nn.Conv2d(in_channels=3, out_channels=4, kernel_size=3, stride=1, padding=1) self.relu3 = nn.ReLU() self.conv4 = nn.Conv2d(in_channels=4, out_channels=2, kernel_size=3, stride=1, padding=1) self.conv5 = nn.Conv2d(in_channels=2, out_channels=3, kernel_size=3, stride=2, padding=1) self.relu4 = nn.ReLU() self.conv6 = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3, stride=2, padding=1) self.relu5 = nn.ReLU() self.conv7 = nn.Conv2d(in_channels=3, out_channels=4, kernel_size=3, stride=2, padding=1) self.relu6 = nn.ReLU() self.conv8 = nn.Conv2d(in_channels=4, out_channels=4, kernel_size=3, stride=2, padding=1) self.relu7 = nn.ReLU() self.fc2 = nn.Linear(4 * 2 * 2, 1) self.sigmoid = nn.Sigmoid() def forward(self, input_z): f1 = self.fc1(input_z) f2 = f1.reshape(-1, 2, 4, 4) f3 = self.up1(f2) f4 = self.conv1(f3) f5 = self.relu1(f4) f6 = self.up2(f5) f7 = self.conv2(f6) f8 = self.relu2(f7) f9 = self.up3(f8) f10 = self.conv3(f9) f11 = self.relu3(f10) f12 = self.conv4(f11) f13 = self.conv5(f12) f14 = self.relu4(f13) f15 = self.conv6(f14) f16 = self.relu5(f15) f17 = self.conv7(f16) f18 = self.relu6(f17) f19 = self.conv8(f18) f20 = self.relu7(f19) f21 = f20.reshape(f20.shape[0], -1) f22 = self.fc2(f21) # f23 = self.sigmoid(f22) return f22 def recursive_allclose(a, b: dict, verbose=False, prefix=''): """ Recursively check whether all corresponding tensors in two dicts are close. :param a: dict a :param b: dict b :param prefix: path prefix tracked across recursive calls, used for error messages :return: bool: all_close or not """ tot_tensor = 0 tot_dict = 0 for k in a: if isinstance(a[k], torch.Tensor): if k == 'unstable_idx': continue if verbose: print(f'recursive_allclose(): Checking {prefix}{k}') assert k in b and (isinstance(b[k], torch.Tensor) or isinstance(b[k], Patches)), f'recursive_allclose(): Tensor not found in path {prefix}{k}' if isinstance(b[k], torch.Tensor): assert torch.allclose(a[k].reshape(-1), b[k].reshape(-1), 1e-4, 1e-5), f'recursive_allclose(): Inconsistency found in path {prefix}{k}' tot_tensor += 1 elif isinstance(a[k], dict): assert k in b and isinstance(b[k], dict), f'recursive_allclose(): dict not found in path {prefix}{k}' recursive_allclose(a[k], b[k], verbose, prefix + k) tot_dict += 1 tot_b_tensor = sum([1 if (isinstance(v, torch.Tensor) or isinstance(v, Patches)) and k != 'unstable_idx' else 0 for k, v in b.items()]) tot_b_dict = sum([1 if isinstance(v, dict) else 0 for v in b.values()]) assert tot_tensor == tot_b_tensor, f'recursive_allclose(): Extra tensors found in path {prefix}' assert tot_dict == tot_b_dict, f'recursive_allclose(): Extra recursive paths found in path {prefix}' return True class TestUpSample(TestCase): def
__init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed=1, ref_name=None, generate=generate, device=device, dtype=dtype) # self.device = device def test(self, seed=123): for kernel_size in [3,5]: for scaling_factor in [2,3,4]: for stride in [1,2]: for padding in [1]: self.test_instance(kernel_size, scaling_factor, stride, padding, seed=seed) def test_instance(self, kernel_size=3, scaling_factor=2, stride=1, padding=1, seed=123): self.set_seed(seed) print(f'kernel_size = {kernel_size}, scaling_factor = {scaling_factor}, stride = {stride}, padding = {padding}') random_input = torch.randn( (1, 5), device=self.default_device, dtype=self.default_dtype) * 1000. eps = 0.3 model_ori = Model(scale_factor=scaling_factor, conv_kernel_size=kernel_size, stride=stride, padding=padding).to(device=self.default_device, dtype=self.default_dtype) ptb = PerturbationLpNorm(norm=np.inf, eps=eps) z1_clean = random_input.detach().clone().requires_grad_(requires_grad=True) z1 = BoundedTensor(random_input, ptb) model_mat = BoundedModule(model_ori, (random_input,), device=self.default_device, bound_opts={"conv_mode": "matrix"}) pred_of_mat = model_mat(z1) lb_m, ub_m, A_m = model_mat.compute_bounds(return_A=True, needed_A_dict={model_mat.output_name[0]: model_mat.input_name[0]}, ) model_pat = BoundedModule(model_ori, (random_input,), device=self.default_device, bound_opts={"conv_mode": "patches"}) pred_of_patch = model_pat(z1) lb_p, ub_p, A_p = model_pat.compute_bounds(return_A=True, needed_A_dict={ model_pat.output_name[0]: model_pat.input_name[0]}, ) assert torch.allclose(pred_of_mat, pred_of_patch, 1e-5) assert torch.allclose(lb_m, lb_p, 1e-5) assert torch.allclose(ub_m, ub_p, 1e-5) assert recursive_allclose(A_m, A_p, verbose=True) class TestReducedCGAN(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed=1, ref_name=None, generate=generate, device=device, dtype=dtype) # self.device = device def test(self, seed=456): self.set_seed(seed) input = torch.tensor([[0.583, -0.97, -0.97, 0.598, 0.737]]) eps = 0.1 model_ori = ModelReducedCGAN().to( device=self.default_device, dtype=self.default_dtype) ptb = PerturbationLpNorm(norm=np.inf, eps=eps) z1_clean = input.detach().clone().requires_grad_(requires_grad=True) z1 = BoundedTensor(input, ptb) model_mat = BoundedModule(model_ori, (input,), device=self.default_device, bound_opts={"conv_mode": "matrix"}) pred_of_mat = model_mat(z1) needed_A_dict = defaultdict(set) for node in model_mat.nodes(): needed_A_dict[node.name] = set() lb_m, ub_m, A_m = model_mat.compute_bounds((z1,), return_A=True, needed_A_dict=needed_A_dict, method='crown') model_pat = BoundedModule(model_ori, (input,), device=self.default_device, bound_opts={"conv_mode": "patches", "sparse_features_alpha": False}) pred_of_patch = model_pat(z1) lb_p, ub_p, A_p = model_pat.compute_bounds((z1,), return_A=True, needed_A_dict=needed_A_dict, method='crown') # print(pred_of_mat, pred_of_patch) assert torch.allclose(pred_of_mat, pred_of_patch, 1e-5) assert torch.allclose(lb_m, lb_p, 1e-5) assert torch.allclose(ub_m, ub_p, 1e-5) assert recursive_allclose(A_m, A_p, verbose=True) if __name__ == '__main__': # should use device = 'cpu' for GitHub CI testcase = TestUpSample(generate=False) testcase.test(seed=123) # """ # following test is much stronger, but runs within 30s only on GPUs # so commented it out for CI testing now # required GPU memory: 1.5 GiB # 
""" testhardcase = TestReducedCGAN(generate=False) testhardcase.test(seed=456) ================================================ FILE: tests/test_vision_models.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from auto_LiRPA import BoundedModule, BoundedTensor from auto_LiRPA.perturbations import * from testcase import _to, TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE class cnn_4layer_test(nn.Module): def __init__(self): super(cnn_4layer_test, self).__init__() self.conv1 = nn.Conv2d(3, 3, 4, stride=2, padding=1) self.bn = nn.BatchNorm2d(3) self.shortcut = nn.Conv2d(3, 3, 4, stride=2, padding=1) self.conv2 = nn.Conv2d(3, 3, 4, stride=2, padding=1) self.fc1 = nn.Linear(192, 10) def forward(self, x): x_ = x x = F.relu(self.conv1(self.bn(x))) x += self.shortcut(x_) x = F.relu(self.conv2(x)) x = x.view(x.size(0), -1) x = self.fc1(x) return x class TestVisionModels(TestCase): def __init__(self, methodName='runTest', ref_name='vision_test_data', model=cnn_4layer_test(), generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, seed=1234, ref_name=ref_name, generate=generate, device=device, dtype=dtype) self.result = {} self.model = model.to(device=self.default_device, dtype=self.default_dtype) def setUp(self): super().setUp() if self.reference: self.reference = _to(self.reference, self.default_device) self.reference = _to(self.reference, self.default_device) if self.generate: # state_dict from an existing reference is needed self.reference = torch.load(self.ref_path) def verify_bounds(self, model, x, IBP, method, forward_ret, lb_name, ub_name): lb, ub = model(method_opt="compute_bounds", x=(x,), IBP=IBP, method=method) self.result[lb_name] = lb self.result[ub_name] = ub if method != 'CROWN-Optimized': # test gradient backward propagation # only when method is not "CROWN-Optimized" (in that case, lb and ub don't have gradient) loss = (ub - lb).abs().sum() loss.backward() grad = x.grad self.result[lb_name[:-2] + 'grad'] = grad.clone() if not self.generate: if method != 'CROWN-Optimized': assert torch.allclose(lb, self.reference[lb_name], 1e-4, atol=2e-7), (lb - self.reference[lb_name]).abs().max() assert torch.allclose(ub, self.reference[ub_name], 1e-4, atol=2e-7), (ub - self.reference[ub_name]).abs().max() assert ((lb - self.reference[lb_name]).pow(2).sum() < 1.3e-9), (lb - self.reference[lb_name]).pow(2).sum() assert ((ub - self.reference[ub_name]).pow(2).sum() < 1.3e-9), (ub - self.reference[ub_name]).pow(2).sum() if "same-slope" not in lb_name: assert torch.allclose(grad, self.reference[lb_name[:-2] + 'grad'], 1e-4, 1e-6), (grad - self.reference[lb_name[:-2] + 'grad']).abs().max() assert (grad - self.reference[lb_name[:-2] + 'grad']).pow(2).sum() < 1.e-6, (grad - self.reference[lb_name[:-2] + 'grad']).pow(2).sum() else: assert torch.allclose(lb, self.reference[lb_name], 1e-4, atol=5e-6), (lb - self.reference[lb_name]).abs().max() assert torch.allclose(ub, self.reference[ub_name], 1e-4, atol=5e-6), (ub - self.reference[ub_name]).abs().max() assert ((lb - self.reference[lb_name]).pow(2).sum() < 1.3e-9), (lb - self.reference[lb_name]).pow(2).sum() assert ((ub - self.reference[ub_name]).pow(2).sum() < 1.3e-9), (ub - self.reference[ub_name]).pow(2).sum() def test_bounds(self, bound_opts=None, optimize = True): if bound_opts is None: bound_opts = {'activation_bound_option': 'same-slope'} np.random.seed(123) # FIXME inconsistent seeds model_ori = self.model.eval() 
model_ori.load_state_dict(self.reference['model']) dummy_input = self.reference['data'].to(dtype=self.default_dtype, device=self.default_device) inputs = (dummy_input,) model = BoundedModule(model_ori, inputs, device=self.default_device) model.set_bound_opts({'optimize_bound_args': {'lr_alpha': 0.1}}) forward_ret = model(dummy_input) model_ori.eval() assert torch.allclose(model_ori(dummy_input), model(dummy_input), 1e-4, 1e-6) model_same_slope = BoundedModule(model_ori, inputs, device=self.default_device, bound_opts=bound_opts) model_same_slope.set_bound_opts({'optimize_bound_args': {'lr_alpha': 0.1}}) # Linf ptb = PerturbationLpNorm(norm=np.inf, eps=0.01) x = BoundedTensor(dummy_input, ptb) x.requires_grad_() self.verify_bounds(model, x, IBP=True, method=None, forward_ret=forward_ret, lb_name='l_inf_IBP_lb', ub_name='l_inf_IBP_ub') # IBP self.verify_bounds(model, x, IBP=True, method='backward', forward_ret=forward_ret, lb_name='l_inf_CROWN-IBP_lb', ub_name='l_inf_CROWN-IBP_ub') # CROWN-IBP self.verify_bounds(model, x, IBP=False, method='backward', forward_ret=forward_ret, lb_name='l_inf_CROWN_lb', ub_name='l_inf_CROWN_ub') # CROWN self.verify_bounds(model_same_slope, x, IBP=False, method='backward', forward_ret=forward_ret, lb_name='l_inf_CROWN-same-slope_lb', ub_name='l_inf_CROWN-same-slope_ub') # CROWN-same-slope if optimize: self.verify_bounds(model, x, IBP=False, method='CROWN-Optimized', forward_ret=forward_ret, lb_name='l_inf_CROWN-Optimized_lb', ub_name='l_inf_CROWN-Optimized_ub') # CROWN-Optimized self.verify_bounds(model_same_slope, x, IBP=False, method='CROWN-Optimized', forward_ret=forward_ret, lb_name='l_inf_CROWN-Optimized-same-slope_lb', ub_name='l_inf_CROWN-Optimized-same-slope_ub') # Crown-Optimized-same-slope # L2 ptb = PerturbationLpNorm(norm=2, eps=0.01) x = BoundedTensor(dummy_input, ptb) x.requires_grad_() self.verify_bounds(model, x, IBP=True, method=None, forward_ret=forward_ret, lb_name='l_2_IBP_lb', ub_name='l_2_IBP_ub') # IBP self.verify_bounds(model, x, IBP=True, method='backward', forward_ret=forward_ret, lb_name='l_2_CROWN-IBP_lb', ub_name='l_2_CROWN-IBP_ub') # CROWN-IBP self.verify_bounds(model, x, IBP=False, method='backward', forward_ret=forward_ret, lb_name='l_2_CROWN_lb', ub_name='l_2_CROWN_ub') # CROWN self.verify_bounds(model_same_slope, x, IBP=False, method='backward', forward_ret=forward_ret, lb_name='l_2_CROWN-same-slope_lb', ub_name='l_2_CROWN-same-slope_ub') # CROWN-same-slope if optimize: self.verify_bounds(model, x, IBP=False, method='CROWN-Optimized', forward_ret=forward_ret, lb_name='l_2_CROWN-Optimized_lb', ub_name='l_2_CROWN-Optimized_ub') # CROWN-Optimized self.verify_bounds(model_same_slope, x, IBP=False, method='CROWN-Optimized', forward_ret=forward_ret, lb_name='l_2_CROWN-Optimized-same-slope_lb', ub_name='l_2_CROWN-Optimized-same-slope_ub') # Crown-Optimized-same-slope if self.generate: self.result['data'] = self.reference['data'] self.result['model'] = self.reference['model'] self.save() if __name__ =="__main__": t = TestVisionModels(generate=False) # t = TestVisionModels() t.setUp() t.test_bounds() ================================================ FILE: tests/test_vision_models_hardtanh.py ================================================ import torch.nn as nn import torch.nn.functional as F from auto_LiRPA.perturbations import * from test_vision_models import TestVisionModels from testcase import DEFAULT_DEVICE, DEFAULT_DTYPE class cnn_4layer_test_hardtanh(nn.Module): def __init__(self, in_ch, in_dim, width=2, linear_size=256): 
super(cnn_4layer_test_hardtanh, self).__init__() self.conv1 = nn.Conv2d(in_ch, 4 * width, 4, stride=2, padding=1) self.conv2 = nn.Conv2d(4 * width, 8 * width, 4, stride=2, padding=1) self.fc1 = nn.Linear(8 * width * (in_dim // 4) * (in_dim // 4), linear_size) self.fc2 = nn.Linear(linear_size, 10) def forward(self, x): x = F.hardtanh(self.conv1(x)) x = F.hardtanh(self.conv2(x)) x = torch.flatten(x, 1) x = F.hardtanh(self.fc1(x)) x = self.fc2(x) return x class TestCustomVisionModel(TestVisionModels): def __init__(self, methodName='runTest', model=cnn_4layer_test_hardtanh(in_ch=1, in_dim=28), generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName, 'vision_clip_test_data', model, generate, device=device, dtype=dtype) def test_bounds(self, bound_opts=None, optimize=False): if bound_opts is None: bound_opts = {'hardtanh': 'same-slope'} super().test_bounds(bound_opts=bound_opts, optimize=optimize) if __name__ == "__main__": t = TestCustomVisionModel() t.setUp() t.test_bounds() ================================================ FILE: tests/test_weight_perturbation.py ================================================ import copy import subprocess import numpy as np from testcase import TestCase, DEFAULT_DEVICE, DEFAULT_DTYPE import sys sys.path.append('../examples/vision') import models from auto_LiRPA import BoundedModule from auto_LiRPA.perturbations import * class TestWeightPerturbation(TestCase): def __init__(self, methodName='runTest', generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__( methodName, seed=1234, ref_name='weight_perturbation_test_data', generate=generate, device=device, dtype=dtype) self.result = {} def test_training(self): # python weight_perturbation_training.py --device cpu --scheduler_opts start=1,length=100 --num_epochs 1 --truncate_data 5 ret = subprocess.run( ['python', 'weight_perturbation_training.py', '--device', 'cpu', '--scheduler_opts', 'start=1,length=100', '--num_epochs', '1', '--truncate_data', '5'], cwd='../examples/vision', capture_output=True) self.assertEqual(ret.returncode, 0, ret.stderr) res_test = ret.stdout.decode().split('\n')[-2].split(' ') assert abs(float(res_test[-3].split('=')[1]) - 2.246) < 0.01 def verify_bounds(self, model, x, IBP, method, forward_ret, lb_name, ub_name): lb, ub = model(method_opt="compute_bounds", x=(x,), IBP=IBP, method=method) self.result[lb_name] = lb.detach().data.clone() self.result[ub_name] = ub.detach().data.clone() # test gradient backward propagation loss = (ub - lb).abs().sum() loss.backward() # gradient w.r.t input only grad = x.grad self.result[lb_name+'_grad'] = grad.detach().data.clone() if not self.generate: assert torch.allclose(self.reference[lb_name], self.result[lb_name], 1e-4, 1e-6) assert torch.allclose(self.reference[ub_name], self.result[ub_name], 1e-4, 1e-6) assert ((self.reference[lb_name] - self.result[lb_name]).pow(2).sum() < 1e-8) assert ((self.reference[ub_name] - self.result[ub_name]).pow(2).sum() < 1e-8) assert torch.allclose(self.reference[lb_name+'_grad'], self.result[lb_name + '_grad'], 1e-4, 1e-6) assert ((self.reference[lb_name + '_grad'] - self.result[lb_name + '_grad']).pow(2).sum() < 1e-8) def test_perturbation(self): np.random.seed(123) # FIXME This seed is inconsistent with other seeds (1234) model_ori = models.Models['mlp_3layer_weight_perturb'](pert_weight=True, pert_bias=True).eval() self.result['model'] = model_ori.state_dict() self.result['data'] = torch.randn(8, 1, 28, 28) model_ori.load_state_dict(self.result['model']) 
state_dict = copy.deepcopy(model_ori.state_dict()) dummy_input = self.result['data'].requires_grad_() inputs = (dummy_input,) model = BoundedModule(model_ori, inputs, bound_opts={ 'sparse_intermediate_bounds': False, 'sparse_conv_intermediate_bounds': False, 'sparse_intermediate_bounds_with_ibp': False}, device=self.default_device) forward_ret = model(dummy_input) model_ori.eval() assert torch.isclose(model_ori(dummy_input), model_ori(dummy_input), 1e-8).all() def verify_model(pert_weight=True, pert_bias=True, norm=np.inf, lb_name='', ub_name=''): model_ori_ = models.Models['mlp_3layer_weight_perturb'](pert_weight=pert_weight, pert_bias=pert_bias, norm=norm).eval() model_ori_.load_state_dict(state_dict) model_ = BoundedModule(model_ori_, inputs, bound_opts={ 'sparse_intermediate_bounds': False, 'sparse_conv_intermediate_bounds': False, 'sparse_intermediate_bounds_with_ibp': False}) model_.ptb = model_ori.ptb self.verify_bounds(model_, dummy_input, IBP=True, method='backward', forward_ret=forward_ret, lb_name=lb_name + '_CROWN-IBP', ub_name=ub_name + '_CROWN-IBP') # CROWN-IBP self.verify_bounds(model_, dummy_input, IBP=False, method='backward', forward_ret=forward_ret, lb_name=lb_name + '_CROWN', ub_name=ub_name + '_CROWN') # CROWN # Linf verify_model(pert_weight=True, pert_bias=True, norm=np.inf, lb_name='l_inf_weights_bias_lb', ub_name='l_inf_weights_bias_ub') verify_model(pert_weight=True, pert_bias=False, norm=np.inf, lb_name='l_inf_weights_lb', ub_name='l_inf_weights_ub') verify_model(pert_weight=False, pert_bias=True, norm=np.inf, lb_name='l_inf_bias_lb', ub_name='l_inf_bias_ub') # L2 verify_model(pert_weight=True, pert_bias=True, norm=2, lb_name='l_2_weights_bias_lb', ub_name='l_2_weights_bias_ub') verify_model(pert_weight=True, pert_bias=False, norm=2, lb_name='l_2_weights_lb', ub_name='l_2_weights_ub') verify_model(pert_weight=False, pert_bias=True, norm=2, lb_name='l_2_bias_lb', ub_name='l_2_bias_ub') if self.generate: self.save() if __name__ == '__main__': testcase = TestWeightPerturbation(generate=False) testcase.setUp() testcase.reference = testcase._to(testcase.reference, testcase.default_device) testcase.reference = testcase._to(testcase.reference, testcase.default_dtype) testcase.test_perturbation() testcase.test_training() ================================================ FILE: tests/testcase.py ================================================ import unittest import random import torch import numpy as np DEFAULT_DEVICE = 'cpu' DEFAULT_DTYPE = torch.float32 class TestCase(unittest.TestCase): """Superclass for unit test cases in auto_LiRPA.""" def __init__(self, methodName='runTest', seed=1, ref_name=None, generate=False, device=DEFAULT_DEVICE, dtype=DEFAULT_DTYPE): super().__init__(methodName) self.addTypeEqualityFunc(np.ndarray, '_assert_array_equal') self.addTypeEqualityFunc(torch.Tensor, '_assert_tensor_equal') self.rtol = 1e-5 self.atol = 1e-6 self.default_dtype = dtype self.default_device = device set_default_dtype_device(dtype, device) self.set_seed(seed) data_path = 'data_64/' if dtype == torch.float64 else 'data/' self.ref_path = data_path + ref_name if ref_name else None self.generate = generate self.setUp() def set_seed(self, seed): torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) random.seed(seed) np.random.seed(seed) def setUp(self): """Load the reference result if it exists.""" if self.generate: self.reference = None else: self.reference = torch.load(self.ref_path, weights_only=False) if self.ref_path else None def save(self): """Save result for future 
comparison.""" print('Saving result to', self.ref_path) torch.save(self.result, self.ref_path) def check(self): """Save or check the results. This function can be called at the end of each test. If `self.generate == True`, save results for future comparison; otherwise, compare the current results `self.result` with the loaded reference `self.reference`. Results are expected to be a list or tuple of `torch.Tensor` instances. """ if self.generate: self.save() else: self.result = _to( self.result, device=self.default_device, dtype=self.default_dtype) self.reference = _to( self.reference, device=self.default_device, dtype=self.default_dtype) self._assert_equal(self.result, self.reference) def _assert_equal(self, a, b): assert type(a) == type(b) if isinstance(a, (list, tuple)): for a_, b_ in zip(a, b): self._assert_equal(a_, b_) else: self.assertEqual(a, b) def _assert_array_equal(self, a, b, msg=None): if not a.shape == b.shape: if msg is None: msg = f"Shapes are not equal: {a.shape} {b.shape}" raise self.failureException(msg) if not np.allclose(a, b, rtol=self.rtol, atol=self.atol): if msg is None: msg = f"Arrays are not equal:\n{a}\n{b}, max diff: {np.max(np.abs(a - b))}" raise self.failureException(msg) def _assert_tensor_equal(self, a, b, msg=None): if not a.shape == b.shape: if msg is None: msg = f"Shapes are not equal: {a.shape} {b.shape}" raise self.failureException(msg) if not torch.allclose(a, b, rtol=self.rtol, atol=self.atol): if msg is None: msg = f"Tensors are not equal:\n{a}\n{b}, max diff: {torch.max(torch.abs(a - b))}" raise self.failureException(msg) def _to(obj, device=None, dtype=None, inplace=False): """ Move all tensors in the object to a specified dest (device or dtype). The inplace=True option is available for dict.""" if obj is None: return obj elif isinstance(obj, torch.Tensor): return obj.to(device=device if device is not None else obj.device, dtype=dtype if dtype is not None else obj.dtype) elif isinstance(obj, tuple): return tuple([_to(item, device=device, dtype=dtype) for item in obj]) elif isinstance(obj, list): return [_to(item, device=device, dtype=dtype) for item in obj] elif isinstance(obj, dict): if inplace: for k, v in obj.items(): obj[k] = _to(v, device=device, dtype=dtype, inplace=True) return obj else: return {k: _to(v, device=device, dtype=dtype) for k, v in obj.items()} else: raise NotImplementedError(f"Unsupported type: {type(obj)}") def set_default_dtype_device(dtype=DEFAULT_DTYPE, device=DEFAULT_DEVICE): """Utility function to set default dtype and device.""" torch.set_default_dtype(dtype) torch.set_default_device(torch.device(device)) __all__ = ['TestCase', 'DEFAULT_DEVICE', 'DEFAULT_DTYPE', '_to', 'set_default_dtype_device']