Repository: graphcore/poptorch
Branch: sdk-release-3.4
Commit: c2a8b17762f1
Files: 537
Total size: 3.4 MB

Directory structure:
gitextract_ur1femal/

├── .ci/
│   └── view_component_trigger/
│       ├── Jenkinsfile
│       └── jobs.groovy
├── .clang-format
├── .clang-tidy
├── .github/
│   ├── CODEOWNERS
│   └── workflows/
│       └── apply_linters.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .pylintrc
├── .style.yapf
├── CMakeLists.txt
├── License.txt
├── MANIFEST.in
├── README.md
├── config.buildenv.py
├── docs/
│   ├── common/
│   │   ├── _static/
│   │   │   └── css/
│   │   │       └── custom_rtd.css
│   │   ├── conf.py
│   │   └── custom_dic
│   ├── poptorch_geometric/
│   │   ├── common/
│   │   │   └── conf.py
│   │   └── user_guide/
│   │       ├── index.rst
│   │       ├── installation.rst
│   │       ├── intro.rst
│   │       ├── legal.rst
│   │       ├── performance.rst
│   │       ├── reference.rst
│   │       ├── supported_operations.rst
│   │       └── tutorials.rst
│   └── user_guide/
│       ├── CMakeLists.txt
│       ├── api.py
│       ├── batching.rst
│       ├── buffers.py
│       ├── debugging.py
│       ├── debugging.rst
│       ├── device_iterations.py
│       ├── error_handling.py
│       ├── example.rst
│       ├── experimental.rst
│       ├── hostio_optimisation.rst
│       ├── index.rst
│       ├── inferenceModel.py
│       ├── installation.rst
│       ├── intro.rst
│       ├── legal.rst
│       ├── mnist.py
│       ├── overview.rst
│       ├── phased_execution.py
│       ├── pipeline_simple.py
│       ├── poptorch.conf
│       ├── poptorch_training_simple.py
│       ├── precompilation.py
│       ├── pytorch_to_poptorch.rst
│       ├── reference.rst
│       ├── replica_grouped_weights.py
│       ├── sumAnchorReturnType.py
│       ├── supported_ops.rst
│       └── trainingModel.py
├── examples/
│   ├── CMakeLists.txt
│   ├── bert_ipu.py
│   ├── lstm.py
│   ├── mnist.py
│   └── simple_adder.py
├── popart_compiler/
│   ├── CMakeLists.txt
│   ├── include/
│   │   └── popart_compiler/
│   │       ├── CodeletsCompilation.hpp
│   │       ├── Compiler.hpp
│   │       ├── CompilerOperationMacros.inc.hpp
│   │       ├── ManuallyAddedOperations.inc.hpp
│   │       ├── SupportedOperations.inc.hpp
│   │       └── Utils.hpp
│   ├── source/
│   │   ├── CodeletsCompilation.cpp
│   │   ├── Compiler.cpp
│   │   ├── CompilerImpl.cpp
│   │   ├── SessionOptions.cpp
│   │   ├── Utils.cpp
│   │   ├── custom_operations/
│   │   │   ├── Embedding.cpp
│   │   │   ├── FastGatherLastDim.cpp
│   │   │   ├── FastGatherLastDim.hpp
│   │   │   ├── FastGatherLastDimBwdCodelets.inc.cpp
│   │   │   ├── FastGatherLastDimFwdCodelets.inc.cpp
│   │   │   ├── HostOp.cpp
│   │   │   ├── TorchSoftplus.cpp
│   │   │   ├── TorchSoftplus.hpp
│   │   │   ├── UpsampleBilinear2d.cpp
│   │   │   └── UpsampleBilinear2dCodelets.inc.cpp
│   │   └── include/
│   │       └── popart_compiler/
│   │           ├── CompilerImpl.hpp
│   │           ├── CompilerOptions.hpp
│   │           ├── CustomOps.hpp
│   │           ├── MultiConvBuilder.hpp
│   │           └── SessionOptionsImpl.hpp
│   └── types/
│       └── include/
│           └── popart_compiler/
│               ├── CompilerTypes.hpp
│               └── PopartEnums.hpp
├── poptorch/
│   ├── CMakeLists.txt
│   ├── include/
│   │   └── poptorch/
│   │       ├── DispatchTracer.hpp
│   │       ├── InplaceOps.hpp
│   │       ├── LowerToPopart.hpp
│   │       ├── LowerToPopartFactories.hpp
│   │       ├── PoplarExecutable.hpp
│   │       ├── SessionOptionsParser.hpp
│   │       └── Utils.hpp
│   └── source/
│       ├── AddDetachOperations.cpp
│       ├── AddSubgraphConnectionNodes.cpp
│       ├── AliasProcessing.cpp
│       ├── CPUOffloadingCleanUp.cpp
│       ├── CompilerOps.cpp.inc
│       ├── ErrorOnUnsupportedAten.cpp
│       ├── FixupSetAvailableMemory.cpp
│       ├── GNNOptimizations.cpp
│       ├── GatherWithExpandedIndicesOptimization.cpp
│       ├── ImplicitCasting.cpp
│       ├── InplaceOps.cpp
│       ├── LowerToPopart.cpp
│       ├── LowerToPopartFactories.cpp
│       ├── OpBuilder.cpp
│       ├── OverlappedIO.cpp
│       ├── PopartCanonicalization.cpp
│       ├── PopartLateCanonicalization.cpp
│       ├── PoplarExecutable.cpp
│       ├── PoptorchStaticInit.hpp
│       ├── PoptorchSymbols.cpp
│       ├── PoptorchSymbols.hpp
│       ├── RemoveSurplusIdentityLosses.cpp
│       ├── RequiresGrad.cpp
│       ├── SessionOptionsParser.cpp
│       ├── Utils.cpp
│       ├── dispatch_tracer/
│       │   ├── CMakeLists.txt
│       │   ├── CommonHelperFunctions.cpp
│       │   ├── CommonHelperFunctions.hpp
│       │   ├── InplaceAliasMapper.cpp
│       │   ├── InplaceAliasMapper.hpp
│       │   ├── README.md
│       │   ├── RegisterAtenOverloads.cpp
│       │   ├── RegisterMetaOps.cpp.inc
│       │   ├── RegisterOptionalAtenOps.cpp.inc
│       │   ├── Tensor.cpp
│       │   ├── Tensor.hpp
│       │   ├── TypeInferenceHandler.cpp
│       │   ├── TypeInferenceHandler.hpp
│       │   ├── ValueMapper.cpp
│       │   ├── ValueMapper.hpp
│       │   └── dispatchers/
│       │       ├── IDispatch.cpp
│       │       ├── IDispatch.hpp
│       │       ├── JitDispatch.cpp
│       │       └── JitDispatch.hpp
│       ├── include/
│       │   └── poptorch/
│       │       ├── AliasProcessing.hpp
│       │       ├── CompilerOps.inc.hpp
│       │       ├── ImplicitCasting.hpp
│       │       ├── InplaceOpsPyTorch.hpp_nolint
│       │       ├── OpBuilder.hpp
│       │       ├── OverlappedIO.hpp
│       │       ├── PopartCanonicalization.hpp
│       │       ├── RequiresGrad.hpp
│       │       └── TypeAndConstantCanonicalization.hpp
│       ├── popart_canonicalization/
│       │   ├── ActivationOps.cpp
│       │   ├── ArithmeticOps.cpp
│       │   ├── AtenHandlers.gen.cpp
│       │   ├── BilinearOps.cpp
│       │   ├── BitwiseOps.cpp
│       │   ├── BlasOps.cpp
│       │   ├── ConstantOps.cpp
│       │   ├── ConvolutionOps.cpp
│       │   ├── CustomOps.cpp
│       │   ├── DistanceOps.cpp
│       │   ├── DropoutOps.cpp
│       │   ├── EinsumOp.cpp
│       │   ├── EinsumOp.hpp
│       │   ├── EmbeddingOps.cpp
│       │   ├── IndexOps.cpp
│       │   ├── LossOps.cpp
│       │   ├── NormalizationOps.cpp
│       │   ├── OtherOps.cpp
│       │   ├── PoolingOps.cpp
│       │   ├── PopartCanonicalizationUtils.cpp
│       │   ├── PopartCanonicalizationUtils.hpp
│       │   ├── PoptorchHandlers.gen.cpp
│       │   ├── PyGTorchScatterOps.cpp
│       │   ├── PyGTorchSplineConvOps.cpp
│       │   ├── RNNOps.cpp
│       │   ├── RandomSamplingOps.cpp
│       │   ├── ReduceOps.cpp
│       │   ├── ReshapeOps.cpp
│       │   ├── ScatterReduction.cpp
│       │   ├── ScatterReduction.hpp
│       │   ├── SliceOps.cpp
│       │   ├── SoftmaxOps.cpp
│       │   ├── TensorOps.cpp
│       │   └── pyg_torch_cluster/
│       │       ├── FpsOp.cpp
│       │       ├── GridOp.cpp
│       │       └── NearestOp.cpp
│       └── type_and_constant_canonicalization/
│           ├── AddListNumElements.cpp
│           ├── CanonicaliseConstants.cpp
│           ├── CastUnsupportedInputs.cpp
│           ├── CheckAndChangeOutputTypes.cpp
│           ├── EvaluateConstexprs.cpp
│           └── MakeConstantIntParams.cpp
├── poptorch_compiler/
│   └── pytorch_bridge/
│       ├── CMakeLists.txt
│       ├── IpuSession.cpp
│       └── include/
│           └── pytorch_bridge/
│               ├── CompilerOptions.hpp
│               ├── CompilerTypes.hpp
│               ├── DebugInfo.hpp
│               └── IpuSession.hpp
├── poptorch_err/
│   ├── CMakeLists.txt
│   ├── exception_info/
│   │   └── poptorch_err/
│   │       └── ExceptionInfo.hpp
│   ├── include/
│   │   └── poptorch_err/
│   │       └── ExceptionHandling.hpp
│   └── source/
│       └── ExceptionHandling.cpp
├── poptorch_geometric/
│   ├── CMakeLists.txt
│   ├── License.txt
│   ├── MANIFEST.in
│   ├── README.md
│   ├── config.buildenv.py
│   ├── poptorch_geometric_third_party_licenses.txt
│   ├── pyproject.toml
│   ├── python/
│   │   ├── CMakeLists.txt
│   │   ├── __init__.py
│   │   ├── cluster_loader.py
│   │   ├── collate.py
│   │   ├── common.py
│   │   ├── dataloader.py
│   │   ├── fixed_size_options.py
│   │   ├── masker.py
│   │   ├── neighbor_loader.py
│   │   ├── ops/
│   │   │   ├── __init__.py
│   │   │   ├── aggregation_base.py
│   │   │   ├── cluster_gcn_conv.py
│   │   │   ├── hetero_linear.py
│   │   │   ├── instance_norm.py
│   │   │   ├── knn.py
│   │   │   ├── knn_graph.py
│   │   │   ├── knn_interpolate.py
│   │   │   ├── mf_conv.py
│   │   │   └── radius.py
│   │   ├── override.py
│   │   ├── py.typed
│   │   ├── pyg_cluster_loader.py
│   │   ├── pyg_collate.py
│   │   ├── pyg_dataloader.py
│   │   ├── stream_packing_sampler.py
│   │   ├── types.py
│   │   └── utils.py
│   ├── requirements.txt
│   ├── setup.cfg
│   └── setup.py
├── poptorch_logging/
│   ├── CMakeLists.txt
│   ├── include/
│   │   └── poptorch_logging/
│   │       ├── Error.hpp
│   │       ├── Logging.hpp
│   │       ├── LoggingLight.hpp
│   │       └── Tracepoint.hpp
│   └── source/
│       ├── Error.cpp
│       ├── Logging.cpp
│       └── Tracepoint.cpp
├── poptorch_third_party_licenses.txt
├── pyproject.toml
├── python/
│   ├── CMakeLists.txt
│   ├── __init__.py
│   ├── _args_parser.py
│   ├── _dataloader.py
│   ├── _impl.py
│   ├── _logging.py
│   ├── _optimizer_attributes.py
│   ├── _options_config.py
│   ├── _options_impl.py
│   ├── _poplar_executor.py
│   ├── _poptorch_data.py
│   ├── _printing.py
│   ├── _utils.py
│   ├── enums.py
│   ├── ops.py
│   ├── optim.py
│   ├── options.py
│   ├── poptorch.cpp
│   ├── profiling.py
│   ├── py.typed
│   └── testing.py
├── requirements.txt
├── scripts/
│   ├── PopAtenHandlers.py
│   ├── PopParse.py
│   ├── PopTorchHandlers.py
│   ├── __init__.py
│   ├── apply_linters.py
│   ├── check_spelling.py
│   ├── create_buildenv.py
│   ├── docs_build.py
│   ├── download_external_datasets.py
│   ├── enable.sh.in
│   ├── generate_poppyg_package.py
│   ├── generate_python_package.py
│   ├── popgen/
│   │   ├── __init__.py
│   │   ├── api.py
│   │   ├── generator.py
│   │   ├── helpers.py
│   │   ├── onnx.py
│   │   ├── operatorfactory.py
│   │   ├── poptorch.py
│   │   ├── registry.py
│   │   ├── transform.py
│   │   └── values.py
│   ├── set_version.py
│   └── utils/
│       └── _utils.py
├── setup.cfg
├── setup.py
├── tests/
│   ├── .gitignore
│   ├── CMakeLists.txt
│   ├── activations_test.py
│   ├── attach_detach_test.py
│   ├── attach_detach_wait_for_ipu_test.py
│   ├── batching_test.py
│   ├── bert_small_and_medium_test.py
│   ├── blas_test.py
│   ├── bool_support_test.py
│   ├── buffers_test.py
│   ├── conftest.py
│   ├── convs_test.py
│   ├── cpp/
│   │   ├── CMakeLists.txt
│   │   └── GNNOptimizationsTest.cpp
│   ├── cpu_op_test.py
│   ├── ctc_decoder_test.py
│   ├── custom_loss_test.py
│   ├── custom_ops/
│   │   ├── CMakeLists.txt
│   │   ├── custom_add_scalar_op.cpp
│   │   ├── custom_add_scalar_vec_op.cpp
│   │   ├── custom_add_vec_scalar_mul_op.cpp
│   │   ├── custom_cube_op.cpp
│   │   ├── custom_leaky_relu_op.cpp
│   │   ├── custom_many_attribute_op.cpp
│   │   ├── custom_reduce_op.cpp
│   │   └── custom_three_input_reduce_op.cpp
│   ├── custom_ops_attributes_test.py
│   ├── custom_ops_test.py
│   ├── dataloader_test.py
│   ├── debug_tensors_test.py
│   ├── distance_ops_test.py
│   ├── exception_test.py
│   ├── fine_tuning_test.py
│   ├── functional_test.py
│   ├── generate_test_file.py
│   ├── gnn/
│   │   ├── .gitignore
│   │   ├── benchgnn/
│   │   │   ├── README.md
│   │   │   ├── benchgnn.py
│   │   │   ├── datasets.py
│   │   │   ├── models.py
│   │   │   ├── requirements.txt
│   │   │   └── utils.py
│   │   ├── benchgnn_ops/
│   │   │   ├── README.md
│   │   │   ├── benchgnn_ops.py
│   │   │   ├── builder.py
│   │   │   ├── example_configs/
│   │   │   │   ├── common.yaml
│   │   │   │   ├── scatter_testcase1.yaml
│   │   │   │   └── scatter_testcase2.yaml
│   │   │   ├── metrics.py
│   │   │   ├── ops.py
│   │   │   └── requirements.txt
│   │   ├── conftest.py
│   │   ├── nn/
│   │   │   ├── aggr/
│   │   │   │   ├── aggr_utils.py
│   │   │   │   ├── conftest.py
│   │   │   │   ├── test_attention.py
│   │   │   │   ├── test_basic.py
│   │   │   │   ├── test_deep_sets.py
│   │   │   │   ├── test_equilibrium.py
│   │   │   │   ├── test_fused.py
│   │   │   │   ├── test_gmt.py
│   │   │   │   ├── test_gru.py
│   │   │   │   ├── test_lstm.py
│   │   │   │   ├── test_mlp_aggr.py
│   │   │   │   ├── test_multi.py
│   │   │   │   ├── test_quantile.py
│   │   │   │   ├── test_scaler.py
│   │   │   │   ├── test_set2set.py
│   │   │   │   ├── test_set_transformer.py
│   │   │   │   └── test_sort.py
│   │   │   ├── conftest.py
│   │   │   ├── conv/
│   │   │   │   ├── conv_utils.py
│   │   │   │   ├── test_agnn_conv.py
│   │   │   │   ├── test_antisymmetric_conv.py
│   │   │   │   ├── test_appnp.py
│   │   │   │   ├── test_arma_conv.py
│   │   │   │   ├── test_cg_conv.py
│   │   │   │   ├── test_cheb_conv.py
│   │   │   │   ├── test_cluster_gcn_conv.py
│   │   │   │   ├── test_dna_conv.py
│   │   │   │   ├── test_edge_conv.py
│   │   │   │   ├── test_eg_conv.py
│   │   │   │   ├── test_fa_conv.py
│   │   │   │   ├── test_feast_conv.py
│   │   │   │   ├── test_film_conv.py
│   │   │   │   ├── test_gat_conv.py
│   │   │   │   ├── test_gated_graph_conv.py
│   │   │   │   ├── test_gatv2_conv.py
│   │   │   │   ├── test_gcn2_conv.py
│   │   │   │   ├── test_gcn_conv.py
│   │   │   │   ├── test_gen_conv.py
│   │   │   │   ├── test_general_conv.py
│   │   │   │   ├── test_gin_conv.py
│   │   │   │   ├── test_gmm_conv.py
│   │   │   │   ├── test_gps_conv.py
│   │   │   │   ├── test_graph_conv.py
│   │   │   │   ├── test_gravnet_conv.py
│   │   │   │   ├── test_han_conv.py
│   │   │   │   ├── test_heat_conv.py
│   │   │   │   ├── test_hetero_conv.py
│   │   │   │   ├── test_hgt_conv.py
│   │   │   │   ├── test_hypergraph_conv.py
│   │   │   │   ├── test_le_conv.py
│   │   │   │   ├── test_lg_conv.py
│   │   │   │   ├── test_mf_conv.py
│   │   │   │   ├── test_nn_conv.py
│   │   │   │   ├── test_pan_conv.py
│   │   │   │   ├── test_pdn_conv.py
│   │   │   │   ├── test_pna_conv.py
│   │   │   │   ├── test_point_conv.py
│   │   │   │   ├── test_point_gnn_conv.py
│   │   │   │   ├── test_point_transformer_conv.py
│   │   │   │   ├── test_ppf_conv.py
│   │   │   │   ├── test_res_gated_graph_conv.py
│   │   │   │   ├── test_rgat_conv.py
│   │   │   │   ├── test_rgcn_conv.py
│   │   │   │   ├── test_sage_conv.py
│   │   │   │   ├── test_sg_conv.py
│   │   │   │   ├── test_signed_conv.py
│   │   │   │   ├── test_simple_conv.py
│   │   │   │   ├── test_spline_conv.py
│   │   │   │   ├── test_ssg_conv.py
│   │   │   │   ├── test_supergat_conv.py
│   │   │   │   ├── test_tag_conv.py
│   │   │   │   ├── test_transformer_conv.py
│   │   │   │   ├── test_wl_conv.py
│   │   │   │   ├── test_wl_conv_continuous.py
│   │   │   │   └── test_x_conv.py
│   │   │   ├── dense/
│   │   │   │   ├── dense_utils.py
│   │   │   │   └── test_convs.py
│   │   │   ├── functional/
│   │   │   │   ├── test_bro.py
│   │   │   │   └── test_gini.py
│   │   │   ├── kge/
│   │   │   │   ├── kge_utils.py
│   │   │   │   ├── test_complex.py
│   │   │   │   ├── test_distmult.py
│   │   │   │   ├── test_rotate.py
│   │   │   │   └── test_transe.py
│   │   │   ├── nn_utils.py
│   │   │   ├── norm/
│   │   │   │   ├── norm_utils.py
│   │   │   │   ├── test_batch_norm.py
│   │   │   │   ├── test_diff_group_norm.py
│   │   │   │   ├── test_graph_norm.py
│   │   │   │   ├── test_graph_size_norm.py
│   │   │   │   ├── test_instance_norm.py
│   │   │   │   ├── test_layer_norm.py
│   │   │   │   ├── test_mean_subtraction_norm.py
│   │   │   │   ├── test_msg_norm.py
│   │   │   │   └── test_pair_norm.py
│   │   │   ├── pool/
│   │   │   │   ├── pool_utils.py
│   │   │   │   ├── test_asap.py
│   │   │   │   ├── test_avg_pool.py
│   │   │   │   ├── test_consecutive.py
│   │   │   │   ├── test_decimation.py
│   │   │   │   ├── test_edge_pool.py
│   │   │   │   ├── test_fps.py
│   │   │   │   ├── test_glob.py
│   │   │   │   ├── test_graclus.py
│   │   │   │   ├── test_max_pool.py
│   │   │   │   ├── test_mem_pool.py
│   │   │   │   ├── test_pan_pool.py
│   │   │   │   ├── test_pool_knn.py
│   │   │   │   ├── test_radius.py
│   │   │   │   ├── test_sag_pool.py
│   │   │   │   ├── test_select_topk.py
│   │   │   │   ├── test_topk_pool.py
│   │   │   │   └── test_voxel_grid.py
│   │   │   ├── test_linear.py
│   │   │   ├── test_loss.py
│   │   │   ├── test_mish.py
│   │   │   ├── test_sequential.py
│   │   │   └── unpool/
│   │   │       └── test_interpolate.py
│   │   ├── ops/
│   │   │   ├── test_knn.py
│   │   │   ├── test_knn_graph.py
│   │   │   ├── test_knn_interpolate.py
│   │   │   ├── test_nearest.py
│   │   │   ├── test_radius_op.py
│   │   │   ├── test_spline_conv_ops.py
│   │   │   └── test_to_dense_batch.py
│   │   ├── test_basic_gnn.py
│   │   ├── test_cluster_loader.py
│   │   ├── test_collate.py
│   │   ├── test_dataloader.py
│   │   ├── test_encoding.py
│   │   ├── test_fixed_size_options.py
│   │   ├── test_masker.py
│   │   ├── test_model_args.py
│   │   ├── test_neighbor_loader.py
│   │   ├── test_register_custom_args.py
│   │   ├── test_stream_packing_sampler.py
│   │   └── utils.py
│   ├── grouping_scatters_gathers_test.py
│   ├── gru_test.py
│   ├── half_float_test.py
│   ├── half_test.py
│   ├── helpers.py
│   ├── hooks_test.py
│   ├── if_test.py
│   ├── index_ops_test.py
│   ├── inplace_test.py
│   ├── inputs_test.py
│   ├── io_performance_test.py
│   ├── ipu_print_tensor_test.py
│   ├── loop_test.py
│   ├── losses_test.py
│   ├── lstm_test.py
│   ├── math_ops_test.py
│   ├── misc_nn_layers_test.py
│   ├── misc_test.py
│   ├── multiconv_test.py
│   ├── non_contiguous_tensors_test.py
│   ├── norms_test.py
│   ├── ops_test.py
│   ├── optimizers_test.py
│   ├── options_test.py
│   ├── other_ops_test.py
│   ├── outputs_test.py
│   ├── overlapped_io_test.py
│   ├── phased_execution_test.py
│   ├── pipelining_test.py
│   ├── pooling_and_padding_test.py
│   ├── popdist_test.py
│   ├── poplar_executor_test.py
│   ├── precompilation_test.py
│   ├── pyg_torch_scatter_test.py
│   ├── random_sampling_test.py
│   ├── reduce_ops_test.py
│   ├── replicated_graph_test.py
│   ├── requires_grad_test.py
│   ├── rnn_test.py
│   ├── sharding_test.py
│   ├── slice_test.py
│   ├── tensor_ops_test.py
│   ├── test_doc_urls.py
│   ├── test_perf_counters.py
│   ├── timeout_handler.py
│   ├── torch_nn_test.py
│   ├── torchvision_inference_test.py
│   ├── type_support_test.py
│   └── weights_writing_test.py
└── version.json

================================================
FILE CONTENTS
================================================

================================================
FILE: .ci/view_component_trigger/Jenkinsfile
================================================
@Library('sw-jenkins-library@view-component-trigger') _

viewComponentTrigger(jobsFilepath: '.ci/view_component_trigger/jobs.groovy')


================================================
FILE: .ci/view_component_trigger/jobs.groovy
================================================
[
    [
         job: '/poptorch/poptorch_pr',
         parameters: [
             string(name: 'GCCI_BRANCH', value: 'mk2-main')
         ]
    ],
]


================================================
FILE: .clang-format
================================================
Language: Cpp
BasedOnStyle: llvm


================================================
FILE: .clang-tidy
================================================
Checks: '*, -abseil*, -altera*, -android*, -cppcoreguidelines*, -cert*, -modernize*, -boost*, -google*, -fuchsia*, -hicpp*, -objc*, -llvm*, -bugprone-exception-escape, -readability-uppercase-literal-suffix, -misc-non-private-member-variables-in-classes, -fuchsia-default-arguments-declarations, -fuchsia-default-arguments-calls, -readability-magic-numbers, -fuchsia-overloaded-operator, -performance-noexcept-move-constructor, -concurrency-mt-unsafe, -readability-function-cognitive-complexity, -misc-throw-by-value-catch-by-reference, -misc-no-recursion, -bugprone-narrowing-conversions, -bugprone-easily-swappable-parameters, -readability-make-member-function-const, -readability-use-anyofallof, -readability-identifier-length,-misc-confusable-identifiers,-bugprone-reserved-identifier,-misc-unused-using-decls'
WarningsAsErrors: '*'
HeaderFilterRegex: ''
AnalyzeTemporaryDtors: false
CheckOptions:
  - key:             readability-identifier-naming.NamespaceCase
    value:           lower_case
  - key:             readability-identifier-naming.ClassCase
    value:           CamelCase
  - key:             readability-identifier-naming.StructCase
    value:           CamelCase
  - key:             readability-identifier-naming.PrivateMemberPrefix
    value:           _
  - key:             readability-identifier-naming.ProtectedMemberPrefix
    value:           _
  - key:             readability-identifier-naming.MemberCase
    value:           lower_case
  - key:             readability-identifier-naming.StructCase
    value:           CamelCase
  - key:             readability-identifier-naming.MethodCase
    value:           camelBack
  - key:             readability-identifier-naming.FunctionCase
    value:           camelBack
  - key:             readability-identifier-naming.VariableCase
    value:           lower_case
  - key:             misc-throw-by-value-catch-by-reference.MaxSize
    value:           '8'


================================================
FILE: .github/CODEOWNERS
================================================
* @Software-GCAI/poptorch


================================================
FILE: .github/workflows/apply_linters.yml
================================================
name: apply_linters.py git trailer check

on:
  push:
    branches: [mk2-main]
  pull_request:
    branches: [mk2-main]

jobs:
  apply_linters:
    timeout-minutes: 10
    name: apply_linters.py git trailer check
    runs-on: [self-hosted, linux]
    steps:
    - uses: actions/checkout@v3
      with:
        # 0 indicates fetch history for all branches and tags.
        # By default the checkout action only checks out the PR
        # ref. However, apply_linters.py needs to run git commands
        # that reference origin/mk2-main.
        fetch-depth: 0
        # Checkout the head instead of the merge commit
        ref: ${{ github.event.pull_request.head.sha }}
    - name: Verify most recent commit's git trailer
      run: python scripts/apply_linters.py --check-trailer


================================================
FILE: .gitignore
================================================
build
__pycache__
.linters
.cache
.vscode
test_data


================================================
FILE: .pre-commit-config.yaml
================================================
repos:
-   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.3.0
    hooks:
    -   id: check-yaml
    -   id: end-of-file-fixer
    -   id: check-merge-conflict
    -   id: trailing-whitespace
-   repo: local
    hooks:
    -   id: apply_linters
        name: apply_linters
        entry: scripts/apply_linters.py
        language: python
        args: [-a, --add-trailer-on-success, --debug, --git-strategy=pre-commit]
        additional_dependencies: [pyyaml==6.0.0, packaging==23.0.0, colorama==0.4.6]
        # For the git trailer to be correct apply_linters.py must be applied to all the files.


================================================
FILE: .pylintrc
================================================
[MASTER]
# Prevent pylint from incorrectly reporting 'has no member' for C modules
# by allowing them to be loaded (does not happen by default due to security
# concerns)
extension-pkg-whitelist=numpy,torch

# Specify a configuration file.
#rcfile=

# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
# This adds <root_dir>/tests to the path (Needed for tests in subfolders to find helpers.py)
init-hook="from pylint.config import find_pylintrc; import os, sys; sys.path.append(os.path.join(os.path.dirname(os.path.realpath(find_pylintrc())), 'tests'))"

# Profiled execution.
profile=no

# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=git

# Pickle collected data for later comparisons.
persistent=yes

# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=


[MESSAGES CONTROL]

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifiers separated by commas (,) or put this option
# multiple times. See also the "--disable" option for examples.
enable=indexing-exception,old-raise-syntax

# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=global-statement,no-self-use,import-error,missing-docstring,invalid-name,redefined-builtin,too-few-public-methods,redefined-outer-name,no-name-in-module,not-callable,too-many-instance-attributes,too-many-branches,too-many-locals,too-many-arguments,too-many-statements

# not-callable leads to false positives due to PyTorch's tensor.py


# Set the cache size for astng objects.
cache-size=500


[REPORTS]

# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html. You can also give a reporter class, eg
# mypackage.mymodule.MyReporterClass.
output-format=text

# Put messages in a separate file for each module / package specified on the
# command line instead of printing them on stdout. Reports (if any) will be
# written in a file name "pylint_global.[txt|html]".
files-output=no

# Tells whether to display a full report or only the messages
reports=no

# Python expression which should return a note less than or equal to 10 (10 is
# the highest note). You have access to the variables error, warning, refactor,
# convention and statement, which respectively contain the number of error,
# warning, refactor and convention messages and the total number of statements
# analyzed. This is used by the global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)

# Add a comment according to your evaluation note. This is used by the global
# evaluation report (RP0004).
comment=no

# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=


[TYPECHECK]

# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes

# List of classes names for which member attributes should not be checked
# (useful for classes with attributes dynamically set).
# Workaround for pylint incorrectly reporting 'has no member' for torch
# https://github.com/pytorch/pytorch/issues/701
ignored-classes=SQLObject,torch
ignored-modules=torch

# When zope mode is activated, add a predefined set of Zope acquired attributes
# to generated-members.
zope=no

# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E0201 when accessed. Python regular
# expressions are accepted.
generated-members=REQUEST,acl_users,aq_parent

# List of decorators that create context managers from functions, such as
# contextlib.contextmanager.
contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager


[VARIABLES]

# Tells whether we should check for unused import in __init__ files.
init-import=no

# A regular expression matching the beginning of the name of dummy variables
# (i.e. not used).
dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)

# List of additional names supposed to be defined in builtins. Remember that
# you should avoid to define new builtins when possible.
additional-builtins=


[BASIC]

# Required attributes for module, separated by a comma
required-attributes=

# List of builtins function names that should not be used, separated by a comma
bad-functions=apply,input,reduce


# Disable the report(s) with the given id(s).
# All non-Google reports are disabled by default.
disable-report=R0001,R0002,R0003,R0004,R0101,R0102,R0201,R0202,R0220,R0401,R0402,R0701,R0801,R0901,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R0921,R0922,R0923

# Regular expression which should only match correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$

# Regular expression which should only match correct module level names
const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$

# Regular expression which should only match correct class names
class-rgx=^_?[A-Z][a-zA-Z0-9]*$

# Regular expression which should only match correct function names
function-rgx=^(?:(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$

# Regular expression which should only match correct method names
method-rgx=^(?:(?P<exempt>__[a-z0-9_]+__|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$

# Regular expression which should only match correct instance attribute names
attr-rgx=^_{0,2}[a-z][a-z0-9_]*$

# Regular expression which should only match correct argument names
argument-rgx=^[a-z][a-z0-9_]*$

# Regular expression which should only match correct variable names
variable-rgx=^[a-z][a-z0-9_]*$

# Regular expression which should only match correct attribute names in class
# bodies
class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$

# Regular expression which should only match correct list comprehension /
# generator expression variable names
inlinevar-rgx=^[a-z][a-z0-9_]*$

# Good variable names which should always be accepted, separated by a comma
good-names=main,_

# Bad variable names which should always be refused, separated by a comma
bad-names=

# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=(__.*__|main)

# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=10


[FORMAT]

# Maximum number of characters on a single line.
max-line-length=80

# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=(?x)
  (^\s*(import|from)\s
   |\$Id:\s\/\/depot\/.+#\d+\s\$
   |^[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*("[^"]\S+"|'[^']\S+')
   |^\s*\#\ LINT\.ThenChange
   |^[^#]*\#\ type:\ [a-zA-Z_][a-zA-Z0-9_.,[\] ]*$
   |pylint
   |"""
   |\#
   |lambda
   |(https?|ftp):)

# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=y

# List of optional constructs for which whitespace checking is disabled
no-space-check=trailing-comma

# Maximum number of lines in a module
max-module-lines=99999

# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
# tab).
indent-string='    '


[SIMILARITIES]

# Minimum lines number of a similarity.
min-similarity-lines=4

# Ignore comments when computing similarities.
ignore-comments=yes

# Ignore docstrings when computing similarities.
ignore-docstrings=yes

# Ignore imports when computing similarities.
ignore-imports=no


[MISCELLANEOUS]

# List of note tags to take in consideration, separated by a comma.
notes=


[IMPORTS]

# Deprecated modules which should not be used, separated by a comma
deprecated-modules=regsub,TERMIOS,Bastion,rexec,sets

# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=

# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=

# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=


[CLASSES]

# List of interface methods to ignore, separated by a comma. This is used for
# instance to not check methods defines in Zope's Interface base class.
ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by

# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp

# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls,class_

# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs


[DESIGN]

# Maximum number of arguments for function / method
max-args=5

# Argument names that match this expression will be ignored. Default to name
# with leading underscore
ignored-argument-names=_.*

# Maximum number of locals for function / method body
max-locals=15

# Maximum number of return / yield for function / method body
max-returns=6

# Maximum number of branch for function / method body
max-branches=12

# Maximum number of statements in function / method body
max-statements=50

# Maximum number of parents for a class (see R0901).
max-parents=7

# Maximum number of attributes for a class (see R0902).
max-attributes=7

# Minimum number of public methods for a class (see R0903).
min-public-methods=2

# Maximum number of public methods for a class (see R0904).
max-public-methods=20


[EXCEPTIONS]

# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception,StandardError,BaseException


[AST]

# Maximum line length for lambdas
short-func-length=1

# List of module members that should be marked as deprecated.
# All of the string functions are listed in 4.1.4 Deprecated string functions
# in the Python 2.4 docs.
deprecated-members=string.atof,string.atoi,string.atol,string.capitalize,string.expandtabs,string.find,string.rfind,string.index,string.rindex,string.count,string.lower,string.split,string.rsplit,string.splitfields,string.join,string.joinfields,string.lstrip,string.rstrip,string.strip,string.swapcase,string.translate,string.upper,string.ljust,string.rjust,string.center,string.zfill,string.replace,sys.exitfunc


[DOCSTRING]

# List of exceptions that do not need to be mentioned in the Raises section of
# a docstring.
ignore-exceptions=AssertionError,NotImplementedError,StopIteration,TypeError


[TOKENS]

# Number of spaces of indent required when the last token on the preceding line
# is an open (, [, or {.
indent-after-paren=4


[GOOGLE LINES]

# Regexp for a proper copyright notice.
copyright=Copyright \d{4} The TensorFlow Authors\. +All [Rr]ights [Rr]eserved\.


================================================
FILE: .style.yapf
================================================
[style]
# Align closing bracket with visual indentation.
align_closing_bracket_with_visual_indent=True

# Allow dictionary keys to exist on multiple lines. For example:
#
#   x = {
#       ('this is the first element of a tuple',
#        'this is the second element of a tuple'):
#            value,
#   }
allow_multiline_dictionary_keys=False

# Allow lambdas to be formatted on more than one line.
allow_multiline_lambdas=False

# Allow splits before the dictionary value.
allow_split_before_dict_value=True

# Number of blank lines surrounding top-level function and class
# definitions.
blank_lines_around_top_level_definition=2

# Insert a blank line before a class-level docstring.
blank_line_before_class_docstring=False

# Insert a blank line before a module docstring.
blank_line_before_module_docstring=False

# Insert a blank line before a 'def' or 'class' immediately nested
# within another 'def' or 'class'. For example:
#
#   class Foo:
#                      # <------ this blank line
#     def method():
#       ...
blank_line_before_nested_class_or_def=False

# Do not split consecutive brackets. Only relevant when
# dedent_closing_brackets is set. For example:
#
#    call_func_that_takes_a_dict(
#        {
#            'key1': 'value1',
#            'key2': 'value2',
#        }
#    )
#
# would reformat to:
#
#    call_func_that_takes_a_dict({
#        'key1': 'value1',
#        'key2': 'value2',
#    })
coalesce_brackets=False

# The column limit.
column_limit=79

# The style for continuation alignment. Possible values are:
#
# - SPACE: Use spaces for continuation alignment. This is default behavior.
# - FIXED: Use fixed number (CONTINUATION_INDENT_WIDTH) of columns
#   (ie: CONTINUATION_INDENT_WIDTH/INDENT_WIDTH tabs) for continuation
#   alignment.
# - LESS: Slightly left if cannot vertically align continuation lines with
#   indent characters.
# - VALIGN-RIGHT: Vertically align continuation lines with indent
#   characters. Slightly right (one more indent character) if cannot
#   vertically align continuation lines with indent characters.
#
# The FIXED and VALIGN-RIGHT options are only available when USE_TABS is
# enabled.
continuation_align_style=SPACE

# Indent width used for line continuations.
continuation_indent_width=4

# Put closing brackets on a separate line, dedented, if the bracketed
# expression can't fit in a single line. Applies to all kinds of brackets,
# including function definitions and calls. For example:
#
#   config = {
#       'key1': 'value1',
#       'key2': 'value2',
#   }        # <--- this bracket is dedented and on a separate line
#
#   time_series = self.remote_client.query_entity_counters(
#       entity='dev3246.region1',
#       key='dns.query_latency_tcp',
#       transform=Transformation.AVERAGE(window=timedelta(seconds=60)),
#       start_ts=now()-timedelta(days=3),
#       end_ts=now(),
#   )        # <--- this bracket is dedented and on a separate line
dedent_closing_brackets=False

# Disable the heuristic which places each list element on a separate line
# if the list is comma-terminated.
disable_ending_comma_heuristic=False

# Place each dictionary entry onto its own line.
each_dict_entry_on_separate_line=True

# The regex for an i18n comment. The presence of this comment stops
# reformatting of that line, because the comments are required to be
# next to the string they translate.
i18n_comment=

# The i18n function call names. The presence of this function stops
# reformatting on that line, because the string it has cannot be moved
# away from the i18n comment.
i18n_function_call=

# Indent the dictionary value if it cannot fit on the same line as the
# dictionary key. For example:
#
#   config = {
#       'key1':
#           'value1',
#       'key2': value1 +
#               value2,
#   }
indent_dictionary_value=False

# The number of columns to use for indentation.
indent_width=4

# Join short lines into one line. E.g., single line 'if' statements.
join_multiple_lines=True

# Do not include spaces around selected binary operators. For example:
#
#   1 + 2 * 3 - 4 / 5
#
# will be formatted as follows when configured with *,/:
#
#   1 + 2*3 - 4/5
#
no_spaces_around_selected_binary_operators=set([])

# Use spaces around default or named assigns.
spaces_around_default_or_named_assign=False

# Use spaces around the power operator.
spaces_around_power_operator=False

# The number of spaces required before a trailing comment.
spaces_before_comment=2

# Insert a space between the ending comma and closing bracket of a list,
# etc.
space_between_ending_comma_and_closing_bracket=True

# Split before arguments
split_all_comma_separated_values=False

# Split before arguments if the argument list is terminated by a
# comma.
split_arguments_when_comma_terminated=False

# Set to True to prefer splitting before '&', '|' or '^' rather than
# after.
split_before_bitwise_operator=True

# Split before the closing bracket if a list or dict literal doesn't fit on
# a single line.
split_before_closing_bracket=True

# Split before a dictionary or set generator (comp_for). For example, note
# the split before the 'for':
#
#   foo = {
#       variable: 'Hello world, have a nice day!'
#       for variable in bar if variable != 42
#   }
split_before_dict_set_generator=True

# Split after the opening paren which surrounds an expression if it doesn't
# fit on a single line.
split_before_expression_after_opening_paren=False

# If an argument / parameter list is going to be split, then split before
# the first argument.
split_before_first_argument=False

# Set to True to prefer splitting before 'and' or 'or' rather than
# after.
split_before_logical_operator=True

# Split named assignments onto individual lines.
split_before_named_assigns=True

# Set to True to split list comprehensions and generators that have
# non-trivial expressions and multiple clauses before each of these
# clauses. For example:
#
#   result = [
#       a_long_var + 100 for a_long_var in xrange(1000)
#       if a_long_var % 10]
#
# would reformat to something like:
#
#   result = [
#       a_long_var + 100
#       for a_long_var in xrange(1000)
#       if a_long_var % 10]
split_complex_comprehension=False

# The penalty for splitting right after the opening bracket.
split_penalty_after_opening_bracket=30

# The penalty for splitting the line after a unary operator.
split_penalty_after_unary_operator=10000

# The penalty for splitting right before an if expression.
split_penalty_before_if_expr=0

# The penalty of splitting the line around the '&', '|', and '^'
# operators.
split_penalty_bitwise_operator=300

# The penalty for splitting a list comprehension or generator
# expression.
split_penalty_comprehension=80

# The penalty for characters over the column limit.
split_penalty_excess_character=4500

# The penalty incurred by adding a line split to the unwrapped line. The
# more line splits added the higher the penalty.
split_penalty_for_added_line_split=30

# The penalty of splitting a list of "import as" names. For example:
#
#   from a_very_long_or_indented_module_name_yada_yad import (long_argument_1,
#                                                             long_argument_2,
#                                                             long_argument_3)
#
# would reformat to something like:
#
#   from a_very_long_or_indented_module_name_yada_yad import (
#       long_argument_1, long_argument_2, long_argument_3)
split_penalty_import_names=0

# The penalty of splitting the line around the 'and' and 'or'
# operators.
split_penalty_logical_operator=300

# Use the Tab character for indentation.
use_tabs=False


================================================
FILE: CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(poptorch)
include(GNUInstallDirs)

# Export compile_commands.json for tooling that consumes it.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Unless the user explicitly picked an install prefix, install into
# <build folder>/install.
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
  set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/install CACHE PATH "Default to local install prefix" FORCE)
endif()

# User-configurable cache options (override with -D<NAME>=<value>).
# POPLAR_DIR, POPART_DIR and SDK_DIR have no default value.
set(USE_PYTORCH_PACKAGE_HEADERS ON CACHE BOOL "Use the Torch headers distributed with the pytorch package.")
set(POPLAR_DIR CACHE PATH "Path to a Poplar install")
set(POPART_DIR CACHE PATH "Path to a Popart install")
set(SNAPSHOT "" CACHE STRING "Snapshot ID to use for the documentation")
set(SDK_DIR CACHE PATH "Path to an extracted SDK archive or to a Poplar & Popart install directory (Will populate POPLAR_DIR and POPART_DIR)")
set(BUILD_DOCS OFF CACHE BOOL "Build PopTorch's documentation")
set(COPY_TESTS OFF CACHE BOOL "Copy tests files to the build folder (instead of running them from the sources folder)")
set(ENABLE_WERROR ON CACHE BOOL "Treat C++ warnings as errors")
set(EXTRA_PYTEST_ARGS "" CACHE STRING "Extra arguments to pass to pytest when generating the list of tests to run")

# Always use the gold linker to avoid segfaults with PopART / Poplar on some OSes.
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
  # The REQUIRED keyword of find_program() only exists since CMake 3.18 but
  # this project supports CMake 3.14 and later, so check the result
  # explicitly: that way a missing ld.gold is reported at configure time with
  # every supported CMake version instead of failing later at link time.
  find_program(GOLD_EXECUTABLE ld.gold)
  if(NOT GOLD_EXECUTABLE)
    message(FATAL_ERROR "Could not find the gold linker (ld.gold): it is required when building with GCC")
  endif()
  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold")
  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=gold")
  set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -fuse-ld=gold")
endif()

# Note: The next line is also parsed by scripts/utils/_utils.py
set(TORCH_VERSION 2.0.1)

# Convert to cmake list
string(REPLACE "." ";" TORCH_VERSION_AS_LIST ${TORCH_VERSION})

# Get the minor component. (Versions are Major.Minor.Patch)
list(GET TORCH_VERSION_AS_LIST 1 TORCH_MINOR_VERSION)

# Supported build types: the first entry ("Release") is the default used
# when CMAKE_BUILD_TYPE was not specified.
set(CMAKE_CONFIGURATION_TYPES "Release" "Debug" "MinSizeRel" "RelWithDebInfo")
# Advertise the valid values to cmake-gui / ccmake.
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYPES})
if(NOT CMAKE_BUILD_TYPE)
  list(GET CMAKE_CONFIGURATION_TYPES 0 CMAKE_BUILD_TYPE)
  message(STATUS "Setting build type to '${CMAKE_BUILD_TYPE}' as none was specified")
endif()
# Reject any build type which is not in the list above.
if(NOT CMAKE_BUILD_TYPE IN_LIST CMAKE_CONFIGURATION_TYPES)
  message(FATAL_ERROR "CMAKE_BUILD_TYPE must be one of ${CMAKE_CONFIGURATION_TYPES}")
endif()

# Locate the Torch package installed in the active Python environment and
# find out whether it is a debug or a release build.
if(USE_PYTORCH_PACKAGE_HEADERS)
  # Print the folder containing the torch module (without a trailing newline)
  # and add it to the search path used by find_package().
  execute_process(COMMAND python3 -c "import torch; from pathlib import Path; print(Path(torch.__file__).parent, end='')"
    OUTPUT_VARIABLE TORCH_PATH)
  list(APPEND CMAKE_PREFIX_PATH ${TORCH_PATH})
  # An empty output means the python command did not print a path.
  if(NOT TORCH_PATH)
    message(FATAL_ERROR "python3 -c \"import torch\" failed: check that your virtual environment is active and torch is installed")
  endif()
  # The process exit code carries the answer: 3 for a debug build of torch,
  # 4 for a release build. Any other value means the command itself failed.
  execute_process(COMMAND python3 -c "import torch; import sys; sys.exit(3 if torch.version.debug else 4)"
    RESULT_VARIABLE TORCH_DEBUG)
  if(TORCH_DEBUG LESS 3 OR TORCH_DEBUG GREATER 4)
    message(FATAL_ERROR "python3 -c \"import torch\" failed: check that your virtual environment is active and torch is installed")
  endif()
  if (TORCH_DEBUG EQUAL 4)
    # We include torch header files which respond to this flag, so we have to
    # set it correctly in order to get consistent torch behaviour.
    add_compile_definitions(NDEBUG)
  endif()
endif()

# If an SDK directory was provided, look for the PopART and Poplar folders
# directly inside it and use them to populate POPART_DIR and POPLAR_DIR.
if(EXISTS ${SDK_DIR})
  execute_process(COMMAND find ${SDK_DIR} -maxdepth 1 -type d -name "popart*"
    OUTPUT_VARIABLE POPART_DIR OUTPUT_STRIP_TRAILING_WHITESPACE)
  # The two -name tests must be grouped: without the parentheses "-o" binds
  # more loosely than the implicit "-a", so "-type d" would only apply to
  # "poplar-*" and a regular file named "poplar" would also match.
  # (No shell is involved so the parentheses do not need escaping, they are
  # only quoted to be passed as plain arguments by CMake.)
  execute_process(COMMAND find ${SDK_DIR} -maxdepth 1 -type d "(" -name "poplar-*" -o -name "poplar" ")"
    OUTPUT_VARIABLE POPLAR_DIR OUTPUT_STRIP_TRAILING_WHITESPACE)
  if(NOT IS_DIRECTORY "${POPLAR_DIR}")
    message(FATAL_ERROR "Couldn't find a \"poplar\" or \"poplar-*\" folder in '${SDK_DIR}'")
  endif()
  if(NOT IS_DIRECTORY "${POPART_DIR}")
    message(FATAL_ERROR "Couldn't find a \"popart*\" folder in '${SDK_DIR}'")
  endif()
endif()

# Use the Poplar install from POPLAR_DIR if it exists, otherwise fall back to
# a Poplar package which is already discoverable by find_package().
if(EXISTS ${POPLAR_DIR})
  list(APPEND CMAKE_PREFIX_PATH ${POPLAR_DIR})
  set(ENABLE_POPLAR_CMD "source ${POPLAR_DIR}/enable.sh")
else()
  # Check the package is not already in the path
  find_package(poplar)
  if(NOT poplar_FOUND)
    # The hint must point at a Poplar (not PopART) install.
    message(FATAL_ERROR "You must provide a path to a Poplar install using -DPOPLAR_DIR=/path/to/poplar/build/install")
  endif()
endif()

# Use the PopART install from POPART_DIR if it exists, otherwise fall back to
# a PopART package which is already discoverable by find_package().
if( EXISTS ${POPART_DIR} )
  list(APPEND CMAKE_PREFIX_PATH ${POPART_DIR})
  set(ENABLE_POPART_CMD "source ${POPART_DIR}/enable.sh")
else()
  find_package(popart COMPONENTS popart-only)
  if(NOT popart_FOUND)
    message(FATAL_ERROR "You must provide a path to a Popart build using -DPOPART_DIR=/path/to/popart/build")
  endif()
endif()

# When the packages were provided through POPART_DIR / POPLAR_DIR they have
# only been added to CMAKE_PREFIX_PATH so far: actually load them now.
if(NOT popart_FOUND)
  find_package(popart REQUIRED COMPONENTS popart-only)
endif()
if(NOT poplar_FOUND)
  find_package(poplar REQUIRED)
endif()

# From here on POPLAR_DIR is the folder containing the poplar library (and no
# longer the root of the Poplar install).
get_target_property(POPLAR_LIB poplar LOCATION)
get_filename_component(POPLAR_DIR ${POPLAR_LIB} DIRECTORY)
# Run an install command that requires PopTorch, PopArt and Poplar to be in the PATH.
#
# Arguments:
#   cmd:               the command (and its arguments) to execute.
#   working_directory: the directory to run the command from.
#   cmd_name:          the name used to refer to the command in the error
#                      message.
#
# The command is registered with install(CODE) so it runs as part of the
# install step. It is run with the PopART and Poplar library folders prepended
# to LD_LIBRARY_PATH and with POPTORCH_SMALL_IPU_MODEL set to 1. The install
# fails, printing the output of the command, if it returns a non-zero value.
#
# Note: every reference in the script below is expanded when this function is
# called (including $ENV{LD_LIBRARY_PATH}, which is therefore the value from
# the configure step), except \${OUTPUT} which is escaped and so is only
# evaluated when the install script runs.
function(run_poptorch_install_command cmd working_directory cmd_name)
  install(CODE
    "set(ENV{LD_LIBRARY_PATH} ${popart_LIB_DIR}:${POPLAR_DIR}:$ENV{LD_LIBRARY_PATH})
    set(ENV{POPTORCH_SMALL_IPU_MODEL} 1)
    execute_process( COMMAND ${cmd} WORKING_DIRECTORY ${working_directory} RESULT_VARIABLE RETVAL OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE OUTPUT)
    if(RETVAL AND NOT RETVAL EQUAL 0)
      message(FATAL_ERROR \"${cmd_name} FAILED: \${OUTPUT}\")
    endif()")
endfunction()

# Remove the USE_DISTRIBUTED and USE_RPC entries from the
# INTERFACE_COMPILE_DEFINITIONS property of `target`.
#
# The configuration is aborted if the property cannot be read from the target
# (or if it is empty).
function(remove_use_distributed_definition target)
  get_target_property(interface_definitions ${target} INTERFACE_COMPILE_DEFINITIONS)
  if(NOT interface_definitions)
    message(FATAL_ERROR "Could not get property INTERFACE_COMPILE_DEFINITIONS from target '${target}'.")
  endif()
  # Drop both definitions in one go and write the filtered list back.
  list(REMOVE_ITEM interface_definitions USE_DISTRIBUTED USE_RPC)
  set_target_properties(${target} PROPERTIES INTERFACE_COMPILE_DEFINITIONS "${interface_definitions}")
endfunction()

# Generate the test.sh helper script in the build folder: it sources
# enable.sh then forwards its arguments to ctest.
# "$@" is quoted so that arguments containing spaces (for example a -R regular
# expression) reach ctest as single arguments instead of being split again.
# Note: the shebang uses the value of the SHELL environment variable at
# configure time.
file(WRITE ${CMAKE_BINARY_DIR}/tmp/test.sh "#!$ENV{SHELL}
  source ${CMAKE_BINARY_DIR}/enable.sh
  export POPTORCH_TEST_TIMEOUT=3600
  export TORCH_SHOW_CPP_STACKTRACES=1
  ctest --output-on-failure --timeout $POPTORCH_TEST_TIMEOUT \"$@\"
  ")
# file(WRITE) cannot set permissions: the script is first written to a
# temporary folder then copied to its final location as an executable.
file(COPY ${CMAKE_BINARY_DIR}/tmp/test.sh
  DESTINATION ${CMAKE_BINARY_DIR}
  FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE)

enable_testing()

# Warnings enabled for every target defined below this point.
# The last entry exposes the minor version of Torch to the sources as the
# TORCH_MINOR_VERSION preprocessor definition.
add_compile_options(
  -Wall
  -pedantic
  -Wextra
  -Wdisabled-optimization
  -Wshadow
  -Wswitch
  -Wformat=2
  -Wimplicit-fallthrough
  -Winit-self
  -Wcomment
  -Wsequence-point
  -Wundef
  -Wuninitialized
  -DTORCH_MINOR_VERSION=${TORCH_MINOR_VERSION})

# Warnings are errors unless the user turned ENABLE_WERROR off.
if(${ENABLE_WERROR})
  add_compile_options(-Werror)
endif()

set(CMAKE_CXX_STANDARD 17)

# Folder the poptorch python package is installed to.
set(INSTALL_PYDIR ${CMAKE_INSTALL_PREFIX}/poptorch)

if(${CMAKE_SYSTEM_NAME} STREQUAL Darwin)
  set(CMAKE_INSTALL_RPATH "@loader_path")
  set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
else()
  # $ORIGIN/lib is needed by the standalone wheel: by default libraries expect
  # their dependencies to be in $ORIGIN/../lib therefore for Poplar
  # and Popart to work they need to be stored in a folder named "lib".
  # However the poptorch shared libraries which are loaded from python must be at
  # the root of the poptorch package, which is why we need to add $ORIGIN/lib
  # to the poptorch libraries.
  set(CMAKE_INSTALL_RPATH "$ORIGIN:$ORIGIN/lib")
endif()

# The installed Torch package must match TORCH_VERSION exactly.
find_package(Torch ${TORCH_VERSION} EXACT REQUIRED)

# Strip USE_DISTRIBUTED and USE_RPC from the definitions exported by the
# torch_cpu target (see remove_use_distributed_definition()).
remove_use_distributed_definition(torch_cpu)

# PopTorch components
add_subdirectory(poptorch_err)
add_subdirectory(poptorch_logging)
add_subdirectory(poptorch_compiler/pytorch_bridge)

add_subdirectory(poptorch/source/dispatch_tracer)
add_subdirectory(popart_compiler)
add_subdirectory(poptorch)
add_subdirectory(python)
add_subdirectory(poptorch_geometric)

# Examples and tests
add_subdirectory(tests)
add_subdirectory(examples)
add_subdirectory(docs/user_guide)

# Targets generating the PopTorch python packages (wheel / source
# distribution). They all call the same script with the same folders: only
# the package type and the optional --standalone argument differ.
set(_generate_package python3 ${PROJECT_SOURCE_DIR}/scripts/generate_python_package.py)
set(_package_dirs
  --include-dir ${CMAKE_INSTALL_PREFIX}/include
  --lib-dir ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}
  --output-dir ${CMAKE_INSTALL_PREFIX}/dist
  --python-dir ${INSTALL_PYDIR})

# Standalone packages: the PopART and Poplar library folders are passed to
# the script through --standalone.
add_custom_target(poptorch_standalone_wheel
  WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}
  COMMAND ${_generate_package} bdist_wheel ${_package_dirs} --standalone "${popart_LIB_DIR}:${POPLAR_DIR}"
  DEPENDS poptorch
)
add_custom_target(poptorch_standalone_sdist
  WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}
  COMMAND ${_generate_package} sdist ${_package_dirs} --standalone "${popart_LIB_DIR}:${POPLAR_DIR}"
  DEPENDS poptorch
)

# Regular packages.
add_custom_target(poptorch_wheel
  WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}
  COMMAND ${_generate_package} bdist_wheel ${_package_dirs}
  DEPENDS poptorch
)

add_custom_target(poptorch_sdist
  WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}
  COMMAND ${_generate_package} sdist ${_package_dirs}
  DEPENDS poptorch
)

# Ship the README at the root of the install folder.
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/README.md
        DESTINATION .)

# Generate enable.sh in the build folder from its template. With @ONLY only
# the @VAR@ references are substituted, ${VAR} is left untouched.
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/scripts/enable.sh.in
  ${PROJECT_BINARY_DIR}/enable.sh
  @ONLY)

# Also ship the generated enable.sh at the root of the install folder.
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/enable.sh
        DESTINATION .)

# "cmake --install" is available from CMake 3.15 onwards (3.15.0 included,
# hence VERSION_GREATER_EQUAL). With an older CMake the install step must be
# triggered manually.
if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.15")
  # Building poptorch without installing it doesn't make sense: the python
  # module cannot be used so always install after a build.
  add_custom_target(install_poptorch ALL
      COMMAND ${CMAKE_COMMAND} --install ${CMAKE_BINARY_DIR}
      DEPENDS poptorch custom_cube_op custom_leaky_relu_op custom_add_scalar_op custom_add_scalar_vec_op custom_add_vec_scalar_mul_op custom_reduce_op custom_three_input_reduce_op custom_many_attribute_op
  )
endif()


================================================
FILE: License.txt
================================================
The MIT License (MIT)

Copyright (c) 2020 Graphcore Limited

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


================================================
FILE: MANIFEST.in
================================================
recursive-include include *.hpp
include poptorch/lib/*
include poptorch/lib/poplar_rt/*
include poptorch/lib/graphcore/lib/*.a
include *.py
include *.toml
include License.txt
include poptorch_third_party_licenses.txt


================================================
FILE: README.md
================================================
# PopTorch and PopTorch Geometric.

## PopTorch - PyTorch integration for the Graphcore IPU

PopTorch is a set of extensions for PyTorch enabling models
to be trained, evaluated and used on the Graphcore IPU.

More information can be found in the [PopTorch User Guide](https://docs.graphcore.ai/projects/poptorch-user-guide/).

## PopTorch Geometric - PyTorch Geometric integration for the Graphcore IPU

PopTorch Geometric is a set of extensions for PyTorch Geometric, enabling Graph
Neural Network models to be trained, evaluated and used on the Graphcore IPU.
PopTorch Geometric depends on the functionality provided by PopTorch.

More information can be found in the [PopTorch Geometric User Guide](https://docs.graphcore.ai/projects/poptorch-geometric-user-guide).


## Prerequisites

These instructions assume you are building PopTorch and PopTorch Geometric on Ubuntu 20.04.

To install and run PopTorch and PopTorch Geometric you will need:

- Python 3.8
- pip3 >= 18.1
- The Poplar SDK

```sh
sudo apt install -y python3 python3-pip
```

To build PopTorch and PopTorch Geometric from sources you will need all of the above and:

- git
- curl
- g++

```sh
sudo apt install -y git curl g++
```

To build the documentation you will also need LaTeX:

```sh
sudo apt install -y texlive-full
```

## Install the Poplar SDK

The Poplar SDK can be downloaded from: https://www.graphcore.ai/downloads.

Set the following environment variable to point to the installed Poplar SDK:

```sh
export SDK_PATH=/path/to/poplar_sdk-ubuntu_20_04*
```

PopTorch must be built against a compatible version of the SDK. For example, the "sdk-release-3.2" branch of PopTorch must be built against Poplar SDK 3.2.

## Installation

Make sure `pip3` is up to date (You need `pip3 >= 18.1`):

```sh
pip3 install -U pip --user
```

Install the PopTorch wheel (Torch will automatically be installed in the
process):

```sh
pip3 install ${SDK_PATH}/poptorch-*.whl
```

Once the PopTorch wheel has been installed, the PopTorch Geometric wheel can
be installed if needed (PyTorch Geometric will automatically be installed in
the process):

```sh
pip3 install ${SDK_PATH}/poptorch_geometric-*.whl
```

## Usage

The PopTorch wheel doesn't include the PopART and Poplar binaries, so you need to make sure they are in your path before loading PopTorch or PopTorch Geometric.
This is done by sourcing their respective `enable.sh` scripts:

```sh
. ${SDK_PATH}/poplar-ubuntu_20_04*/enable.sh
. ${SDK_PATH}/popart-ubuntu_20_04*/enable.sh
```

You can check everything is in order by running:

```sh
python3 -c "import poptorch;print(poptorch.__version__)"
```

And similarly for PopTorch Geometric:

```sh
python3 -c "import poptorch_geometric;print(poptorch_geometric.__version__)"
```

More information can be found in the [PopTorch User Guide](https://docs.graphcore.ai/projects/poptorch-user-guide/)

## Build instructions

We use [Mambaforge](https://github.com/conda-forge/miniforge#mambaforge) as the build environment manager.

1. Clone the PopTorch repository

```sh
git clone https://github.com/graphcore/poptorch.git
```

2. Create a folder for your build

```sh
mkdir build
cd build
```

3. Create a build environment and install the dependencies.

```sh
../poptorch/scripts/create_buildenv.py
```

4. Activate the build environment

```sh
. activate_buildenv.sh
```

5. Configure the build

```sh
cmake ../poptorch -DSDK_DIR=${SDK_PATH} -GNinja
```

By default, PopTorch will be built in release mode. To build in debug mode add `-DCMAKE_BUILD_TYPE=Debug`.

To build the documentation, add `-DBUILD_DOCS=ON`. The HTML and PDF documentation will be generated in `docs/`.

6. Compile the PopTorch and PopTorch Geometric libraries

```sh
ninja install
```

If you're only going to use PopTorch or PopTorch Geometric for development purposes then you can stop here.
Source the enable script in the PopTorch build folder and you can start using PopTorch:

```sh
. enable.sh
python3 -c "import poptorch;print(poptorch.__version__)"
```

Similarly for PopTorch Geometric:
```sh
. enable.sh
python3 -c "import poptorch_geometric;print(poptorch_geometric.__version__)"
```

7. (Optional) Build the PopTorch wheel

```sh
ninja poptorch_wheel
```

The wheel will be created in `install/dist`.

8. (Optional) Build the PopTorch Geometric wheel

```sh
ninja poptorch_geometric_wheel
```

The wheel will be created in `install/dist`.

### Run the tests

To run the tests:

```sh
# Run all the tests, print the output only on failure, run 80 tests in parallel
./test.sh -j80
# PopTorch has 3 test labels: examples, short, long. To run all the tests except the long ones:
./test.sh -j80 -LE long
# To run only the short tests
./test.sh -j80 -L short
# Filter the tests by name using -R
./test.sh -j80 -R half_
# For more information:
./test.sh --help
```

Note: If you run the tests in parallel, make sure to tell PopTorch to wait for an IPU to become available if they are all in use:

```sh
export POPTORCH_WAIT_FOR_IPU=1
```

Tests can also be run individually using `pytest`:

```sh
. enable.sh
python3 -m pytest ../poptorch/tests/options_test.py
# add -s to get the whole output
# -k to filter the tests by name
python3 -m pytest ../poptorch/tests/options_test.py -s -k popart
```

Tests specific to Graph Neural Networks are located in the `tests/gnn/` subdirectory:

```sh
. enable.sh
python3 -m pytest ../poptorch/tests/gnn/test_basic_gnn.py
# add -s to get the whole output
# -k to filter the tests by name
python3 -m pytest ../poptorch/tests/gnn/test_basic_gnn.py -s -k GraphSAGE
```

## Feedback / issues

Please create issues [here](https://github.com/graphcore/poptorch/issues)


================================================
FILE: config.buildenv.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

# NOTE(review): `config`, `installers`, `PipRequirements`, `CondaPackages`,
# `Installer`, `os` and `_utils` are not imported in this file; presumably
# they are injected by the build-environment tooling that evaluates it.

# Version pin reused for the "clang-tools" linter package below.
_llvm_version = "13.0.1"

# Default value for the `build_documentation` option tested further down.
config.setDefault(build_documentation=True)

# Requirements files for PopTorch and PopTorch Geometric.
installers.add(PipRequirements("requirements.txt"))

installers.add(PipRequirements("poptorch_geometric/requirements.txt"))

# Core build dependencies, each pinned to a version.
installers.add(
    CondaPackages(
        "boost-cpp=1.72.0",
        "ccache=4.3",
        "cmake=3.18.2",
        "libstdcxx-ng=11.2.0",
        "make=4.3",
        "ninja=1.10.2",
        "pybind11=2.6.1",
        "pyyaml=5.3.1",
        # Mamba overwrites that package, so it must explicitly
        # appear here with the correct version.
        "setuptools=58.0.4",
        "spdlog=1.8.0",
        # Mamba overwrites that package, so it must explicitly
        # appear here with the correct version.
        "typing-extensions=4.1.1",
        # Mamba overwrites that package, so it must explicitly
        # appear here with the correct version.
        "wheel=0.34.2",
        "zip=3.0"))

# Packages only needed when the documentation is built.
if config.build_documentation:
    installers.add(
        CondaPackages(
            "breathe=4.25.1",
            "docutils==0.16",
            "hunspell=1.7.0",
            # Indirect dependency of sphinx which
            # doesn't get automatically installed.
            "jinja2=3.0.3",
            "latexmk=4.55",
            "sphinx=3.3.1",
            "sphinx_rtd_theme=0.5.0",
        ))

# Packages only needed when the linters are requested.
# NOTE(review): no default for `install_linters` is set in this file;
# presumably it is provided by the build-environment tooling.
if config.install_linters:
    installers.add(
        CondaPackages(
            "pre-commit=3.3.3",
            "clang-tools=" + _llvm_version,
            "pylint=2.7.2",
            "yapf=0.27.0",
            # To preserve the comments when updating the schemas
            "ruamel.yaml=0.17.21",
        ))


class DownloadExternalDatasets(Installer):
    """Installer that runs scripts/download_external_datasets.py.

    The datasets are downloaded into an "external_datasets" directory
    located under the prefix of the environment being installed.
    """

    def __init__(self, **kwargs):
        """Locate the downloader script inside the sources directory.

        Raises:
            RuntimeError: if the downloader script does not exist.
        """
        super().__init__(**kwargs)
        self.downloader_path = os.path.join(_utils.sources_dir(), 'scripts',
                                            'download_external_datasets.py')
        if not os.path.exists(self.downloader_path):
            raise RuntimeError(f'Path {self.downloader_path} not exists.')

    def hashString(self):
        """Return the full text of the downloader script.

        NOTE(review): presumably used by the build-environment tooling to
        detect when this installer has changed -- confirm against the
        Installer base class.
        """
        with open(self.downloader_path, "r") as f:
            return f.read()

    def install(self, env):
        """Create the datasets directory and run the downloader script.

        Args:
            env: environment object providing ``prefix`` and
                ``run_commands``.
        """
        datasets_path = os.path.join(env.prefix, "external_datasets")
        # "-p" keeps the step re-runnable: a plain "mkdir" fails when the
        # directory is already present in the environment.
        env.run_commands(f"mkdir -p {datasets_path}",
                         f"python3 {self.downloader_path} {datasets_path}")


installers.add(DownloadExternalDatasets())


================================================
FILE: docs/common/_static/css/custom_rtd.css
================================================
/* Copyright (c) 2020 Graphcore Ltd. All rights reserved.
   Override the sphinx-readthedocs-theme settings */

/*  improve table layout, allowing cells to wrap */
.rst-content table.docutils col {
  width: auto;
}

.wy-table-responsive table td, .wy-table-responsive table th,
.rst-content .wy-table-responsive table td, .rst-content .wy-table-responsive table th {
  white-space: normal;
  word-wrap: normal;
  border: solid 2px #e1e4e5;
}

th.head p {
  margin-bottom: 0;
}

/* make inline code/literal text less ugly */
/*
.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
  color: #565656;
  font-size: 90%;
  padding: 0;
  background: #ffffff;
  border: none;
}
*/
/* give the captions inside the toctree wrapper a sensible size and format
   (the selector below targets .toctree-wrapper p.caption, not image captions) */

.rst-content .toctree-wrapper p.caption {
  font-size: 14px;
  font-weight: 700;
  font-family: "Graphik", "Lato", "Helvetica Neue", Arial, sans-serif;
  text-align: center;
  margin-top: 14px;
}

/* change background colour for code samples */

div.highlight,
.rst-content pre.literal-block,
.rst-content pre.literal-block div[class^='highlight'],
.rst-content div[class^='highlight'],
.rst-content div[class^='highlight'] pre,
.rst-content div[class^='highlight'] div[class^='highlight'],
.rst-content div[class^='highlight'] td.code {
  background-color: white ;
  color: #292c31;
}

/* remove ugly top border from definition lists */

.rst-content dl:not(.docutils) dt {
  border-top: none;
}

/* Style nav menu in Graphcore colours and fonts */

.wy-menu-vertical li.toctree-l1 span.toctree-expand,
.wy-menu-vertical li span.toctree-expand,
.wy-menu-vertical li.on a span.toctree-expand,
.wy-menu-vertical li a.current span.toctree-expand {
  color: white;
  visibility: visible;
  display: inline-block;
  mix-blend-mode: difference;
}

.fa-home:before, .icon-home:before {
  display: none;
}

a.icon,
a.icon:visited,
a.icon:hover,
a.icon-home,
a.icon-home:hover,
a.icon-home:visited {
  color: white;
  font-weight: bold;
}

.btn {
  font-family: "Graphik", "Lato", "proxima-nova", "Helvetica Neue", Arial, sans-serif;
}

input[type="button"], input[type="reset"], input[type="submit"],
input[type="text"], input[type="password"], input[type="email"], input[type="url"], input[type="date"], input[type="month"], input[type="time"], input[type="datetime"], input[type="datetime-local"], input[type="week"], input[type="number"], input[type="search"], input[type="tel"], input[type="color"] {
  font-family: "Graphik", "Lato", "proxima-nova", "Helvetica Neue", Arial, sans-serif;
}

textarea {
  font-family: "Graphik", "Lato", "proxima-nova", "Helvetica Neue", Arial, sans-serif;
}

a:link {
  color: #ff6f79;
}

a:visited {
  color: #ff6f79;
}

a:hover {
  color: #fbc3aa;
}

body {
  font-family: "Graphik", "Lato", "Helvetica Neue", Arial, sans-serif;
  color: #565656;
}

h1, h2, h3, h4, h5, h6, legend {
  font-family: "Graphik", "Lato", "Helvetica Neue", Arial, sans-serif;
  color: #292c31;
}

h1 {
  text-transform: uppercase;
  font-family: "GC Quantized", "Roboto Slab", "ff-tisa-web-pro", "Georgia", Arial, sans-serif;
}

/* use one monospace font stack and text colour for all inline code,
   literal blocks and highlighted code samples */
.rst-content tt.literal, .rst-content code.literal,
.rst-content pre.literal-block, .rst-content .linenodiv pre,
.rst-content div[class^='highlight'] pre,
code, .rst-content tt, .rst-content code,
.rst-content pre, .rst-content kbd, .rst-content samp,
footer span.commit code, footer span.commit .rst-content tt, .rst-content footer span.commit tt {
  font-family: Consolas, "Andale Mono WT", "Andale Mono", "Lucida Console", "Lucida Sans Typewriter", "DejaVu Sans Mono", "Bitstream Vera Sans Mono", "Liberation Mono", "Nimbus Mono L", Monaco, "Courier New", Courier, monospace;
  color: #292c31;
}

.wy-menu-vertical li.toctree-l1 >a,
.wy-menu-vertical li.toctree-l1 >a:visited {
    color: #e9e9e9 !important;
}

.wy-menu-vertical li.toctree-l1.current a,
.wy-menu-vertical li a {
    color: #292c31 !important;
}

.wy-menu-vertical a:hover {
  background: #ff6f79 !important;
  color: #fff  !important;
}

.wy-menu,
.wy-menu-vertical,
.wy-nav-side,
.wy-side-nav-search {
  background-color: #292c31;
}

.wy-side-nav-search>div.version {
  color: #fff;
  text-align: left;
  padding: 0 .75em;
}

.rst-content .sidebar .sidebar-title {
  font-family: "GC Quantized", "Roboto Slab", "ff-tisa-web-pro", "Georgia", Arial, sans-serif;
}

.rst-content .footnote-reference, .rst-content .citation-reference {
  vertical-align: super;
}

.rst-content table.docutils.citation, .rst-content table.docutils.footnote {
  color: #999;
}

.wy-breadcrumbs {
  display: none;
}

footer p {
  color: #565656;
  font-size: 15px;
}


================================================
FILE: docs/common/conf.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
# Configuration file for the Sphinx documentation builder.
#
# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import pathlib
import json
import sys
import datetime
sys.path.insert(0, os.path.abspath('.'))

# -- Project information -----------------------------------------------------

project = 'Project'
author = 'Graphcore Ltd'

# The full version, including alpha/beta/rc tags
# Looks like html uses 'version' and latex uses 'release'
version = 'v0.0.0'

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.graphviz',
    'sphinx.ext.autodoc',
    'sphinx.ext.extlinks',
]


def get_current_release() -> str:
    """Return the release as "<major>.<minor>", read from version.json.

    version.json is looked up in the directory given by ``parents[2]`` of
    this file's path.

    Raises:
        OSError: if version.json cannot be opened.
        KeyError: if the JSON object has no "major" or "minor" entry.
    """
    version_file = pathlib.Path(__file__).parents[2].resolve() / "version.json"
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(version_file, encoding="utf-8") as f:
        version_info = json.load(f)
    # Extra keys in version.json are ignored by str.format.
    return "{major}.{minor}".format(**version_info)


# Release string used to select the matching "sdk-release-*" branch in the
# link targets below.
SDK_RELEASE = get_current_release()
# Settings for the sphinx.ext.extlinks extension enabled above: each key
# maps to a (URL template, second element) tuple, where "%s" in the
# template is the placeholder for the path. The second element is None for
# both entries.
extlinks = {
    'tutorials-repo':
    (f'https://github.com/graphcore/tutorials/tree/sdk-release-{SDK_RELEASE}/%s',
     None),
    'github-poptorch':
    (f'https://github.com/graphcore/poptorch/tree/sdk-release-{SDK_RELEASE}/%s',
     None),
}

# Add any paths that contain templates here, relative to this directory.
templates_path = []

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []

# (type, target) pairs of references that are deliberately left unresolved.
# NOTE(review): Sphinx only consults this list when nitpicky mode is
# enabled; that setting is not visible in this file -- confirm where it is
# turned on.
nitpick_ignore = [
    ('py:class', 'bool'),
    ('py:class', 'dict'),
    ('py:class', 'int'),
    ('py:class', 'iterable'),
    ('py:class', 'optional'),
    ('py:class', 'str'),
    ('py:class', 'T_co'),
    ('py:class', 'datetime.timedelta'),
    ('py:class', 'torch.Tensor'),
    ('py:class', 'torch.dtype'),
    ('py:class', 'torch.nn.Module'),
    ('py:class', 'torch.optim.Optimizer'),
    ('py:class', 'torch.optim.optimizer.Optimizer'),
    ('py:class', 'torch.utils.data.Dataset'),
    ('py:class', 'torch.utils.data.sampler.Sampler'),
    # Enums already described in functions that use them
    ('py:class', 'poptorch.OutputMode'),
    ('py:class', 'poptorch.ConnectionType'),
    ('py:class', 'poptorch.HalfFloatCastingBehavior'),
    ('py:class', 'poptorch.MatMulSerializationMode'),
    ('py:class', 'poptorch.OverlapMode'),
    ('py:class', 'poptorch.ReductionType'),
    ('py:class', 'poptorch.SyncPattern'),
    ('py:class', 'poptorch.MeanReductionStrategy'),
    # Type hints
    ('py:data', 'typing.Optional'),
    ('py:data', 'typing.Callable'),
    ('py:class', 'typing.ForwardRef'),
]

# Define abbreviations for IPU-PODn names with subscripts
# These use non-breaking hyphens & spaces, so be careful if editing or adding new definitions
pod_sizes = [2**i for i in range(2, 14)] + ["N"]
pod_definitions = [
    f".. |POD{i}| replace:: IPU‑POD\\ :subscript:`{i}`" for i in pod_sizes
]
bow_definitions = [
    f".. |BOW{i}| replace:: Bow Pod\\ :subscript:`{i}`" for i in pod_sizes
]

rst_epilog = ("\n".join(pod_definitions) + "\n" + "\n".join(bow_definitions) +
              r"""
.. role:: raw-html(raw)
    :format: html

.. |POD| replace:: IPU‑POD
.. |BOW| replace:: Bow Pod
.. |newpage| raw:: latex

  \newpage

.. |LEGAL:EULA| replace::
    This software is made available under the terms of the `Graphcore End User
    License Agreement (EULA) <https://docs.graphcore.ai/en/latest/eula.html>`__
    and the
    `Graphcore Container License Agreement <https://docs.graphcore.ai/projects/container-license/en/latest/>`__.
    Please ensure you have read and accept the terms of the corresponding
    license before using the software. The Graphcore EULA applies unless
    indicated otherwise.

.. |LEGAL:TRADEMARkS| replace::
    Graphcloud®, Graphcore®, Poplar® and PopVision® are registered trademarks
    of Graphcore Ltd.
    :raw-html:`<br></br>`
    Bow™,
    Bow-2000™,
    Bow Pod™,
    Colossus™,
    In-Processor-Memory™,
    IPU-Core™,
    IPU-Exchange™,
    IPU-Fabric™,
    IPU-Link™,
    IPU-M2000™,
    IPU-Machine™,
    IPU-POD™,
    IPU-Tile™,
    PopART™,
    PopDist™,
    PopLibs™,
    PopRun™,
    PopTorch™,
    Streaming Memory™
    and Virtual-IPU™
    are trademarks of Graphcore Ltd.
    :raw-html:`<br></br>`
    All other trademarks are the property of their respective owners.

""" + f".. |YEAR| replace:: {datetime.date.today().year}" + "\n" +
              f".. |SDK_RELEASE| replace:: {SDK_RELEASE}"
              "\n")

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

html_theme_options = {'logo_only': False, 'navigation_depth': 4}

numfig = True

numfig_format = {
    'section': 'Section {number}, {name}',
    'figure': 'Fig. %s',
    'table': 'Table %s',
    'code-block': 'Listing %s'
}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# CSS file to create the Graphcore style
html_css_files = [
    'css/custom_rtd.css',
]

# The name for this set of Sphinx documents.  If None, it defaults to
# "<project> v<release> documentation".
html_title = 'Document Title'

# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
html_logo = 'graphcorelogo-html.png'

# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
html_use_smartypants = True

# If true, links to the reST sources are added to the pages.
html_show_sourcelink = False

# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
html_show_sphinx = False

# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
html_show_copyright = False

# -- Options for LaTeX output ---------------------------------------------
# Don't know how much of this is necessary. It's a bit of a mess.

# pifont required for tick and cross characters
# use array stretch to get taller table rows. Also consider using extrarowheight
# \\setlength{\\extrarowheight}{1pt}

ADDITIONAL_PREAMBLE = r"""
\setcounter{secnumdepth}{5}
\setcounter{tocdepth}{5}

\usepackage{threeparttable}
\usepackage{pifont}
\usepackage{array}
\usepackage{charter}
\usepackage[defaultsans]{lato}
\usepackage{inconsolata}
\usepackage{listings}
\usepackage{verbatim}
\usepackage{multicol}
\usepackage{float}
\usepackage{fancyhdr}
%% Obtain access to ssmall font size
\usepackage[10pt]{moresize}

%% adjustbox used to set max width and height for images
\usepackage[export]{adjustbox}

%% Define a right-justified table column type
\usepackage{ragged2e}
\newcolumntype{R}[1]{>{\RaggedLeft\arraybackslash}p{#1}}

\renewcommand{\arraystretch}{1.4}

\usepackage{booktabs}
\usepackage{graphicx}

%% Push footnotes to the bottom of the page
\usepackage[bottom]{footmisc}

\usepackage{pdfpages}

\usepackage{pdflscape}

\usepackage{transparent}
\usepackage[normalem]{ulem}

%% Watermark stuff
\usepackage{draftwatermark}
\SetWatermarkFontSize{2cm}
\SetWatermarkColor[gray]{0.96}
\SetWatermarkText{}
\SetWatermarkScale{2}
\SetWatermarkAngle{30}

%% Ensure API descriptions are all tt family
\let\fulllineitemsOld\fulllineitems
\let\endfulllineitemsOld\endfulllineitems
\renewenvironment{fulllineitems}{\ttfamily\small\fulllineitemsOld}{\endfulllineitemsOld}

%% Change the Sphinx verbatim to not put the box around it and to indent
\renewcommand{\Verbatim}[1][1]{%
  % list starts new par, but we don't want it to be set apart vertically
  \bgroup\parskip=0pt%
  \medskip
  % The list environment is needed to control perfectly the vertical
  % space.
  \list{}{%
  \setlength\parskip{0pt}%
  \setlength\itemsep{0ex}%
  \setlength\topsep{0ex}%
  \setlength\partopsep{0pt}%
  \setlength\leftmargin{0pt}%
  }%
  \OriginalVerbatim[#1,xleftmargin=0.5cm,formatcom=\normalsize]%
}
\renewcommand{\endVerbatim}{%
    \endOriginalVerbatim%
  \endlist%
  % close group to restore \parskip
  \egroup%
}

\newcommand{\VerbBorders}{%
  \renewcommand{\Verbatim}[1][1]{%
    % list starts new par, but we don't want it to be set apart vertically
    \bgroup\parskip=0pt%
    \smallskip%
    % The list environment is needed to control perfectly the vertical
    % space.
    \list{}{%
      \setlength\parskip{0pt}%
      \setlength\itemsep{0ex}%
      \setlength\topsep{0ex}%
      \setlength\partopsep{0pt}%
      \setlength\leftmargin{0pt}%
    }%
    \item\MakeFramed {\FrameRestore}%
    \small%
    \OriginalVerbatim[##1]%
  }
  \renewcommand{\endVerbatim}{%
    \endOriginalVerbatim%
    \endMakeFramed%
    \endlist%
    % close group to restore \parskip
    \egroup%
  }
  \definecolor{VerbatimColor}{rgb}{0.95,0.95,0.95}
  \definecolor{VerbatimBorderColor}{rgb}{1.0,1.0,1.0}
}

\makeatletter
\DeclareTextCommandDefault{\textleftarrow}{\mbox{$\m@th\leftarrow$}}
\makeatother
"""

ADDITIONAL_PREAMBLE += r"""
%% Redefine sphinxstylethead (used only for table headers) to bold font
\usepackage{letltxmacro}
\LetLtxMacro{\oldtextsf}{\sphinxstyletheadfamily}
\renewcommand{\sphinxstyletheadfamily}[0]{\oldtextsf \bf }
"""

ADDITIONAL_PREAMBLE += r"""
\makeatletter
  \fancypagestyle{normal}{
    \fancyhf{}
    \fancyfoot[RE,RO]{{\py@HeaderFamily\thepage}}
    \fancyfoot[LE,LO]{%(footer)s}
    \renewcommand{\headrulewidth}{0.4pt}
    \renewcommand{\footrulewidth}{0.4pt}
  }
  \fancypagestyle{plain}{
    \fancyhf{}
    \fancyfoot[RE,RO]{{\py@HeaderFamily\thepage}}
    \fancyfoot[LE,LO]{%(footer)s}
    \renewcommand{\headrulewidth}{0pt}
    \renewcommand{\footrulewidth}{0.4pt}
  }
\makeatother
""" % {
    'footer': ''
}

# From Sphinx 1.5 onwards, there are certain macros which are used which became
# deprecated (e.g. \code). These macros should be upgraded in the future so
# that we can move away from using the old macro names.
latex_keep_old_macro_names = True

latex_elements = {
    # Options to pass to packages
    'passoptionstopackages':
    r'\PassOptionsToPackage{dvipsnames, table, xcdraw}{xcolor}',

    # Set up margins for geometry
    'sphinxsetup': 'hmargin={0.75in, 0.75in}, vmargin={0.75in, 0.75in}',

    # The paper size ('letterpaper' or 'a4paper').
    'papersize': 'a4paper',

    # Single sided to save paper and improve display
    'extraclassoptions': 'openany,oneside',

    # The font size ('10pt', '11pt' or '12pt').
    'pointsize': '10pt',

    # Disable figure floating
    'figure_align': 'H',

    # Additional stuff for the LaTeX preamble.
    'preamble': ADDITIONAL_PREAMBLE,
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_class = 'manual'
# Title of the LaTeX document: the DOC_TITLE environment variable when it
# is set, otherwise a placeholder.
latex_title = os.environ.get('DOC_TITLE', "Document title")
latex_documents = [
    ('index', 'doc.tex', latex_title, author, latex_class),
]

# The name of an image file (relative to this directory) to place at the top of
# the title page.
latex_logo = 'graphcorelogo-pdf.png'

# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
latex_use_parts = False

# If true, show page references after internal links.
latex_show_pagerefs = False

# If true, show URL addresses after external links.
# latex_show_urls = False

# Documents to append as an appendix to all manuals.
# latex_appendices = []

# If false, no module index is generated.
latex_domain_indices = False

# https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html
autodoc_default_options = {
    'undoc-members': True,
}
autodoc_inherit_docstrings = True

autodoc_typehints = 'description'


================================================
FILE: docs/common/custom_dic
================================================
accessor
AdamW
AMSGrad
AsyncRebatched
autograd
backend
booleans
bwd
checkpointed
checkpointing
codepaths
config
connectionist
const
constness
CTC
dict
EOF
float16
float32
FP16
InputChannels
ints
IO
ipu
IPU
IPUs
iterable
L2
libpvti
matmul
Mk1
Mk2
mpirun
Nesterov
num
OpenMPI
OutputChannels
PopART
PopART's
PopDist
PopLibs
PopRun
PopTorch
precompile
pvti
PyTorch
PyTorch's
rebatch
rebatched
rebatching
recomputation
ReducingDim
replan
RMSprop
RTS
serializable
SGD
sharded
sharding
stdout
str
submodules
TODO
tracepoint
tracepoints
tracepointsints
unrounded
unroundedPopRunsubmodules
bool


================================================
FILE: docs/poptorch_geometric/common/conf.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
# Configuration file for the Sphinx documentation builder.
#
# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import pathlib
import json
import sys
import datetime
sys.path.insert(0, os.path.abspath('.'))

# -- Project information -----------------------------------------------------

project = 'Project'
author = 'Graphcore Ltd'

# The full version, including alpha/beta/rc tags
# Looks like html uses 'version' and latex uses 'release'
version = 'v0.0.0'

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.napoleon',
    'sphinx.ext.graphviz',
    'sphinx.ext.autodoc',
    'sphinx.ext.extlinks',
]


def get_current_release() -> str:
    """Return the release as "<major>.<minor>", read from version.json.

    version.json is looked up in the directory given by ``parents[3]`` of
    this file's path.

    Raises:
        OSError: if version.json cannot be opened.
        KeyError: if the JSON object has no "major" or "minor" entry.
    """
    version_file = pathlib.Path(__file__).parents[3].resolve() / "version.json"
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(version_file, encoding="utf-8") as f:
        version_info = json.load(f)
    # Extra keys in version.json are ignored by str.format.
    return "{major}.{minor}".format(**version_info)


# Release string used to select the matching "sdk-release-*" branch in the
# link targets below.
SDK_RELEASE = get_current_release()
# Settings for the sphinx.ext.extlinks extension enabled above: each key
# maps to a (URL template, second element) tuple, where "%s" in the
# template is the placeholder for the path. The second element is None for
# both entries.
extlinks = {
    'tutorials-repo':
    (f'https://github.com/graphcore/tutorials/tree/sdk-release-{SDK_RELEASE}/%s',
     None),
    'github-poptorch':
    (f'https://github.com/graphcore/poptorch/tree/sdk-release-{SDK_RELEASE}/%s',
     None),
}

# Add any paths that contain templates here, relative to this directory.
templates_path = []

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []

# (type, target) pairs of references that are deliberately left unresolved.
# Each pair is listed exactly once; the order of entries is not significant.
nitpick_ignore = [
    # 'py:obj' targets
    ('py:obj', 'num_nodes * (num_nodes - 1)'),
    ('py:obj', '0.0'),
    ('py:obj', '1'),
    ('py:obj', '2'),
    ('py:obj', 'graphs_mask'),
    ('py:obj', 'nodes_mask'),
    ('py:obj', 'edges_mask'),
    ('py:obj', 'True'),
    ('py:obj', 'False'),
    ('py:obj', 'num_nodes'),
    ('py:obj', 'num_edges'),
    ('py:obj', 'num_graphs'),
    ('py:obj', 'None'),
    ('py:obj', 'data_source'),
    ('py:obj', 'follow_batch'),
    ('py:obj', 'exclude_keys'),
    ('py:obj', 'batch_sampler'),
    ('py:obj', 'shuffle'),
    ('py:obj', 'Data'),
    # Built-in and unqualified type names
    ('py:class', 'bool'),
    ('py:class', 'dict'),
    ('py:class', 'int'),
    ('py:class', 'iterable'),
    ('py:class', 'optional'),
    ('py:class', 'str'),
    ('py:class', 'T_co'),
    ('py:class', 'Dataset'),
    ('py:class', 'ClusterData'),
    ('py:class', 'Sampler'),
    ('py:class', '..'),
    # torch and torch_geometric classes
    ('py:class', 'torch.Tensor'),
    ('py:class', 'torch.utils.data.dataset.Dataset'),
    ('py:class', 'torch.utils.data.sampler.Sampler'),
    ('py:class', 'torch_geometric.data.Batch'),
    ('py:class', 'torch_geometric.data.Data'),
    ('py:class', 'torch_geometric.data.Dataset'),
    ('py:class', 'torch_geometric.data.dataset.Dataset'),
    ('py:class', 'torch_geometric.data.HeteroData'),
    ('py:class', 'torch_geometric.loader.cluster.ClusterData'),
    ('py:class', 'torch_geometric.loader.ClusterData'),
    ('py:class', 'torch.dtype'),
    ('py:class', 'torch.nn.Module'),
    ('py:class', 'torch.optim.Optimizer'),
    ('py:class', 'torch.optim.optimizer.Optimizer'),
    ('py:class', 'torch.utils.data.Dataset'),
    # poptorch classes
    ('py:class', 'poptorch.AsynchronousDataAccessor'),
    ('py:class', 'poptorch.DataLoader'),
    ('py:class', 'poptorch.Dataset'),
    ('py:class', 'poptorch.Options'),
    # Enums already described in functions that use them
    ('py:class', 'poptorch.OutputMode'),
    ('py:class', 'poptorch.ConnectionType'),
    ('py:class', 'poptorch.HalfFloatCastingBehavior'),
    ('py:class', 'poptorch.MatMulSerializationMode'),
    ('py:class', 'poptorch.OverlapMode'),
    ('py:class', 'poptorch.ReductionType'),
    ('py:class', 'poptorch.SyncPattern'),
    ('py:class', 'poptorch.MeanReductionStrategy'),
    # Type hints
    ('py:data', 'typing.Optional'),
    ('py:data', 'typing.Callable'),
    ('py:class', 'typing.ForwardRef'),
]

# Define abbreviations for IPU-PODn names with subscripts
# These use non-breaking hyphens & spaces, so be careful if editing or adding new definitions
pod_sizes = [2**i for i in range(2, 14)] + ["N"]
pod_definitions = [
    f".. |POD{i}| replace:: IPU‑POD\\ :subscript:`{i}`" for i in pod_sizes
]
bow_definitions = [
    f".. |BOW{i}| replace:: Bow Pod\\ :subscript:`{i}`" for i in pod_sizes
]

rst_epilog = ("\n".join(pod_definitions) + "\n" + "\n".join(bow_definitions) +
              r"""
.. role:: raw-html(raw)
    :format: html

.. |POD| replace:: IPU‑POD
.. |BOW| replace:: Bow Pod
.. |newpage| raw:: latex

  \newpage

.. |LEGAL:EULA| replace::
    This software is made available under the terms of the `Graphcore End User
    License Agreement (EULA) <https://docs.graphcore.ai/en/latest/eula.html>`__
    and the
    `Graphcore Container License Agreement <https://docs.graphcore.ai/projects/container-license/en/latest/>`__.
    Please ensure you have read and accept the terms of the corresponding
    license before using the software. The Graphcore EULA applies unless
    indicated otherwise.

.. |LEGAL:TRADEMARkS| replace::
    Graphcloud®, Graphcore®, Poplar® and PopVision® are registered trademarks
    of Graphcore Ltd.
    :raw-html:`<br></br>`
    Bow™,
    Bow-2000™,
    Bow Pod™,
    Colossus™,
    In-Processor-Memory™,
    IPU-Core™,
    IPU-Exchange™,
    IPU-Fabric™,
    IPU-Link™,
    IPU-M2000™,
    IPU-Machine™,
    IPU-POD™,
    IPU-Tile™,
    PopART™,
    PopDist™,
    PopLibs™,
    PopRun™,
    PopTorch™,
    Streaming Memory™
    and Virtual-IPU™
    are trademarks of Graphcore Ltd.
    :raw-html:`<br></br>`
    All other trademarks are the property of their respective owners.

""" + f".. |YEAR| replace:: {datetime.date.today().year}" + "\n" +
              f".. |SDK_RELEASE| replace:: {SDK_RELEASE}"
              "\n")

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

html_theme_options = {'logo_only': False, 'navigation_depth': 4}

numfig = True

numfig_format = {
    'section': 'Section {number}, {name}',
    'figure': 'Fig. %s',
    'table': 'Table %s',
    'code-block': 'Listing %s'
}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['../../common/_static']

# CSS file to create the Graphcore style
html_css_files = [
    'css/custom_rtd.css',
]

# The name for this set of Sphinx documents.  If None, it defaults to
# "<project> v<release> documentation".
html_title = 'Document Title'

# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
html_logo = '../../common/graphcorelogo-html.png'

# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
html_use_smartypants = True

# If true, links to the reST sources are added to the pages.
html_show_sourcelink = False

# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
html_show_sphinx = False

# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
html_show_copyright = False

# -- Options for LaTeX output ---------------------------------------------
# Don't know how much of this is necessary. It's a bit of a mess.

# pifont required for tick and cross characters
# use array stretch to get taller table rows. Also consider using extrarowheight
# \\setlength{\\extrarowheight}{1pt}

ADDITIONAL_PREAMBLE = r"""
\setcounter{secnumdepth}{5}
\setcounter{tocdepth}{5}

\usepackage{threeparttable}
\usepackage{pifont}
\usepackage{array}
\usepackage{charter}
\usepackage[defaultsans]{lato}
\usepackage{inconsolata}
\usepackage{listings}
\usepackage{verbatim}
\usepackage{multicol}
\usepackage{float}
\usepackage{fancyhdr}
%% Obtain access to ssmall font size
\usepackage[10pt]{moresize}

%% adjustbox used to set max width and height for images
\usepackage[export]{adjustbox}

%% Define a right-justified table column type
\usepackage{ragged2e}
\newcolumntype{R}[1]{>{\RaggedLeft\arraybackslash}p{#1}}

\renewcommand{\arraystretch}{1.4}

\usepackage{booktabs}
\usepackage{graphicx}

%% Push footnotes to the bottom of the page
\usepackage[bottom]{footmisc}

\usepackage{pdfpages}

\usepackage{pdflscape}

\usepackage{transparent}
\usepackage[normalem]{ulem}

%% Watermark stuff
\usepackage{draftwatermark}
\SetWatermarkFontSize{2cm}
\SetWatermarkColor[gray]{0.96}
\SetWatermarkText{}
\SetWatermarkScale{2}
\SetWatermarkAngle{30}

%% Ensure API descriptions are all tt family
\let\fulllineitemsOld\fulllineitems
\let\endfulllineitemsOld\endfulllineitems
\renewenvironment{fulllineitems}{\ttfamily\small\fulllineitemsOld}{\endfulllineitemsOld}

%% Change the Sphinx verbatim to not put the box around it and to indent
\renewcommand{\Verbatim}[1][1]{%
  % list starts new par, but we don't want it to be set apart vertically
  \bgroup\parskip=0pt%
  \medskip
  % The list environment is needed to control perfectly the vertical
  % space.
  \list{}{%
  \setlength\parskip{0pt}%
  \setlength\itemsep{0ex}%
  \setlength\topsep{0ex}%
  \setlength\partopsep{0pt}%
  \setlength\leftmargin{0pt}%
  }%
  \OriginalVerbatim[#1,xleftmargin=0.5cm,formatcom=\normalsize]%
}
\renewcommand{\endVerbatim}{%
    \endOriginalVerbatim%
  \endlist%
  % close group to restore \parskip
  \egroup%
}

\newcommand{\VerbBorders}{%
  \renewcommand{\Verbatim}[1][1]{%
    % list starts new par, but we don't want it to be set apart vertically
    \bgroup\parskip=0pt%
    \smallskip%
    % The list environment is needed to control perfectly the vertical
    % space.
    \list{}{%
      \setlength\parskip{0pt}%
      \setlength\itemsep{0ex}%
      \setlength\topsep{0ex}%
      \setlength\partopsep{0pt}%
      \setlength\leftmargin{0pt}%
    }%
    \item\MakeFramed {\FrameRestore}%
    \small%
    \OriginalVerbatim[##1]%
  }
  \renewcommand{\endVerbatim}{%
    \endOriginalVerbatim%
    \endMakeFramed%
    \endlist%
    % close group to restore \parskip
    \egroup%
  }
  \definecolor{VerbatimColor}{rgb}{0.95,0.95,0.95}
  \definecolor{VerbatimBorderColor}{rgb}{1.0,1.0,1.0}
}

\makeatletter
\DeclareTextCommandDefault{\textleftarrow}{\mbox{$\m@th\leftarrow$}}
\makeatother
"""

# Append a LaTeX fragment that makes table header cells bold: the current
# \sphinxstyletheadfamily definition is saved as \oldtextsf, then the macro is
# redefined to the saved definition followed by \bf.
ADDITIONAL_PREAMBLE += r"""
%% Redefine sphinxstylethead (used only for table headers) to bold font
\usepackage{letltxmacro}
\LetLtxMacro{\oldtextsf}{\sphinxstyletheadfamily}
\renewcommand{\sphinxstyletheadfamily}[0]{\oldtextsf \bf }
"""

# Append the footer layout for the "normal" and "plain" page styles: the page
# number goes in the right footer and the `footer` text in the left footer.
# "normal" also draws a 0.4pt header rule, "plain" sets the header rule to 0pt.
# This fragment is passed through Python %-formatting to fill in %(footer)s
# (currently an empty string), so any literal percent sign added to this
# string would have to be written as %%.
ADDITIONAL_PREAMBLE += r"""
\makeatletter
  \fancypagestyle{normal}{
    \fancyhf{}
    \fancyfoot[RE,RO]{{\py@HeaderFamily\thepage}}
    \fancyfoot[LE,LO]{%(footer)s}
    \renewcommand{\headrulewidth}{0.4pt}
    \renewcommand{\footrulewidth}{0.4pt}
  }
  \fancypagestyle{plain}{
    \fancyhf{}
    \fancyfoot[RE,RO]{{\py@HeaderFamily\thepage}}
    \fancyfoot[LE,LO]{%(footer)s}
    \renewcommand{\headrulewidth}{0pt}
    \renewcommand{\footrulewidth}{0.4pt}
  }
\makeatother
""" % {
    'footer': ''
}

# From Sphinx 1.5 onwards, some of the macros used here became deprecated
# (e.g. \code). Keep the old macro names available for now; the macros should
# be upgraded in the future so that this setting can be dropped.
latex_keep_old_macro_names = True

# Settings for the LaTeX builder, keyed by the element names Sphinx
# substitutes into its LaTeX template.
latex_elements = {
    # Options passed to the xcolor package
    'passoptionstopackages':
    r'\PassOptionsToPackage{dvipsnames, table, xcdraw}{xcolor}',

    # Page margins for geometry: 0.75in on every side
    'sphinxsetup': 'hmargin={0.75in, 0.75in}, vmargin={0.75in, 0.75in}',

    # The paper size ('letterpaper' or 'a4paper').
    'papersize': 'a4paper',

    # Single sided to save paper and improve display
    'extraclassoptions': 'openany,oneside',

    # The font size ('10pt', '11pt' or '12pt').
    'pointsize': '10pt',

    # Disable figure floating
    'figure_align': 'H',

    # Extra LaTeX preamble, assembled above in ADDITIONAL_PREAMBLE.
    'preamble': ADDITIONAL_PREAMBLE,
}

# LaTeX output documents. Each entry is a tuple of
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
# The title is taken from the DOC_TITLE environment variable, with a
# placeholder used when the variable is not set.
latex_class = 'manual'
latex_title = os.environ.get('DOC_TITLE', "Document title")
latex_documents = [('index', 'doc.tex', latex_title, author, latex_class)]

# The name of an image file (relative to this directory) to place at the top of
# the title page.
latex_logo = '../../common/graphcorelogo-pdf.png'

# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
# NOTE(review): newer Sphinx releases appear to replace this option with
# latex_toplevel_sectioning -- confirm against the Sphinx version in use.
latex_use_parts = False

# If true, show page references after internal links.
latex_show_pagerefs = False

# If true, show URL addresses after external links.
# latex_show_urls = False

# Documents to append as an appendix to all manuals.
# latex_appendices = []

# If false, no module index is generated.
latex_domain_indices = False

# Defaults applied to every autodoc directive; see
# https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html
# 'undoc-members' asks autodoc to also list members without a docstring.
autodoc_default_options = {
    'undoc-members': True,
}
# Let members without their own docstring use the one from a parent class.
autodoc_inherit_docstrings = True

# Render type hints in the description text rather than in the signature.
autodoc_typehints = 'description'


================================================
FILE: docs/poptorch_geometric/user_guide/index.rst
================================================
PyTorch Geometric for the IPU: User Guide
=========================================

.. toctree::
    :maxdepth: 4
    :numbered: 3

    intro
    installation
    performance
    tutorials
    supported_operations
    reference
    legal


================================================
FILE: docs/poptorch_geometric/user_guide/installation.rst
================================================
.. _installation:

============
Installation
============

PopTorch Geometric is included as part of the Poplar SDK (see the `Getting
Started guide
<https://docs.graphcore.ai/en/latest/getting-started.html#getting-started>`_ for
your system for how to install the Poplar SDK). PopTorch Geometric is packaged
as a Python wheel file that can be installed using ``pip``. PopTorch Geometric
requires the installation of PopTorch, which is also a part of the Poplar SDK.

To use PopTorch Geometric you must first install the PopTorch wheel
and then the PopTorch Geometric wheel. All the necessary
dependencies (including ``torch`` and ``pytorch_geometric``) will be installed
automatically.


.. important:: pip >= 18.1 is required for PopTorch dependencies to be
    installed properly.

To update ``pip``:

.. code-block:: bash

    $ pip install -U pip


Version compatibility
~~~~~~~~~~~~~~~~~~~~~

PopTorch Geometric and PopTorch wheels should always come from the same Poplar
SDK version to guarantee version compatibility.

The following are the corresponding ``poptorch_geometric``, ``pytorch_geometric``,
``torch``, ``torchvision`` and ``torchaudio`` versions and supported Python versions.

+------------------------+-----------------------+-------------+-----------------+----------------+------------+
| ``poptorch_geometric`` | ``pytorch_geometric`` |  ``torch``  | ``torchvision`` | ``torchaudio`` | ``python`` |
+========================+=======================+=============+=================+================+============+
|          3.3           |   2.4.0.dev20230613   |    2.0.1    |      0.15.2     |      2.0.1     |   >= 3.8   |
+------------------------+-----------------------+-------------+-----------------+----------------+------------+
|          3.2           |   2.3.0.dev20230222   |    1.13.1   |      0.14.1     |      0.13.1    |   >= 3.7   |
+------------------------+-----------------------+-------------+-----------------+----------------+------------+

.. note:: To ensure version compatibility, ``torchvision`` and ``torchaudio`` are automatically installed with PopTorch in Poplar SDK 3.3 and later.

Installation using Python virtual environment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We recommend creating and activating a virtual environment to isolate your
PopTorch Geometric environment from the system Python environment. You can use
the Python ``virtualenv`` tool for this.

.. code-block:: bash

    $ virtualenv -p python3 poptorch_test
    $ source poptorch_test/bin/activate

After activating the virtual environment, you need to first install the PopTorch wheel.

.. code-block:: bash

    $ pip install <sdk_path>/poptorch-x.x.x.whl

where ``<sdk_path>`` is the location of the Poplar SDK on your system.

See the
`PopTorch installation guide <https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/installation.html>`_
for more information on installing the PopTorch wheel.

Then, install the PopTorch Geometric wheel:

.. code-block:: bash

    # Enable the Python environment containing PopTorch (if not already enabled)
    $ source poptorch_test/bin/activate
    $ pip install <sdk_path>/poptorch_geometric-x.x.x.whl

where ``<sdk_path>`` is the location of the Poplar SDK on your system.

Setting the environment variables
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The PopART and Poplar runtime libraries are required to use PopTorch Geometric,
so you will need to set the library search paths, using the scripts provided
in the SDK:

.. code-block:: bash

    # Enable the Python environment containing PopTorch (if not already enabled)
    $ source poptorch_test/bin/activate

    # Add the Poplar and PopART runtime libraries to the search path
    $ source <sdk_path>/poplar-ubuntu_<os_ver>-<poplar_ver>+<build>/enable.sh
    $ source <sdk_path>/popart-ubuntu_<os_ver>-<poplar_ver>+<build>/enable.sh

where ``<sdk_path>`` is the location of the Poplar SDK on your system, ``<os_ver>`` is the version of Ubuntu on your system, ``<poplar_ver>`` is the software version number of the Poplar SDK and ``<build>`` is the build information.


Validating the setup
~~~~~~~~~~~~~~~~~~~~

In order to validate that everything is installed correctly in your
environment, you can run the following commands and see if they execute without
an exception and the displayed version matches the packages that you installed:

.. code-block:: bash

    $ python -c "import poptorch;print(poptorch.__version__)"
    $ python -c "import poptorch_geometric;print(poptorch_geometric.__version__)"


================================================
FILE: docs/poptorch_geometric/user_guide/intro.rst
================================================
============
Introduction
============

.. admonition:: Experimental Release

    This is an experimental release of PopTorch Geometric. Not all features of PyTorch Geometric are supported, and some functions may not work as expected. The implementation may change without warning in future releases in ways that are not backwards compatible.

PopTorch Geometric is a set of extensions for PyTorch Geometric, enabling Graph
Neural Network models to be trained, evaluated and used on Graphcore IPU
hardware.

PopTorch Geometric has been designed to require as few changes as possible to
your models to run on the IPU.
However, it does have some differences from native PyTorch Geometric execution,
in order to get the most out of IPU hardware.

PopTorch Geometric depends on the functionality provided by PopTorch.

PopTorch and PopTorch Geometric are included in the `Poplar SDK <https://docs.graphcore.ai/projects/sdk-overview/>`__. See the `Getting Started guide <https://docs.graphcore.ai/en/latest/getting-started.html#getting-started>`_ for your system for how to
install the Poplar SDK. Refer to :numref:`installation` for how to install the PopTorch and PopTorch Geometric wheels.


================================================
FILE: docs/poptorch_geometric/user_guide/legal.rst
================================================
Legal notices
=============

|LEGAL:TRADEMARKS|

|LEGAL:EULA|

© Copyright 2023 Graphcore Ltd. All rights reserved.


================================================
FILE: docs/poptorch_geometric/user_guide/performance.rst
================================================
======================
Optimizing performance
======================

PopTorch Geometric is an extension of PyTorch Geometric allowing models to
fully utilize the IPU hardware and provide the best performance. To achieve
that, PopTorch Geometric uses PopTorch functionality. PopTorch Geometric is
designed in such a way that users can run PyTorch Geometric models with the
least amount of changes to the code and exploit the high performance of IPU
systems.

When working with the IPU, it is always recommended to use fixed-size tensors.
This allows for the static compilation of the Poplar programs and using the same
programs for all the iterations of training and/or inference. This constraint
is not always met when working with Graph Neural Networks because graphs
processed in subsequent iterations can have different numbers of nodes and/or
edges, which results in tensors of different shapes. PopTorch Geometric provides
ways to satisfy this constraint and reach the best performance.

Currently, there are two ways to ensure that all the tensors have fixed
shapes---using either the
`Pad <https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.transforms.Pad.html#torch_geometric.transforms.Pad>`_
transformation with a data loader or the fixed-size data loaders.

.. important:: When working with the IPU, it is required to always use the data
    loader from PopTorch Geometric, either
    :py:class:`poptorch_geometric.dataloader.DataLoader`
    or :py:class:`poptorch_geometric.dataloader.FixedSizeDataLoader`.

All the data loaders in PopTorch Geometric take the `options` argument.
It can be used to set
`PopTorch options <https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html#poptorch.Options>`_
to process data even more efficiently.

It is recommended to read the
`Efficient data batching <https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/batching.html>`_
chapter of the PopTorch documentation, to understand the possible settings of
the `options` argument.

Pad transformation
==================

`Pad <https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.transforms.Pad.html#torch_geometric.transforms.Pad>`_
transformation is a graph transformation implemented in PyTorch Geometric. It
sets the fixed number of nodes and edges for all the graphs in the dataset and
pads the node- and edge-level feature tensors so their sizes match the number
of nodes and edges, respectively. Thanks to that, when the data loader creates
a batch of graphs, all the feature tensors of the batch have the same fixed
size and computations can be performed with high efficiency.

A dataset transformed using `Pad` must be used with the
:py:class:`poptorch_geometric.dataloader.DataLoader` data loader to guarantee
compatibility with the IPU.

.. note:: If the dataset you are working on already has fixed-size feature
    tensors, then using the `Pad` transformation is not necessary and it is
    enough to use the :py:class:`poptorch_geometric.dataloader.DataLoader` data
    loader.

Using `Pad` transformation with
:py:class:`poptorch_geometric.dataloader.DataLoader` is recommended when the
graphs in the dataset have a similar number of nodes and edges, so the number
of padding nodes and edges is small.

For examples of usage, refer to :numref:`examples_and_tutorials`.

Fixed-size data loaders
=======================

The alternative method is to use the
:py:class:`poptorch_geometric.dataloader.FixedSizeDataLoader` class with the
dataset without the `Pad` transformation. The data loader uses
:py:class:`poptorch_geometric.collate.FixedSizeCollater` underneath to
create mini-batches of graphs with a fixed number of nodes and edges from the
initial graphs that do not necessarily have the same number of nodes and edges.
The data loader combines graphs from the dataset and creates dummy graphs such
that the whole mini-batch has a fixed number of nodes, edges and graphs.

By default the `FixedSizeStrategy.PadToMax` strategy is used, which pads the
mini-batches to a fixed size: each resulting mini-batch has a fixed number of
samples and one padding graph at the end.

The data loader can also produce packed batches with a variable number of
graphs in each mini-batch. This can help reduce the amount of space in each
mini-batch assigned to padding. This is enabled by using
`FixedSizeStrategy.StreamPack` which changes the underlying sampler to
:py:class:`poptorch_geometric.stream_packing_sampler.StreamPackingSampler`.
In this case, each mini-batch contains a certain number of dummy graphs, so
that the total number of graphs in the mini-batch is constant.

Compared to `Pad` transformation, instead of padding each sample in the batch,
the data loader pads the entire batch, which is often more efficient and the
created batches are easier to manage since all the padding nodes and edges are
at the end.

For examples of usage, refer to :numref:`examples_and_tutorials`.


================================================
FILE: docs/poptorch_geometric/user_guide/reference.rst
================================================
.. _reference:

=============
API reference
=============

.. _api_options:

Data loaders
============

.. autoclass:: poptorch_geometric.dataloader.DataLoader

.. autoclass:: poptorch_geometric.dataloader.FixedSizeDataLoader

.. autoclass:: poptorch_geometric.pyg_dataloader.FixedSizeStrategy

.. autoclass:: poptorch_geometric.pyg_dataloader.OverSizeStrategy

Cluster data loaders
====================

.. autoclass:: poptorch_geometric.cluster_loader.FixedSizeClusterLoader

Collators
=========

.. autoclass:: poptorch_geometric.collate.FixedSizeCollater

Batch samplers
==============

.. autoclass:: poptorch_geometric.stream_packing_sampler.StreamPackingSampler

Fixed size options
==================

.. autoclass:: poptorch_geometric.fixed_size_options.FixedSizeOptions


================================================
FILE: docs/poptorch_geometric/user_guide/supported_operations.rst
================================================
.. _supported_operations:

====================
Supported operations
====================

This chapter contains a list of the PyTorch Geometric operations supported by the Poplar SDK.

.. table:: Supported operations
    :widths: 15, 25, 17, 43

    +-------------------------------+---------------------------+---------------+---------------------------------------------------+
    | Layers kind                   | Layer name                | Status        | Notes                                             |
    +===============================+===========================+===============+===================================================+
    | Basic blocks                  | Linear                    | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | Sequential                | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | HeteroLinear              | Supported     |                                                   |
    +-------------------------------+---------------------------+---------------+---------------------------------------------------+
    | Convolution Layers            | SimpleConv                | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | ChebConv                  | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | SAGEConv                  | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | CuGraphSAGEConv           | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | GraphConv                 | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | GatedGraphConv            | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | ResGatedGraphConv         | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | CuGraphGATConv            | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | FusedGATConv              | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | TransformerConv           | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | TAGConv                   | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | GINConv                   | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | GINEConv                  | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | ARMAConv                  | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | APPNP                     | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | MFConv                    | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | RGCNConv                  | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | CuGraphRGCNConv           | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | SplineConv                | Supported     | Only inference is supported.                      |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | NNConv                    | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | CGConv                    | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | EdgeConv                  | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | DynamicEdgeConv           | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | XConv                     | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | HypergraphConv            | Supported     | ``num_edges`` is required.                        |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | LEConv                    | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | PNAConv                   | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | ClusterGCNConv            | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | PANConv                   | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | WLConv                    | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | WLConvContinuous          | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | SuperGATConv              | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | GeneralConv               | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | HGTConv                   | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | HeteroConv                | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | LGConv                    | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | PointGNNConv              | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | GPSConv                   | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | RGATConv                  | Supported     | The ``attention_mechanism`` option                |
    |                               |                           |               | ``within-relation`` is not supported.             |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | FiLMConv                  | Supported     | ``num_relations`` cannot be greater than 1.       |
    |                               |                           |               |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | GCNConv                   | Supported     | ``add_self_loops`` must be set to False.          |
    |                               +---------------------------+               |                                                   |
    |                               | GravNetConv               |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | GATConv                   |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | GATv2Conv                 |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | AGNNConv                  |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | SGConv                    |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | SSGConv                   |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | FastRGCNConv              |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | SignedConv                |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | DNAConv                   |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | PointNetConv              |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | GMMConv                   |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | PPFConv                   |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | FeaStConv                 |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | PointTransformerConv      |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | GENConv                   |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | GCN2Conv                  |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | FAConv                    |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | EGConv                    |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | PDNConv                   |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | HEATConv                  |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | HANConv                   |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | AntiSymmetricConv         |               |                                                   |
    +-------------------------------+---------------------------+---------------+---------------------------------------------------+
    | Aggregation Layers            | MultiAggregation          | Supported     | ``dim_size`` is required.                         |
    |                               +---------------------------+               |                                                   |
    |                               | SumAggregation            |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | MeanAggregation           |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | MaxAggregation            |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | MinAggregation            |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | MulAggregation            |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | VarAggregation            |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | StdAggregation            |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | SoftmaxAggregation        |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | PowerMeanAggregation      |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | DeepSetsAggregation       |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | AttentionalAggregation    |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | Set2Set                   |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | DegreeScalerAggregation   |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | MedianAggregation         |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | QuantileAggregation       |               |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | SortAggregation           | Supported     | * ``dim_size`` is required.                       |
    |                               +---------------------------+               | * ``max_num_elements`` is required.               |
    |                               | LSTMAggregation           |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | MLPAggregation            |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | GRUAggregation            |               |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | SetTransformerAggregation | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | GraphMultisetTransformer  | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | EquilibriumAggregation    | Not supported |                                                   |
    +-------------------------------+---------------------------+---------------+---------------------------------------------------+
    | Normalization layers          | BatchNorm                 | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | HeteroBatchNorm           | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | HeteroLayerNorm           | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | InstanceNorm              | Supported     | ``batch_size`` is required.                       |
    |                               +---------------------------+               |                                                   |
    |                               | GraphNorm                 |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | LayerNorm                 |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | GraphSizeNorm             |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | PairNorm                  |               |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | MeanSubtractionNorm       | Supported     | ``dim_size`` is required.                         |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | MessageNorm               | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | DiffGroupNorm             | Supported     |                                                   |
    +-------------------------------+---------------------------+---------------+---------------------------------------------------+
    | Pooling layers                | Pooling                   | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | global_add_pool           | Supported     | ``size`` is required.                             |
    |                               +---------------------------+               |                                                   |
    |                               | global_mean_pool          |               |                                                   |
    |                               +---------------------------+               |                                                   |
    |                               | global_max_pool           |               |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | TopKPooling               | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | PANPooling                | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | SAGPooling                | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | EdgePooling               | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | ASAPooling                | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | MemPooling                | Supported     | * ``batch_size`` is required.                     |
    |                               |                           |               | * ``max_num_nodes`` is required.                  |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | max_pool                  | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | avg_pool                  | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | consecutive_cluster       | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | max_pool_neighbor_x       | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | avg_pool_neighbor_x       | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | max_pool_x                | Supported     | * ``batch_size`` is required.                     |
    |                               +---------------------------+               | * ``size`` is required.                           |
    |                               | avg_pool_x                |               |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | graclus                   | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | voxel_grid                | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | fps                       | Supported     | * ``ptr`` is required.                            |
    |                               |                           |               | * ``batch`` has to be ``None``.                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | knn                       | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | knn_graph                 | Supported     | ``loop`` must be set to True.                     |
    |                               |                           |               |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | radius                    | Supported     | ``batch_size`` is required.                       |
    |                               +---------------------------+               |                                                   |
    |                               | radius_graph              |               |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | nearest                   | Supported     | * ``torch_cluster.nearest`` has to be replaced    |
    |                               |                           |               |   with ``poptorch.nearest``.                      |
    |                               |                           |               | * ``poptorch.nearest`` supports arguments         |
    |                               |                           |               |   ``batch_x`` and ``batch_y`` in the original     |
    |                               |                           |               |   form of ``torch.Tensor`` plus a regular list.   |
    |                               |                           |               | * Validation of batch indices is NOT performed if |
    |                               |                           |               |   ``batch_x`` and ``batch_y`` are passed as       |
    |                               |                           |               |   ``torch.Tensor``.                               |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | decimation_indices        | Not supported |                                                   |
    +-------------------------------+---------------------------+---------------+---------------------------------------------------+
    | Unpooling layers              | knn_interpolate           | Supported     |                                                   |
    +-------------------------------+---------------------------+---------------+---------------------------------------------------+
    | Functional                    | bro                       | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | gini                      | Supported     |                                                   |
    +-------------------------------+---------------------------+---------------+---------------------------------------------------+
    | Dense convolutional layers    | DenseGCNConv              | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | DenseGINConv              | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | DenseGraphConv            | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | DenseSAGEConv             | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | DenseGATConv              | TBD           |                                                   |
    +-------------------------------+---------------------------+---------------+---------------------------------------------------+
    | Dense pooling layers          | dense_diff_pool           | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | dense_mincut_pool         | Not supported |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | DMoNPooling               | Not supported |                                                   |
    +-------------------------------+---------------------------+---------------+---------------------------------------------------+
    | Encoding                      | PositionalEncoding        | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | TemporalEncoding          | Supported     |                                                   |
    +-------------------------------+---------------------------+---------------+---------------------------------------------------+
    | KGE Models                    | TransE                    | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | ComplEx                   | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | DistMult                  | Supported     |                                                   |
    |                               +---------------------------+---------------+---------------------------------------------------+
    |                               | RotatE                    | Supported     |                                                   |
    +-------------------------------+---------------------------+---------------+---------------------------------------------------+


================================================
FILE: docs/poptorch_geometric/user_guide/tutorials.rst
================================================
.. _examples_and_tutorials:

======================
Examples and tutorials
======================

Examples demonstrating different use scenarios for PopTorch Geometric are
available in the
`Graphcore examples repository on GitHub <https://github.com/graphcore/examples>`_.

Tutorials in the form of Jupyter notebooks are available in the `PyTorch Geometric tutorials directory <https://github.com/graphcore/examples/tree/master/tutorials/tutorials/pytorch_geometric>`__. These tutorials show how to get the maximum benefit from IPU systems with PopTorch Geometric.


================================================
FILE: docs/user_guide/CMakeLists.txt
================================================
# Examples named here are registered without the "short" test label
# (see add_poptorch_py_user_guide_example below).
set(LONG_TESTS mnist inferenceModel)

# Register the user guide example "<path>/<name>.py" as a CTest test.
#
#   name - example file name without the ".py" extension.
#   path - directory containing the example.
#
# The test is called "<name>_user_guide_example" and always carries the
# "user_guide_examples" label. One extra label may be appended:
#   - "external_data" for the "pipeline_simple" example,
#   - nothing for examples listed in LONG_TESTS,
#   - "short" for every other example.
function(add_poptorch_py_user_guide_example name path)
  message(STATUS "Adding python example '${name}'")

  if("${name}" STREQUAL "pipeline_simple")
    set(label_suffix ";external_data")
  elseif("${name}" IN_LIST LONG_TESTS)
    set(label_suffix "")
  else()
    set(label_suffix ";short")
  endif()

  set(test_name "${name}_user_guide_example")
  add_test(NAME "${test_name}"
           COMMAND python3 ${path}/${name}.py
           WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
  set_tests_properties("${test_name}" PROPERTIES LABELS "user_guide_examples${label_suffix}")
endfunction()

# Install poptorch.conf into the "tmp" directory of the build tree.
install(FILES "poptorch.conf" DESTINATION "${PROJECT_BINARY_DIR}/tmp")

# Every Python file in this directory is treated as a user guide example.
file(GLOB EXAMPLES "${CMAKE_CURRENT_SOURCE_DIR}/*.py")
# With COPY_TESTS the examples are installed into the current binary
# directory and the tests run those copies; otherwise the tests run the
# files straight from the source directory.
if(COPY_TESTS)
  install(FILES ${EXAMPLES} DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
  set(DOC_EXAMPLES_PATH "${CMAKE_CURRENT_BINARY_DIR}")
else()
  set(DOC_EXAMPLES_PATH "${CMAKE_CURRENT_SOURCE_DIR}")
endif()

# Register one test per example, named after the file without its extension.
foreach(EXAMPLE ${EXAMPLES})
  get_filename_component(NAME ${EXAMPLE} NAME_WE)
  add_poptorch_py_user_guide_example(${NAME} ${DOC_EXAMPLES_PATH})
endforeach()

# When BUILD_DOCS is set, hand the docs_build.py invocation to
# run_poptorch_install_command (defined elsewhere in the project).
# NOTE(review): presumably this runs the command at install time from
# PROJECT_BINARY_DIR - confirm against the helper's definition.
if(BUILD_DOCS)
  run_poptorch_install_command(
    "python3 ${PROJECT_SOURCE_DIR}/scripts/docs_build.py --install-dir ${CMAKE_INSTALL_PREFIX} --add-to-sys-path ${CMAKE_INSTALL_PREFIX}"
    "${PROJECT_BINARY_DIR}"
    "docs_build.py")
endif()


================================================
FILE: docs/user_guide/api.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import tempfile
import os
import torch
import poptorch


# ctc_beam_search_start
class Model(torch.nn.Module):
    """Module whose forward pass is a single CTC beam search decode."""

    def forward(self, log_probs, lengths):
        """Return ``poptorch.ctc_beam_search_decoder(log_probs, lengths)``."""
        decoded = poptorch.ctc_beam_search_decoder(log_probs, lengths)
        return decoded


# ctc_beam_search_end


# print_tensor_start
class ExampleModel(torch.nn.Module):
    """Add one to the input, pass it through ``poptorch.ipu_print_tensor``
    and add a scalar ``bias`` parameter (initialised to zero)."""

    def __init__(self):
        super().__init__()
        initial_bias = torch.zeros(())
        self.bias = torch.nn.Parameter(initial_bias)

    def forward(self, x):
        incremented = x + 1

        # The tensor returned by the print call must be the one used below:
        # it is important that the result of the print is used.
        printed = poptorch.ipu_print_tensor(incremented)

        return printed + self.bias


# print_tensor_end

# Wrap ExampleModel for inference and call it once with a three-element
# tensor.
model = poptorch.inferenceModel(ExampleModel())
model(torch.tensor([1.0, 2.0, 3.0]))


# identity_start
def custom_loss(output, target):
    """Return the squared error scaled by 5, wrapped in
    ``poptorch.identity_loss`` with ``reduction="mean"``."""
    error = output - target
    scaled_squared_error = error * error * 5
    return poptorch.identity_loss(scaled_squared_error, reduction="mean")


class ExampleModelWithCustomLoss(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = ExampleModel()

    def forward(self, input, target):
        prediction = self.model(input)
        return prediction, custom_loss(prediction, target)


# identity_end

# Train the custom-loss model and check the value the bias converges to.
model_with_loss = ExampleModelWithCustomLoss()
poptorch_model = poptorch.trainingModel(model_with_loss)

print(f"Bias before training: {model_with_loss.model.bias}")

# Every step uses the same input/target pair. ExampleModel returns
# input + 1 + bias, so the loss is zero when the bias equals 1.
for _ in range(100):
    out, loss = poptorch_model(input=torch.tensor([1.0, 2.0, 3.0]),
                               target=torch.tensor([3.0, 4.0, 5.0]))
    print(f"Out = {out}, loss = {float(loss):.2f}")

print(f"Bias after training: {model_with_loss.model.bias}")

# The bias is read through the original module and compared against 1.0.
torch.testing.assert_close(model_with_loss.model.bias,
                           torch.tensor(1.0, dtype=torch.float),
                           rtol=1e-4,
                           atol=1e-5)
poptorch_model.destroy()

# Fresh model, sample tensors and default options used by the optimizer
# examples in the rest of this file.
model = ExampleModelWithCustomLoss()
input = torch.tensor([1.0, 2.0, 3.0])
target = torch.tensor([3.0, 4.0, 5.0])
options = poptorch.Options()
# optim_start
opt = poptorch.optim.SGD(model.parameters(),
                         lr=0.01,
                         loss_scaling=2.0,
                         use_combined_accum=False)
poptorch_model = poptorch.trainingModel(model, options, opt)
poptorch_model(input, target)
# Update optimizer attribute
opt.loss_scaling = 1.0
# Update param_group attribute
opt.param_groups[0]["loss_scaling"] = 1.0
# Set the new optimizer in the model
poptorch_model.setOptimizer(opt)
poptorch_model(input, target)
# optim_end
poptorch_model.destroy()

# optim_const_start
# lr and momentum will be marked as variable (loss_scaling is constant).
opt = poptorch.optim.SGD(model.parameters(),
                         lr=0.01,
                         momentum=0.0,
                         use_combined_accum=False)
# momentum and loss_scaling will be marked as constant.
opt = poptorch.optim.SGD(model.parameters(), lr=0.01, use_combined_accum=False)
# lr and momentum will be marked as variable.
# loss_scaling will be marked as constant.
opt = poptorch.optim.SGD(model.parameters(),
                         lr=0.01,
                         momentum=0.0,
                         loss_scaling=2.0,
                         use_combined_accum=False)
opt.variable_attrs.markAsConstant("loss_scaling")
# lr, momentum and loss_scaling will be marked as variable.
opt = poptorch.optim.SGD(model.parameters(),
                         lr=0.01,
                         loss_scaling=2.0,
                         use_combined_accum=False)
opt.variable_attrs.markAsVariable("momentum")
# optim_const_end

# Native torch.optim optimizers: per the notes in the example, an attribute is
# treated as constant when it is left unset or equal to its default value.
# torch_optim_const_start
# momentum will be marked as constant (It's not set)
opt = torch.optim.SGD(model.parameters(), lr=0.01)
# lr will be marked as variable.
# momentum will still be marked as constant (Because its default value is 0.0)
opt = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.0)
# lr and momentum will both be marked as variable.
opt = torch.optim.SGD(model.parameters(), lr=0.01, momentum=1.0)
# torch_optim_const_end

# The path below is relative to the current working directory; the docs
# CMakeLists.txt installs poptorch.conf into a tmp/ directory under the
# project binary directory for this purpose.
# conf_load_start
opts = poptorch.Options()
opts.loadFromFile("tmp/poptorch.conf")
# conf_load_end

# Round-trip the optimizer state through a checkpoint file stored in a
# temporary directory, then build a new training model from the restored
# optimizer.
with tempfile.TemporaryDirectory() as d:
    PATH = os.path.join(d, "checkpoint.pt")
    # optim_state_dict_start
    optimizer = poptorch.optim.Adam(model.parameters())
    poptorch_model = poptorch.trainingModel(model, optimizer=optimizer)
    poptorch_model(input, target)

    # Saving the optimizer state
    torch.save({'optimizer_state_dict': optimizer.state_dict()}, PATH)

    # Destroy original model to prevent an error when wrapping the model again
    poptorch_model.destroy()

    new_optimizer = poptorch.optim.Adam(model.parameters())
    # Loading the optimizer state back
    checkpoint = torch.load(PATH)
    new_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # The new training model will use the loaded optimizer state
    new_poptorch_model = poptorch.trainingModel(model, optimizer=new_optimizer)
    # optim_state_dict_end


================================================
FILE: docs/user_guide/batching.rst
================================================
.. _efficient_data_batching:

=======================
Efficient data batching
=======================

By default, PopTorch will process the ``batch_size`` which you provided to
the :py:class:`~poptorch.DataLoader`. This value is known as the micro-batch
size.

When using the other options below, the actual number of samples used per step
varies to allow the IPU(s) to process data more efficiently.

However, the effective batch size for operations which depend on it (for example the size of mini-batches, in PyTorch's terminology, when using PyTorch's `BatchNorm <https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html>`__ layers) will not change. All that changes is how much data is
actually sent for a single step.

.. note:: Failure to use :py:class:`~poptorch.DataLoader` may result in
   accidentally changing the effective batch size for operations which depend on
   it, such as batch normalization.

:tutorials-repo:`PopTorch tutorial: Efficient data loading  <tutorials/pytorch/efficient_data_loading>`
is a detailed tutorial regarding efficient data loading, batching and tuning relevant hyperparameters in PopTorch.

poptorch.DataLoader
===================

PopTorch provides a thin wrapper around the traditional `torch.utils.data.DataLoader <https://pytorch.org/docs/1.10.0/data.html#torch.utils.data.DataLoader>`_
to abstract away some of the batch sizes calculations. If :py:class:`~poptorch.DataLoader`
is used in a distributed execution environment, it will ensure that each process uses
a different subset of the dataset.

If you set the :py:class:`~poptorch.DataLoader` ``batch_size`` to more than 1
then each operation in the model will process that number of elements at any
given time. Please see the usage example below.

poptorch.AsynchronousDataAccessor
=================================

To reduce host overhead you can offload the data loading process to a
separate thread by specifying :py:class:`mode=poptorch.DataLoaderMode.Async <poptorch.DataLoaderMode>` in the
:py:class:`~poptorch.DataLoader` constructor. Internally this uses an
:py:class:`~poptorch.AsynchronousDataAccessor`. Doing this allows you to reduce
the host/IPU communication overhead by using the time that the IPU is running
to load the next batch on the CPU. This means that when the IPU is finished
executing and returns to host the data will be ready for the IPU to pull in again.

.. literalinclude:: device_iterations.py
  :caption: Use of AsynchronousDataAccessor
  :start-after: data_accessor_start
  :end-before: data_accessor_end
  :emphasize-lines: 10
  :linenos:

.. warning:: Tensors being iterated over using an
  :py:class:`~poptorch.AsynchronousDataAccessor` use shared memory. You must clone
  tensors at each iteration if you wish to keep their references outside of each
  iteration.

  Consider the following example:

  .. code-block:: python
    :emphasize-lines: 5

    predictions, labels = [], []

    for data, label in dataloader:
        predictions += poptorch_model(data)
        labels += label

  The ``predictions`` list will be correct because it's producing a new tensor from the
  inputs. However, the ``labels`` list will contain identical references. The
  emphasized line would need to be replaced with the following:

  .. code-block:: python

    labels += label.detach().clone()

Rebatching iterable datasets
----------------------------

There are `two types of datasets in PyTorch <https://pytorch.org/docs/1.10.0/data.html#dataset-types>`_ : map-style datasets and iterable datasets.

As explained in the notes of PyTorch's `Data Loading Order and Sampler <https://pytorch.org/docs/1.10.0/data.html#data-loading-order-and-sampler>`_: for
`IterableDataset <https://pytorch.org/docs/1.10.0/data.html#torch.utils.data.IterableDataset>`_:
"When fetching from iterable-style datasets with multi-processing, the drop_last argument drops the
last non-full batch of each worker's dataset replica."

This means that if the number of elements is naively
divided among the number of workers (which is the default behaviour) then potentially a significant number of elements will be dropped.

For example:

.. code-block:: python

  num_tensors = 100
  num_workers = 7
  batch_size = 4

  per_worker_tensors = ceil(num_tensors / num_workers) = 15
  last_worker_tensors = num_tensors - (num_workers - 1) * per_worker_tensors = 10

  num_tensors_used = batch_size * (floor(per_worker_tensors / batch_size) * (num_workers - 1) + floor(last_worker_tensors / batch_size))
                   = 80

This means that, in this particular case, 20% of the dataset will never be used. In general, the larger the number of workers and the batch size, the more data will end up being unused.

To work around this issue PopTorch has a :py:class:`mode=poptorch.DataLoaderMode.AsyncRebatched <poptorch.DataLoaderMode>`.
PopTorch will set the ``batch_size`` in the PyTorch Dataset and DataLoader to 1 and will instead create the batched tensors in its worker process.

The shape of the tensors returned by the DataLoader will be the same as before, but the number of used tensors from the dataset will increase to
``floor(num_tensors / batch_size) * batch_size`` (which means all the tensors would be used in the example above).

.. note:: This flag is not enabled by default because the behaviour is different from the upstream DataLoader.

.. _device_iterations:

poptorch.Options.deviceIterations
=================================

When training, a device iteration corresponds to one iteration of the training
loop executed on the IPU, starting with data loading, followed by the forward
and backward passes, and ending with a weight update. If
:ref:`gradient accumulation <gradient_accumulation>` is not used then if you set
:py:meth:`~poptorch.Options.deviceIterations` to `n`, PopTorch will carry out
this loop `n` times (processing `n` micro-batches) on the IPU before returning
control to the host, which will improve processing efficiency. If gradient
accumulation is used then the number of micro-batches processed will be `n`
multiplied by the value set using
:py:meth:`~poptorch.options._TrainingOptions.gradientAccumulation`.

For inference, a device iteration corresponds to data loading and the forward pass.

Note that the returned output dimensions depend on
:py:meth:`~poptorch.Options.outputMode`. The default value for
:py:func:`~poptorch.trainingModel` is `Final`, since you will often not need to
receive all or any of the output tensors and it is more efficient not to
receive them. Therefore, only the last batch of data will be returned to the
host under this setting. You can change this behaviour by setting the value of
:py:meth:`~poptorch.Options.outputMode` to `All`. This returns the result of
every batch to the host.

.. note:: When running an
  :py:func:`~poptorch.inferenceModel` with
  :py:class:`~poptorch.PipelinedExecution`, you must set
  :py:meth:`~poptorch.Options.deviceIterations` to at least the number of
  pipeline steps.


.. literalinclude:: device_iterations.py
  :caption: Use of device iterations and batch size
  :start-after: iterations_start
  :end-before: iterations_end
  :emphasize-lines: 51, 57, 63
  :linenos:

poptorch.Options.replicationFactor
==================================

:py:meth:`~poptorch.Options.replicationFactor` will replicate the model over
multiple IPUs to allow automatic data parallelism across many IPUs.

.. literalinclude:: device_iterations.py
  :caption: Use of replication factor
  :start-after: replication_start
  :end-before: replication_end
  :emphasize-lines: 13
  :linenos:

poptorch.Options.inputReplicaGrouping
=====================================

:py:meth:`~poptorch.Options.inputReplicaGrouping` allows the input batches to
be split between groups of replicas, in a similar way to what
:py:func:`~replicaGrouping` does for weight tensors. See
:numref:`grouping_tensor_weights`.

.. _gradient_accumulation:

poptorch.Options.Training.gradientAccumulation
==============================================

You can use
:py:meth:`~poptorch.options._TrainingOptions.gradientAccumulation`
to run a number of micro-batches before updating parameters (weights) during
training. The number of gradient accumulations is equal to the number of
micro-batches (batches whose size is specified as the ``batch_size`` value
provided to the :py:class:`~poptorch.DataLoader`) which are processed between
model updates. After accumulation, PopTorch updates the model using the
gradients accumulated from processing all the batches.

.. note:: When running an :py:func:`~poptorch.inferenceModel`, you must set
  :py:meth:`~poptorch.options._TrainingOptions.gradientAccumulation` to 1.

As mentioned in :numref:`pipelined_execution`, you need to use gradient
accumulations when training with :py:class:`~poptorch.PipelinedExecution`
because the parameters can only be updated between pipeline runs.  You need to
set the number of accumulations to at least the number of pipeline stages.
However, with this value, the pipeline will switch into the "ramp-down"
period as soon as it has finished the "ramp-up" period. Using a larger number
of gradient accumulations means that the pipeline will run at full efficiency
for longer. However, the increase in batches between parameter updates may
reduce the overall training efficiency of your model. The optimal number of
gradient accumulations is a trade off between these two factors.

.. note:: :py:meth:`~poptorch.options._TrainingOptions.gradientAccumulation`
   is only needed by :py:class:`~poptorch.PipelinedExecution`. Other execution
   modes may benefit from it because the IPUs will spend less time updating
   parameters during training.


.. literalinclude:: device_iterations.py
  :caption: Use of gradient accumulation
  :start-after: gradient_acc_start
  :end-before: gradient_acc_end
  :emphasize-lines: 12
  :linenos:

In the code example below, :py:class:`~poptorch.Block` introduced in
:numref:`execution_strategies` is used to divide up
a different model into disjoint subsets of layers.
These blocks can be shared among multiple parallel execution strategies.

.. literalinclude:: mnist.py
  :language: python
  :linenos:
  :start-after: annotations_start
  :end-before: annotations_end
  :emphasize-lines: 12, 14, 16, 18, 34
  :caption: A training model making use of :py:class:`~poptorch.Block`

You can see the code examples of :py:class:`~poptorch.SerialPhasedExecution`,
:py:class:`~poptorch.PipelinedExecution`, and
:py:class:`~poptorch.ShardedExecution` below.

An instance of class :py:class:`~poptorch.PipelinedExecution` defines an
execution strategy that assigns layers to multiple IPUs as a pipeline. Gradient
accumulation is used to push multiple batches through the pipeline allowing
IPUs to run in parallel.

.. literalinclude:: mnist.py
  :caption: An example of different parallel execution strategies
  :language: python
  :linenos:
  :start-after: annotations_strategy_start
  :end-before: annotations_strategy_end
  :emphasize-lines: 6, 13, 19, 21


:numref:`figPipeline` shows the pipeline execution for multiple batches
on IPUs. There are 4 pipeline stages running on 4 IPUs respectively.
Gradient accumulation enables us to keep the same number of pipeline stages,
but with a wider pipeline.
This helps hide the latency, which is the total time for one item to go
through the whole system, as highlighted.

.. _figPipeline:
.. figure:: IPU-pipeline.jpg
   :width: 400

   Pipeline execution with gradient accumulation

.. _trainingOutputMode:

poptorch.Options.outputMode
==========================================

When you use a :py:func:`~poptorch.inferenceModel`, you will usually want to
receive all the output tensors. For this reason, PopTorch will return them
all to you by default. However, you can change this behaviour using
:py:func:`~poptorch.Options.outputMode`.

When you use a :py:func:`~poptorch.trainingModel`, you will often not need to
receive all or any of the output tensors and it is more efficient not to
receive them. For this reason, PopTorch only returns the last batch of tensors
by default. As in the case of ``inferenceModel``, you can change this
behaviour using :py:func:`~poptorch.Options.outputMode`.

If you want to monitor training using a metric such as loss or accuracy, you
may wish to take into account all tensors. To do this with minimal or no
overhead, you can use ``poptorch.OutputMode.Sum``. For example:

.. literalinclude:: sumAnchorReturnType.py
  :caption: A model which returns training accuracy as a tensor
  :language: python
  :linenos:
  :start-after: model_returning_accuracy_start
  :end-before: model_returning_accuracy_end

.. literalinclude:: sumAnchorReturnType.py
  :caption: Efficient calculation of training accuracy across all batches
  :language: python
  :linenos:
  :start-after: sum_accuracy_start
  :end-before: sum_accuracy_end


================================================
FILE: docs/user_guide/buffers.py
================================================
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
import torch
import poptorch


# Deliberately incorrect variant: ``i`` is stored as a plain tensor attribute
# instead of being registered as a buffer.
# counter_model_wrong_start
class CounterModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.i = torch.tensor([0.], dtype=torch.float)

    def forward(self):
        self.i += 1
        return self.i


model = CounterModel()
poptorch_model = poptorch.inferenceModel(model)
print(poptorch_model())  # tensor([1.])
print(poptorch_model())  # tensor([1.])
# counter_model_wrong_end

# After two calls the attribute on the original module is expected to hold 1,
# in line with the outputs noted next to the two calls.
torch.testing.assert_close(model.i, torch.tensor([1.], dtype=torch.float))


# pragma pylint: disable=function-redefined,no-member
# Corrected variant: ``i`` is registered on the module with register_buffer().
# counter_model_correct_start
class CounterModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("i", torch.tensor([0.], dtype=torch.float))

    def forward(self):
        self.i += 1
        return self.i


model = CounterModel()
poptorch_model = poptorch.inferenceModel(model)

print(poptorch_model())  # tensor([1.])
print(poptorch_model())  # tensor([2.])
# counter_model_correct_end

# Because the model is running in inference mode, we will need to manually
# call copyWeightsToHost before checking the buffer held by the original
# module.
poptorch_model.copyWeightsToHost()
torch.testing.assert_close(model.i, torch.tensor([2.], dtype=torch.float))


================================================
FILE: docs/user_guide/debugging.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
import torch
import poptorch


class Model(torch.nn.Module):
    """Two linear layers with a ReLU in between.

    Returns the MSE loss in training mode and the raw output otherwise.
    """

    def __init__(self):
        super().__init__()
        # The attribute names matter: they are part of the tensor names
        # anchored further down in this file (for example ``fc2.bias``).
        self.fc1 = torch.nn.Linear(10, 10)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(10, 10)
        self.loss = torch.nn.MSELoss(reduction="mean")

    def forward(self, x, labels=None):
        """Return the loss against ``labels`` when training, else the output."""
        hidden = self.relu(self.fc1(x))
        prediction = self.fc2(hidden)
        if not self.training:
            return prediction
        return self.loss(prediction, labels)


# Compile and run a training model once so that the list of its tensors can
# be queried.
# tensor_names_start
input = torch.rand(10, 10)
label = torch.rand(10, 10)

model = Model()
poptorch_model = poptorch.trainingModel(model)
poptorch_model(input, label)

tensor_names = poptorch_model.getTensorNames()
# tensor_names_end

# Request anchors for two tensors, each under a user-chosen anchor name.
# tensor_anchor_start
opts = poptorch.Options()
opts.anchorTensor('grad_bias', 'Gradient___fc2.bias')
opts.anchorTensor('update_weight', 'UpdatedVar___fc2.weight')
# tensor_anchor_end

poptorch_model.destroy()

# Create the training model again, this time with the anchoring options, and
# read the anchored values back after one step.
# tensor_retrieve_start
poptorch_model = poptorch.trainingModel(model, opts)
poptorch_model(input, label)

grad = poptorch_model.getAnchoredTensor('grad_bias')
update = poptorch_model.getAnchoredTensor('update_weight')
# tensor_retrieve_end

poptorch_model.destroy()

# Run one step with an explicit optimizer and fetch its state dictionary.
# optim_state_dict_start
optim = poptorch.optim.SGD(model.parameters(), lr=0.01)
poptorch_model = poptorch.trainingModel(model, opts, optim)
poptorch_model(input, label)

state = optim.state_dict()
# optim_state_dict_end


================================================
FILE: docs/user_guide/debugging.rst
================================================
=====================
Debugging your model
=====================

Inspecting tensors
==================

PopTorch allows you to inspect arbitrary tensors in both inference and
training models. This is very useful for debugging conditions such as
overflows, underflows or vanishing gradients.

Numerous tensors are generated during model compilation. In order to inspect their values, you first have to find their names. You can
retrieve the complete list of tensor names in your model by calling
:py:func:`~poptorch.PoplarExecutor.getTensorNames`. Note that the model
must first be compiled.

.. literalinclude:: debugging.py
  :caption: Retrieving the list of tensor names
  :start-after: tensor_names_start
  :end-before: tensor_names_end
  :emphasize-lines: 8

Anchoring tensors
=================

Once you have chosen a few tensors of interest, the next step is to create anchors.
Anchoring enables a tensor to be observed by the application without it having to
be a model output.

You can create an anchor by calling :py:func:`~poptorch.Options.anchorTensor`.
It takes two mandatory string parameters: a convenient user-defined name for
the anchor and the name of the chosen tensor. Optionally, you may specify the
output mode as well as the output return period. In order for these option settings
to take effect, they must be set before model compilation.

In the example below, two anchors are created: one for a bias gradient tensor
and one for the updated weights of a linear layer.

.. literalinclude:: debugging.py
  :caption: Anchoring tensors
  :start-after: tensor_anchor_start
  :end-before: tensor_anchor_end
  :emphasize-lines: 2, 3

Retrieving tensors
==================

The anchored tensors will be updated after every model invocation. You can
retrieve their values using :py:func:`~poptorch.PoplarExecutor.getAnchoredTensor`.
The function takes a single parameter - the user-defined anchor name.

In the example below, we execute one training run and retrieve the values of
the two tensors we anchored previously.

.. literalinclude:: debugging.py
  :caption: Retrieving anchored tensors
  :start-after: tensor_retrieve_start
  :end-before: tensor_retrieve_end
  :emphasize-lines: 4, 5

For a more practical understanding around observing tensors, the `Graphcore GitHub examples repository <https://github.com/graphcore/examples>`__ contains a tutorial you can follow about observing tensors, using anchoring and generating a gradient histogram:
:tutorials-repo:`PopTorch tutorial: Observing tensors <tutorials/pytorch/observing_tensors>`.

Inspecting optimiser state
==========================

You can inspect the optimiser state without using anchoring. After you instantiate a
:py:func:`~poptorch.trainingModel`, the optimiser's `state_dict()` function will
return the internal optimiser's state. This state dictionary will be populated
when the training model is compiled, and is updated after each training step.

.. literalinclude:: debugging.py
  :caption: Inspecting optimiser state
  :start-after: optim_state_dict_start
  :end-before: optim_state_dict_end
  :emphasize-lines: 5

.. note:: The entries in PopTorch's optimiser `state_dict()` may differ from those in PyTorch in both name and structure.


================================================
FILE: docs/user_guide/device_iterations.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import sys
import poptorch

# The examples in this file are skipped when no IPU hardware is available:
# print the reason and exit with status 0.
if not poptorch.ipuHardwareIsAvailable():
    print("Replicated top level graphs are not supported on the IPU model")
    sys.exit(0)
# pylint: disable=unused-variable, wrong-import-position, reimported, ungrouped-imports, wrong-import-order
# iterations_start
from functools import reduce
from operator import mul

import torch
import poptorch


class ExampleModelWithLoss(torch.nn.Module):
    def __init__(self, data_shape, num_classes):
        super().__init__()
        num_features = reduce(mul, data_shape)
        self.fc = torch.nn.Linear(num_features, num_classes)
        self.loss = torch.nn.CrossEntropyLoss()

    def forward(self, x, target=None):
        # Flatten each sample, keeping the leading (batch) dimension.
        logits = self.fc(x.reshape([x.shape[0], -1]))
        if target is None:
            return logits
        # A target was given: return the loss alongside the output.
        return logits, self.loss(logits, target)


class ExampleDataset(torch.utils.data.Dataset):
    def __init__(self, shape, length):
        super().__init__()
        self._shape, self._length = shape, length
        self._all_data, self._all_labels = [], []

        # Fixed seed: the generated samples are the same on every run.
        torch.manual_seed(0)
        for _ in range(length):
            # Binary label; the data is uniform noise shifted by the label.
            label = int(torch.rand(()) > 0.5)
            sample = torch.rand(shape) + label
            # Negate the first entry along dimension 0.
            sample[0] = -sample[0]
            self._all_data.append(sample)
            self._all_labels.append(label)

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        return self._all_data[index], self._all_labels[index]


def device_iterations_example():
    # Set the number of samples for which activations/gradients are computed
    # in parallel on a single IPU
    model_batch_size = 2

    # Create a poptorch.Options instance to override default options
    opts = poptorch.Options()

    # Run a 100 iteration loop on the IPU, fetching a new batch each time
    opts.deviceIterations(100)

    # Set up the DataLoader to load that much data at each iteration
    training_data = poptorch.DataLoader(opts,
                                        dataset=ExampleDataset(shape=[3, 2],
                                                               length=10000),
                                        batch_size=model_batch_size,
                                        shuffle=True,
                                        drop_last=True)

    model = ExampleModelWithLoss(data_shape=[3, 2], num_classes=2)
    # Wrap the model in a PopTorch training wrapper
    poptorch_model = poptorch.trainingModel(model, options=opts)

    # Run over the training data, 100 batches at a time (specified in
    # opts.deviceIterations())
    for batch_number, (data, labels) in enumerate(training_data):
        # Execute the device with a 100 iteration loop of batchsize 2.
        # "output" and "loss" will be the respective output and loss of the
        # final batch (the default OutputMode).

        output, loss = poptorch_model(data, labels)
        # Print the last label of this step with the returned output and loss.
        print(f"{labels[-1]}, {output}, {loss}")
    # iterations_end
    poptorch_model.destroy()  # release the IPUs


# replication_start
def replication_factor_example():
    # Set the number of samples for which activations/gradients are computed
    # in parallel on a single IPU
    model_batch_size = 2

    # Create a poptorch.Options instance to override default options
    opts = poptorch.Options()

    # Run a 100 iteration loop on the IPU, fetching a new batch each time
    opts.deviceIterations(100)

    # Duplicate the model over 4 replicas.
    opts.replicationFactor(4)

    training_data = poptorch.DataLoader(opts,
                                        dataset=ExampleDataset(shape=[3, 2],
                                                               length=100000),
                                        batch_size=model_batch_size,
                                        shuffle=True,
                                        drop_last=True)

    model = ExampleModelWithLoss(data_shape=[3, 2], num_classes=2)
    # Wrap the model in a PopTorch training wrapper
    poptorch_model = poptorch.trainingModel(model, options=opts)

    # Run over the training data, 100 batches at a time (specified in
    # opts.deviceIterations())
    for batch_number, (data, labels) in enumerate(training_data):
        # Execute the device with a 100 iteration loop of model batchsize 2
        # across 4 IPUs (global batchsize = 2 * 4 = 8). "output" and "loss"
        # will be the respective output and loss of the final batch of each
        # replica (the default OutputMode).
        output, loss = poptorch_model(data, labels)
        print(f"{labels[-1]}, {output}, {loss}")
    # replication_end
    poptorch_model.destroy()  # release the IPUs


# gradient_acc_start
def gradient_accumulation_example():
    # Set the number of samples for which activations/gradients are computed
    # in parallel on a single IPU
    model_batch_size = 2
    # Create a poptorch.Options instance to override default options
    opts = poptorch.Options()

    # Run a 400 iteration loop on the IPU, fetching a new batch each time
    opts.deviceIterations(400)

    # Accumulate the gradient 8 times before applying it.
    opts.Training.gradientAccumulation(8)

    training_data = poptorch.DataLoader(opts,
                                        dataset=ExampleDataset(shape=[3, 2],
                                                               length=100000),
                                        batch_size=model_batch_size,
                                        shuffle=True,
                                        drop_last=True)

    # Wrap the model in a PopTorch training wrapper
    poptorch_model = poptorch.trainingModel(model, options=opts)

    # Run over the training data, 400 batches at a time (specified in
    # opts.deviceIterations())
    for batch_number, (data, labels) in enumerate(training_data):
        # Execute the device with a 400 iteration loop of model batchsize 2
        # with gradient updates every 8 iterations (global batchsize = 2 * 8 = 16).
        # "output" and "loss" will be the respective output and loss of the
        # final batch of each replica (the default OutputMode).
        output, loss = poptorch_model(data, labels)
        print(f"{labels[-1]}, {output}, {loss}")
    # gradient_acc_end
    poptorch_model.destroy()  # release the IPUs


def data_accessor_example():
    """Iterate a DataLoader in Async mode and feed each batch to an
    inference model built from the module-level ``model``."""
    # Not displayed: just to keep the linter happy
    shape = [3, 2]
    num_tensors = 100
    batch_size = 1
    num_workers = 0
    device_iterations = 1
    replication_factor = 1
    # Example starts here:
    # data_accessor_start
    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    loader = poptorch.DataLoader(opts,
                                 ExampleDataset(shape=shape,
                                                length=num_tensors),
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 mode=poptorch.DataLoaderMode.Async)

    poptorch_model = poptorch.inferenceModel(model, opts)

    for it, (data, _) in enumerate(loader):
        out = poptorch_model(data)
    # data_accessor_end
    poptorch_model.destroy()  # release the IPUs


# Module-level model and micro-batch size, used by the example functions in
# this file that do not create their own.
model = ExampleModelWithLoss(data_shape=[3, 2], num_classes=2)
model_batch_size = 2


# distributed_execution_start
def process(process_id=0, num_processes=1):
    # Create a poptorch.Options instance to override default options
    opts = poptorch.Options()

    # Run a 400 iteration loop on the IPU, fetching a new batch each time
    opts.deviceIterations(400)

    # Replicate the graph across 2 IPUs in each process.
    opts.replicationFactor(2)

    # Set the id of the current process and the total number of processes.
    opts.Distributed.configureProcessId(process_id, num_processes)

    # Accumulate the gradient 8 times before applying it.
    opts.Training.gradientAccumulation(8)

    # Optional: All the processes must use the same seed if shuffle=True is used for the DataLoader.
    opts.randomSeed(42)

    training_data = poptorch.DataLoader(opts,
                                        dataset=ExampleDataset(shape=[3, 2],
                                                               length=100000),
                                        batch_size=model_batch_size,
                                        shuffle=True,
                                        drop_last=True)

    # Wrap the model in a PopTorch training wrapper
    poptorch_model = poptorch.trainingModel(model, options=opts)

    # Run over the training data, 400 device iterations per call.
    for batch_number, (data, labels) in enumerate(training_data):
        # Execute the device with a 400 iteration loop, using a batch size of 2
        # per replica and 2 replicas in this process. "output" and "loss" will
        # be the respective output and loss of the final batch of each replica
        # (the default OutputMode).
        output, loss = poptorch_model(data, labels)
        print(f"{batch_number} {labels[-1]}, {output}, {loss}")

    # distributed_execution_end
    poptorch_model.destroy()  # release the IPUs


# AsynchronousDataAccessor must run in the main process
if __name__ == "__main__":
    # Run every example defined in this file, one after the other.
    for run_example_fn in (
            device_iterations_example,
            replication_factor_example,
            gradient_accumulation_example,
            data_accessor_example,
            process,
    ):
        run_example_fn()


================================================
FILE: docs/user_guide/error_handling.py
================================================
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
import torch
import poptorch
from poptorch.poptorch_core import TestErrorType


# pragma pylint: disable=broad-except
# This is a fake model which actually throws an exception
class PytorchModel(torch.nn.Module):
    """Tiny adder module that can be told to fail during construction.

    When ``error`` is not ``None`` it is handed to
    ``poptorch.poptorch_core._throwTestError`` from the constructor;
    otherwise the module is built normally.
    """

    def __init__(self, error):
        super().__init__()
        if error is None:
            # Nothing to inject: behave like a regular module.
            return
        poptorch.poptorch_core._throwTestError(error)

    def forward(self, x, y):
        """Return the sum of the two inputs."""
        total = x + y
        return total


def run_example(model_param=None):
    """Run the error handling example and check which handler was triggered.

    Args:
        model_param: value forwarded to ``PytorchModel``. ``None`` runs the
            example without injecting an error; otherwise it is expected to
            be one of the ``TestErrorType`` members.

    Raises:
        AssertionError: if the reboot / shutdown handlers that ran do not
            match what is expected for ``model_param``.
    """
    # Flags flipped by the stand-in recovery handlers below so the outcome of
    # the example can be verified once it has run.
    rebooted = False
    shutdown = False

    def reboot_server():
        nonlocal rebooted
        rebooted = True

    def shutdown_system():
        nonlocal shutdown
        shutdown = True

    # NOTE: the region between the two markers below is presumably included
    # verbatim in the user guide, so its line count is kept unchanged.
    # error_handling_start
    try:
        m = PytorchModel(model_param)
        inference_model = poptorch.inferenceModel(m)
        t1 = torch.tensor([1.])
        t2 = torch.tensor([2.])
        assert inference_model(t1, t2) == 3.0
    except poptorch.RecoverableError as e:
        print(e)
        if e.recovery_action == "FULL_RESET":
            reboot_server()
        elif e.recovery_action == "IPU_RESET":
            print("Need to reset the IPU")
        elif e.recovery_action == "PARTITION_RESET":
            print("Need to reset the partition")
    except poptorch.UnrecoverableError as e:
        print(f"Unrecoverable error: machine needs to be taken offline: {e}")
        shutdown_system()
    except poptorch.Error as e:
        print(f"Received {e.message} from component {e.type}, "
              f"location: {e.location}")
        # Or you could just print all the information at once:
        print(e)
    except Exception as e:
        print(e)
    # error_handling_end
    # Only two of the injected error types are expected to reach the
    # reboot / shutdown handlers; every other case must leave both untouched.
    if model_param == TestErrorType.PoplarRecoverableFullReset:
        assert rebooted
    elif model_param == TestErrorType.PoplarUnrecoverable:
        assert shutdown
    else:
        assert not rebooted
        assert not shutdown


if __name__ == "__main__":
    # Check the example is valid when no error is injected...
    run_example()
    # ...then replay it once for every error type that can be injected.
    for error_type in TestErrorType.__members__.values():
        run_example(error_type)


================================================
FILE: docs/user_guide/example.rst
================================================
Examples
========

You can find PyTorch examples and tutorials in the Graphcore GitHub `examples repository <https://github.com/graphcore/examples>`__.
This contains:

* Examples of popular machine learning models for training and inference
* :tutorials-repo:`Tutorials <tutorials/pytorch>`
* :tutorials-repo:`Examples of PopTorch and IPU features <feature_examples/pytorch>`
* :tutorials-repo:`Examples of simple models <simple_applications/pytorch>`
* Source code from videos, blogs and other documents

MNIST example
_____________

The example in :numref:`mnist-example-code` shows how an MNIST model can be run on the IPU. The highlighted lines show the PopTorch-specific code required to run the example on multiple IPUs.

You can download the full source code from GitHub: :github-poptorch:`mnist.py <examples/mnist.py>`.

To run this example you will need to install the Poplar SDK (see the `Getting Started Guide <https://docs.graphcore.ai/en/latest/getting-started.html>`_ for your IPU system) and the appropriate version of ``torchvision``:

.. code-block:: console

    $ python3 -m pip install torchvision==0.11.1

.. literalinclude:: ../../examples/mnist.py
  :caption: MNIST example
  :name: mnist-example-code
  :start-after: mnist_start
  :end-before: mnist_end
  :emphasize-lines: 12, 15, 17, 20, 35, 96, 99
  :language: python
  :dedent: 3
  :linenos:
  :lineno-match:


================================================
FILE: docs/user_guide/experimental.rst
================================================
=====================
Experimental features
=====================

Distributed execution without PopRun
====================================

PopTorch supports distributed execution on a Pod using the IPU over Fabric
(IPUoF).

If you run a program using your own distributed processing tool instead of PopRun, the only change you need to make to your code is to set the ID of the current process and
the total number of processes the execution is distributed across, using
:py:meth:`~poptorch.options._DistributedOptions.configureProcessId`.

Note that :py:meth:`~poptorch.Options.replicationFactor` should
be used to set the number of local replicas (per host) not the total (global)
number of replicas.

.. literalinclude:: device_iterations.py
  :caption: Changes required for distributed execution
  :start-after: distributed_execution_start
  :end-before: distributed_execution_end
  :emphasize-lines: 9, 12, 18
  :linenos:

.. note:: ``DataLoader`` will automatically select a different subset of the
  dataset based on the process ID.

.. warning:: All the processes must use the same seed if ``shuffle=True`` is used
  for the ``DataLoader``.

torch.nn.CTCLoss
================

The CTCLoss operator is supported, with some limitations:

#. The ``reduction`` parameter must be set to either ``sum`` or ``mean``
#. The ``targets`` tensor must be 2D, corresponding to stacked, padded layout


================================================
FILE: docs/user_guide/hostio_optimisation.rst
================================================
=====================
Efficient IPU I/O
=====================

When developing applications for the IPU, maximising I/O performance is
important. If an application is still I/O-bound after optimising host data
loading, then you can explore further optimisations of the movement of data
into the IPU. This chapter will cover two specific optimisations that can
improve I/O performance:

* prefetch and multibuffering
* overlapping compute and I/O

Prefetch and multibuffering
===========================

Poplar supports prefetching and multibuffering to improve I/O performance.
For more details, see `Optimising host data transfers <https://docs.graphcore.ai/projects/poplar-user-guide/en/latest/poplar_programs.html#optimising-host-data-transfers>`__ in the Poplar and PopLibs User Guide.

Prefetch is enabled by default in Poplar. The default buffer depth is 1. You
can increase the value for the buffer depth to improve I/O performance:

.. code-block:: python

    opts = poptorch.Options()
    opts._Popart.set("defaultPrefetchBufferingDepth", 3)

Using multibuffering is especially useful when you see large ``StreamCopyBegin``
or ``StreamCopyEnd`` phases in your application's profile.

For example, :numref:`figNoBuffering` shows a profile of a simple program
without using buffering. The program consists of a loop where the IPU gets data
from the host, processes it and sends the result back. The ``StreamCopy``,
shown in light orange, represents the data transfer. The first one is the host to IPU
transfer, the second one is the IPU to host transfer. They are split into a
``Begin``, a ``Mid``, and an ``End`` phase. In the ``Begin`` and ``End`` phases,
the IPU waits for the host to become ready. In the ``Mid`` phase the IPU
performs the transfer. Between the ``StreamCopy`` operations are the compute
steps, shown in red. In this profile, you can see the IPU is waiting for data
from the host for a significant amount of time.

.. figure:: no-buffering-profile.png
  :name: figNoBuffering
  :width: 100%

  Profile with multibuffering disabled

:numref:`figWithBuffering` shows the profile of the same program with
buffering. You can see that the IPU no longer waits for the host: the ``Begin``
and ``End`` sections of the ``StreamCopy`` are gone.

.. figure:: with-buffering-profile.png
  :name: figWithBuffering
  :width: 100%

  Profile with multibuffering enabled and related improvements

Overlapping compute and I/O
===========================

To optimise I/O further, you can choose to dedicate a specified number of tiles
to communication and leave the rest of the tiles for compute. Computation
time will be adversely affected by having access to fewer tiles, so there is a
trade-off between optimising I/O and optimising compute here.

To overlap compute and I/O:

#. In PopTorch's ``Options``, you must specify the number of I/O tiles and
   select one of ``ShardedExecution``, ``ParallelPhasedExecution`` or
   ``SerialPhasedExecution`` as the ``ExecutionStrategy``:

    .. code-block:: python

        opts.TensorLocations.numIOTiles(64)
        opts.setExecutionStrategy(poptorch.ShardedExecution())

#. In the forward method of the model, you must set the ``OverlapMode``
   for the inputs and outputs of the model to ``OverlapDeviceIterationLoop``, as
   follows:

    .. code-block:: python

        def forward(self, x):
          x = poptorch.set_overlap_for_input(x, poptorch.OverlapMode.OverlapDeviceIterationLoop)
          x = some_compute(x)
          x = poptorch.set_overlap_for_output(x, poptorch.OverlapMode.OverlapDeviceIterationLoop)
          return x

:numref:`figWithBufferingOverlap` shows the profile of our simple program with
both compute I/O overlap and multibuffering enabled. The compute (in red) and
the I/O (in orange) are stacked on top of each other since they both happen at
the same time.

.. _figWithBufferingOverlap:
.. figure:: with-buffering-overlap-profile.png

  Profile with both multibuffering and I/O compute overlap enabled and related improvements


================================================
FILE: docs/user_guide/index.rst
================================================
PyTorch for the IPU: User Guide
===============================

.. toctree::
    :maxdepth: 4
    :numbered: 3

    intro
    installation
    pytorch_to_poptorch
    overview
    batching
    supported_ops
    debugging
    hostio_optimisation
    example
    experimental
    reference
    legal


================================================
FILE: docs/user_guide/inferenceModel.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import os
import poptorch
# When no IPU hardware is available, set POPTORCH_IPU_MODEL=1 so that the
# example runs on the full size IPU Model and avoids running out of memory.
if not poptorch.ipuHardwareIsAvailable():
    os.environ["POPTORCH_IPU_MODEL"] = "1"

# pylint: disable=reimported
# pylint: disable=ungrouped-imports
# pylint: disable=wrong-import-order
# pylint: disable=wrong-import-position

# inference_model_start
import torch
import torchvision
import poptorch

# Some dummy ImageNet-sized input (a batch of one 3x224x224 image).
picture_of_a_cat_here = torch.randn([1, 3, 224, 224])

# The model, in this case a MobileNetV2 model with pretrained weights that
# ships with torchvision.
model = torchvision.models.mobilenet_v2(pretrained=True)
model.train(False)  # equivalent to model.eval()

# Wrap in the PopTorch inference wrapper
inference_model = poptorch.inferenceModel(model)

# Execute on IPU.
out_tensor = inference_model(picture_of_a_cat_here)

# Get the top 5 ImageNet classes.
top_five_classes = torch.topk(torch.softmax(out_tensor, 1), 5)
print(top_five_classes)

# Try the same on native PyTorch
native_out = model(picture_of_a_cat_here)

native_top_five_classes = torch.topk(torch.softmax(native_out, 1), 5)

# Models should be very close to native output although some operations are
# numerically different and floating point differences can accumulate.
# The check below only requires the two top-5 index lists (element [1] of the
# topk result) to agree in at least one position.
assert any(top_five_classes[1][0] == native_top_five_classes[1][0])
# inference_half_start
model = torch.nn.Linear(1, 10)

# Cast the parameters (weights) to half.
model.half()

# The input is cast to half as well.
t1 = torch.tensor([1.]).half()

opts = poptorch.Options()

inference_model = poptorch.inferenceModel(model, opts)
out = inference_model(t1)

# The output is expected to be half precision too.
assert out.dtype == torch.half
# inference_half_end


================================================
FILE: docs/user_guide/installation.rst
================================================
.. _installation:

============
Installation
============

.. contents::
  :local:

PopTorch is included with the Poplar SDK (see the `Getting Started guide <https://docs.graphcore.ai/en/latest/getting-started.html>`_ for your system for how to install the Poplar SDK). PopTorch is packaged as a Python wheel
file that can be installed using ``pip``.

.. important:: pip >= 18.1 is required for PopTorch dependencies to be installed properly.

To update ``pip``:

.. code-block:: bash

    $ pip install -U pip


Version compatibility
=====================

The following are the corresponding ``torch``, ``torchvision``, ``torchaudio`` and
``torch_scatter`` versions and supported Python versions.

+--------------+-----------+-----------------+----------------+------------------------------+------------+
| ``poptorch`` | ``torch`` | ``torchvision`` | ``torchaudio`` |       ``torch_scatter``      | ``python`` |
+==============+===========+=================+================+==============================+============+
|     3.3      |   2.0.1   |      0.15.2     |      2.0.1     |   >=2.0.9 and <=2.1.1        |    >=3.8   |
+--------------+-----------+-----------------+----------------+------------------------------+------------+
|     3.2      |   1.13.1  |      0.14.1     |      0.13.1    |   >=2.0.9 and <=2.1.0        |    >=3.7   |
+--------------+-----------+-----------------+----------------+------------------------------+------------+
|     3.1      |   1.13.0  |      0.14.0     |      0.13.0    |   >=2.0.9 and <=2.1.0        |    >=3.7   |
+--------------+-----------+-----------------+----------------+------------------------------+------------+
|     3.0      |   1.10.0  |      0.11.1     |      0.10.0    |             N/A              |    >=3.6   |
+--------------+-----------+-----------------+----------------+------------------------------+------------+
|     2.6      |   1.10.0  |      0.11.1     |      0.10.0    |             N/A              |    >=3.6   |
+--------------+-----------+-----------------+----------------+------------------------------+------------+
|     2.5      |   1.10.0  |      0.11.1     |      0.10.0    |             N/A              |    >=3.6   |
+--------------+-----------+-----------------+----------------+------------------------------+------------+
|     2.4      |   1.10.0  |      0.11.1     |      0.10.0    |             N/A              |    >=3.6   |
+--------------+-----------+-----------------+----------------+------------------------------+------------+
|     2.3      |   1.9.0   |      0.10.0     |      0.9.0     |             N/A              |    >=3.6   |
+--------------+-----------+-----------------+----------------+------------------------------+------------+
|     2.2      |   1.9.0   |      0.10.0     |      0.9.0     |             N/A              |    >=3.6   |
+--------------+-----------+-----------------+----------------+------------------------------+------------+
|     2.1      |   1.7.1   |      0.8.2      |      0.7.1     |             N/A              |    >=3.6   |
+--------------+-----------+-----------------+----------------+------------------------------+------------+
|     2.0      |   1.7.1   |      0.8.2      |      0.7.1     |             N/A              |    >=3.6   |
+--------------+-----------+-----------------+----------------+------------------------------+------------+
|     1.4      |   1.6.0   |      0.7.0      |      0.6.0     |             N/A              |    >=3.6   |
+--------------+-----------+-----------------+----------------+------------------------------+------------+

Based on https://github.com/pytorch/vision/blob/master/README.md

.. note:: To ensure version compatibility, ``torchvision`` and ``torchaudio`` are automatically installed with PopTorch in Poplar SDK 3.3 and later.


Using a Python virtual environment
==================================

We recommend creating and activating a virtual environment to isolate your PopTorch environment
from the system Python environment. You can use the Python tool ``virtualenv``
for this. You can create a virtual environment and install PopTorch as shown below:

.. code-block:: bash

    $ virtualenv -p python3 poptorch_test
    $ source poptorch_test/bin/activate
    $ pip install -U pip
    $ pip install <sdk_path>/poptorch_x.x.x.whl


.. _setting_env:

Setting the environment variables
=================================

The PopART and Poplar runtime libraries are required to use PopTorch, so you
will need to set the library search paths, using the scripts provided in the SDK:

.. code-block:: bash

    # Enable the Python environment containing PopTorch (if not already enabled)
    $ source poptorch_test/bin/activate

    # Add the Poplar and PopART runtime libraries to the search path
    $ source <sdk_path>/poplar-ubuntu_<os_ver>-<poplar_ver>+<build>/enable.sh
    $ source <sdk_path>/popart-ubuntu_<os_ver>-<poplar_ver>+<build>/enable.sh

where ``<sdk_path>`` is the location of the Poplar SDK on your system. ``<os_ver>`` is the version of Ubuntu on your system, ``<poplar_ver>`` is the software version number of the Poplar SDK and ``<build>`` is the build information.


Validating the setup
====================

You can run this simple example to verify that the system is working as
expected. This example can be found in the Poplar SDK ``examples`` directory.

.. literalinclude:: ../../examples/simple_adder.py
  :caption: Simple adder example
  :language: python
  :linenos:


================================================
FILE: docs/user_guide/intro.rst
================================================
============
Introduction
============

PopTorch is a set of extensions for PyTorch to enable PyTorch models to run directly
on the Graphcore IPU. PopTorch has been designed to require as few changes as
possible to your models in order to run on the IPU. However, it does have some
differences from native PyTorch execution, to get the most out of IPU hardware.
The `IPU Programmer's Guide <https://docs.graphcore.ai/projects/ipu-programmers-guide/>`__ provides an introduction to the IPU architecture, programming model and tools available.

PopTorch is included with the `Poplar SDK <https://docs.graphcore.ai/projects/sdk-overview/>`__.
See the `Getting Started guide <https://docs.graphcore.ai/en/latest/getting-started.html>`_ for your system for how to
install the Poplar SDK. Refer to :numref:`installation` for how to install the PopTorch wheel.

In the Graphcore software stack, PyTorch sits at the highest level of
abstraction. Poplar and PopLibs provide a software interface to operations
running on the IPU. PopTorch compiles PyTorch models into Poplar executables and
also provides IPU-specific functions.

.. figure:: pytorch-software-stack.png
    :align: center
    :width: 100%

    PyTorch, PopTorch and the Poplar software stack

PopTorch supports executing native PyTorch models for both inference and training.
To run a PyTorch model on the IPU, you must wrap your model with either:

* :py:func:`~poptorch.inferenceModel`
* :py:func:`~poptorch.trainingModel`

Both of these functions accept a PyTorch model (`torch.nn.Module <https://pytorch.org/docs/1.10.0/generated/torch.nn.Module.html#torch.nn.Module>`_) and create a
representation of the model that can be executed on the IPU hardware.

In training mode, PopTorch uses its own automatic differentiation engine
(autograd) that differs from native PyTorch.  The input model (`torch.nn.Module <https://pytorch.org/docs/1.10.0/generated/torch.nn.Module.html#torch.nn.Module>`_)
is required to have at least one loss built into the forward pass.  PopTorch
backpropagates the gradients from the loss value(s) to update the model
parameters. This is all taken care of automatically so your training loop does not
need to call ``.backward()`` on the loss value(s) or ``.step()`` on the optimiser.

The following example shows a typical native PyTorch training loop.  The model
incorporates a loss criterion within the ``.forward()`` method, and returns the loss
value as a second output (along with the prediction).  This native PyTorch training
loop manually invokes the ``.backward()`` method to backpropagate the gradients.
The loop also manually updates the optimiser by calling the ``.step()`` method.

.. literalinclude:: poptorch_training_simple.py
  :caption: A simple example of training using PyTorch on the CPU
  :linenos:
  :start-after: simple_cpu_start
  :end-before: simple_cpu_end


Data batching
=============

An equivalent training loop executing the model on the IPU with PopTorch is shown
below. The :py:class:`~poptorch.DataLoader` class is used to efficiently load data batches
on the IPU.  PopTorch follows the data batching semantics of `PopART <https://docs.graphcore.ai/projects/popart-user-guide/>`__. By default,
this means you will just pass in data of the normal batch size. However, there are a
number of options provided in PopTorch which will enable more efficient data
loading. See :numref:`efficient_data_batching` for more information.

Notice that the `torch.optim.AdamW <https://pytorch.org/docs/1.10.0/optim.html#torch.optim.AdamW>`_ optimiser is passed as an input argument to the
:py:func:`~poptorch.trainingModel` wrapper which applies the optimiser algorithm
during training on the IPU.  The optimiser state is automatically managed by the
PopART framework so there is no need to call the ``.step()`` method.  Another
significant change from the native training loop is there is no ``loss.backward()``.
As mentioned above, PopTorch uses its own automatic differentiation engine and will
detect the loss value to backpropagate the gradients from.

.. literalinclude:: poptorch_training_simple.py
  :caption: Equivalent code using PopTorch to train on the IPU
  :linenos:
  :start-after: simple_ipu_start
  :end-before: simple_ipu_end


Parallel and Distributed execution
==================================

To scale your models, you can enable :ref:`execution_strategies` using the
PopTorch :ref:`annotation_tools` to label or wrap individual parts of your
model and assign parts of the model to an individual IPU or execution phase.
You can also use PopTorch's :ref:`available_execution_strategies` to determine how the
model executes the phases.

Having assigned the model to run on one or more IPUs, you can add additional
parallelism with replication. Each replica represents an additional copy of the
entire model. These copies run in parallel.

PopTorch can also run across multiple hosts. This is necessary for using more
than 64 IPUs across IPU Pod systems and may be beneficial when using a smaller number
of IPUs, for example with models that involve intensive pre-processing on the CPU. We
recommend using the PopRun command-line tool and PopDist configuration
library, which can automatically set up PopTorch to run across multiple IPU-POD
hosts. Refer to the `PopDist and PopRun User Guide
<https://docs.graphcore.ai/projects/poprun-user-guide/>`__ for more information,
including details about the installation of Horovod if you are using the MPI
communication protocol.

.. _constraints:

Constraints
===========

The following constraints apply when using PopTorch:

* All tensor data types and shapes must be constant for the entire dataset.

* As PopTorch compiles to a static graph, it cannot handle control flow
  variations within the model. This means that the inputs passed at run-time
  cannot vary the control flow of the model or the shapes or sizes of results.
  If this is attempted, the graph will be frozen to whichever control flow path
  was activated as a result of the first inputs given to the wrapped model.

* Not all PyTorch operations are implemented within the PopTorch compiler.  See
  :numref:`supported_ops` for a list of operators that are supported on the IPU.
  Please also report any unsupported operators to support@graphcore.ai so that these
  ops may be incorporated into a future release.

* Whilst any argument type can be used in the forward method, only tensor
  arguments may change between model invocations, as other types will be
  statically compiled inside the executable.


Other resources
===============

`Switching from GPUs to IPUs for Machine Learning Models <https://docs.graphcore.ai/projects/differences-ipu-gpu/>`__ provides a high-level overview of the programming changes required when switching from GPUs to IPUs and `Memory and Performance Optimisation on the IPU <https://docs.graphcore.ai/projects/memory-performance-optimisation/>`__ presents guidelines to help you develop high-performance machine learning models running on the IPU.

The Graphcore `Examples GitHub repository <https://github.com/graphcore/examples>`_ contains PopTorch applications, :tutorials-repo:`feature examples <feature_examples/pytorch>`,
:tutorials-repo:`tutorials <tutorials/pytorch>` and :tutorials-repo:`simple applications <simple_applications/pytorch>`.
Further developer resources can be found on `Graphcore's developer portal <https://www.graphcore.ai/developer>`_.


================================================
FILE: docs/user_guide/legal.rst
================================================
Trademarks & copyright
======================

|LEGAL:TRADEMARKS|

|LEGAL:EULA|

Copyright © 2020-|YEAR| Graphcore Ltd. All rights reserved.


================================================
FILE: docs/user_guide/mnist.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import argparse
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from tqdm import tqdm
import poptorch


def get_mnist_data(opts):
    """Build the PopTorch data loaders for the MNIST training and test sets.

    Args:
        opts: parsed command line options; ``batch_size``,
            ``batches_per_step`` and ``test_batch_size`` are read from it.

    Returns:
        A ``(training_data, validation_data)`` tuple of
        ``poptorch.DataLoader`` objects. Both shuffle their data and drop the
        last incomplete batch.
    """
    options = poptorch.Options()

    def mnist_split(train):
        # Each split gets its own freshly built transform pipeline.
        steps = [
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize((0.1307, ), (0.3081, )),
        ]
        return torchvision.datasets.MNIST(
            'mnist_data/',
            train=train,
            download=True,
            transform=torchvision.transforms.Compose(steps))

    # The training loader receives enough samples for a whole step at once.
    training_set = mnist_split(True)
    training_data = poptorch.DataLoader(options,
                                        training_set,
                                        batch_size=opts.batch_size *
                                        opts.batches_per_step,
                                        shuffle=True,
                                        drop_last=True)

    validation_set = mnist_split(False)
    validation_data = poptorch.DataLoader(options,
                                          validation_set,
                                          batch_size=opts.test_batch_size,
                                          shuffle=True,
                                          drop_last=True)
    return training_data, validation_data


#annotations_start
class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.layer1 = nn.Linear(784, 784)
        self.layer2 = nn.Linear(784, 784)
        self.layer3 = nn.Linear(784, 128)
        self.layer4 = nn.Linear(128, 10)  # 10 output values per sample
        self.softmax = nn.Softmax(1)  # softmax over dimension 1

    def forward(self, x):
        x = x.view(-1, 784)  # reshape the input into rows of 784 values
        with poptorch.Block("B1"):  # each stage runs inside a named block
            x = self.layer1(x)
        with poptorch.Block("B2"):
            x = self.layer2(x)
        with poptorch.Block("B3"):
            x = self.layer3(x)
        with poptorch.Block("B4"):  # last layer and softmax share block "B4"
            x = self.layer4(x)
            x = self.softmax(x)
        return x


class TrainingModelWithLoss(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model  # the wrapped model producing the predictions
        self.loss = torch.nn.CrossEntropyLoss()

    def forward(self, args, loss_inputs=None):
        output = self.model(args)
        if loss_inputs is None:  # no targets given: return predictions only
            return output
        with poptorch.Block("B4"):  # "B4" is also used in Network.forward
            loss = self.loss(output, loss_inputs)
        return output, loss


#annotations_end


def accuracy(predictions, labels):
    """Return the percentage of predictions whose top class equals the label.

    During training not every sample's prediction is returned (for efficiency
    reasons), so only the trailing labels, as many as there are predictions,
    are compared.
    """
    num_predictions = predictions.size()[0]
    # Keep only the labels for which a prediction is available.
    matching_labels = labels[-num_predictions:]
    predicted_classes = torch.max(predictions, 1)[1]
    num_correct = torch.sum(torch.eq(predicted_classes,
                                     matching_labels)).item()
    return num_correct / matching_labels.size()[0] * 100.0


def train(training_model, training_data, opts):
    """Train for ``opts.epochs`` epochs, showing loss and accuracy as it goes.

    Args:
        training_model: callable taking ``(data, labels)`` and returning
            ``(predictions, losses)``.
        training_data: sized iterable yielding ``(data, labels)`` batches.
        opts: options object; ``epochs`` and ``profile`` are read from it.

    When ``opts.profile`` is truthy the function returns right after the
    first batch has been processed.
    """
    total_batches = len(training_data)
    last_epoch = opts.epochs
    for epoch in range(1, last_epoch + 1):
        print("Epoch {0}/{1}".format(epoch, opts.epochs))
        progress = tqdm(training_data, total=total_batches)
        for data, labels in progress:
            preds, losses = training_model(data, labels)
            # Metrics are for display only, so no gradients are tracked.
            with torch.no_grad():
                mean_loss = torch.mean(losses).item()
                acc = accuracy(preds, labels)
            description = "Loss:{:0.4f} | Accuracy:{:0.2f}%".format(
                mean_loss, acc)
            progress.set_description(description)
            if opts.profile:
                return


def test(inference_model, test_data):
    """Run inference over ``test_data`` and print the mean per-batch accuracy.

    The printed value is the sum of the per-batch accuracies divided by the
    number of batches.
    """
    num_batches = len(test_data)
    with torch.no_grad():
        total_accuracy = sum(
            (accuracy(inference_model(data), labels)
             for data, labels in tqdm(test_data, total=num_batches)), 0.0)
    mean_accuracy = total_accuracy / num_batches
    print("Accuracy on test set: {:0.2f}%".format(mean_accuracy))


if __name__ == '__main__':

    def _parse_bool(value):
        """Convert a command-line string such as "true" or "0" to a bool.

        ``type=bool`` must not be used with argparse: ``bool("False")`` is
        ``True`` because every non-empty string is truthy.
        """
        normalised = value.strip().lower()
        if normalised in ('1', 'true', 't', 'yes', 'y', 'on'):
            return True
        if normalised in ('', '0', 'false', 'f', 'no', 'n', 'off'):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got {!r}".format(value))

    # Command-line options controlling batch sizes, the optimizer, profiling
    # and the execution strategy.
    parser = argparse.ArgumentParser(description='MNIST training in PopTorch')
    parser.add_argument('--batch-size',
                        type=int,
                        default=4,
                        help='batch size for training (default: 4)')
    parser.add_argument('--batches-per-step',
                        type=int,
                        default=8,
                        help='device iteration (default:8)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=2,
                        help='batch size for testing (default: 2)')
    parser.add_argument('--epochs',
                        type=int,
                        default=1,
                        help='number of epochs to train (default: 1)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-4,
                        help='learning rate (default: 1e-4)')
    parser.add_argument(
        '--profile',
        type=str,
        help=
        "do a single iteration of training for profiling and place in a folder"
    )
    parser.add_argument('--strategy',
                        choices=['plain', 'pipelined', 'phased'],
                        default='plain',
                        help='execution strategy')
    # An explicit converter is used so that "--offload-opt False" really
    # disables the option.
    parser.add_argument('--offload-opt',
                        type=_parse_bool,
                        help="offload optimizer state (true/false)")
    opts = parser.parse_args()

    poptorch.setLogLevel("DEBUG")  # Force debug logging

    #annotations_strategy_start
    training_data, test_data = get_mnist_data(opts)
    model = Network()
    model_with_loss = TrainingModelWithLoss(model)
    model_opts = poptorch.Options().deviceIterations(1)
    if opts.strategy == "phased":
        strategy = poptorch.SerialPhasedExecution("B1", "B2", "B3", "B4")
        strategy.stage("B1").ipu(0)
        strategy.stage("B2").ipu(0)
        strategy.stage("B3").ipu(0)
        strategy.stage("B4").ipu(0)
        model_opts.setExecutionStrategy(strategy)
    elif opts.strategy == "pipelined":
        strategy = poptorch.PipelinedExecution("B1", "B2", "B3", "B4")
        strategy.stage("B1").ipu(0)
        strategy.stage("B2").ipu(1)
        strategy.stage("B3").ipu(2)
        strategy.stage("B4").ipu(3)
        model_opts.setExecutionStrategy(strategy)
        model_opts.Training.gradientAccumulation(opts.batches_per_step)
    else:
        strategy = poptorch.ShardedExecution("B1", "B2", "B3", "B4")
        strategy.stage("B1").ipu(0)
        strategy.stage("B2").ipu(0)
        strategy.stage("B3").ipu(0)
        strategy.stage("B4").ipu(0)
        model_opts.setExecutionStrategy(strategy)

    if opts.offload_opt:
        model_opts.TensorLocations.setActivationLocation(
            poptorch.TensorLocationSettings().useOnChipStorage(True))
        model_opts.TensorLocations.setWeightLocation(
            poptorch.TensorLocationSettings().useOnChipStorage(True))
        model_opts.TensorLocations.setAccumulatorLocation(
            poptorch.TensorLocationSettings().useOnChipStorage(True))
        model_opts.TensorLocations.setOptimizerLocation(
            poptorch.TensorLocationSettings().useOnChipStorage(False))

    training_model = poptorch.trainingModel(
        model_with_loss,
        model_opts,
        optimizer=optim.AdamW(model.parameters(), lr=opts.lr))

    # run training, on IPU
    train(training_model, training_data, opts)
    #annotations_strategy_end

    # When profiling, stop here: training returned after a single iteration.
    # NOTE(review): the process exits with status 1 after a profiling run;
    # confirm that a non-zero exit code is intended.
    if opts.profile:
        sys.exit(1)

    # Update the weights in model by copying from the training IPU. This updates (model.parameters())
    training_model.copyWeightsToHost()

    # Check the accuracy on the test set on IPU once trained. Because PopTorch will be compiled on
    # first call the weights in model.parameters() will be copied implicitly. Subsequent calls will
    # need to call inference_model.copyWeightsToDevice()
    inf_opts = poptorch.Options().deviceIterations(opts.test_batch_size)
    strategy = poptorch.ShardedExecution("B1", "B2", "B3", "B4")
    strategy.stage("B1").ipu(0)
    strategy.stage("B2").ipu(0)
    strategy.stage("B3").ipu(0)
    strategy.stage("B4").ipu(0)
    inf_opts.setExecutionStrategy(strategy)

    inference_model = poptorch.inferenceModel(model, inf_opts)
    test(inference_model, test_data)


================================================
FILE: docs/user_guide/overview.rst
================================================
========
Features
========

.. contents::
  :local:
  :depth: 3

Options
=======

You can change how PopTorch compiles and executes models using :py:class:`~poptorch.Options`.
You can find a full list of options in :numref:`api_options`.
Broadly speaking, the options fall into the following categories:

#. General options (see :py:class:`~poptorch.Options`)
#. Options related to half precision (see :py:class:`opts.Precision.* <poptorch.options._PrecisionOptions>`)
#. Management of the training process (see :py:class:`opts.Training.* <poptorch.options._TrainingOptions>`)
#. Location of tensors (see: :py:class:`opts.TensorLocations.* <poptorch.options._TensorLocationOptions>` and
   :py:class:`~poptorch.TensorLocationSettings`)
#. Options relevant to the Torch JIT compiler
   (see :py:class:`opts.Jit.* <poptorch.options._JitOptions>`)
#. Control of distributed execution environments when using tools other than
   `PopRun <https://docs.graphcore.ai/projects/poprun-user-guide/>`__
   (see :py:class:`opts.Distributed.* <poptorch.options._DistributedOptions>`)

See :numref:`efficient_data_batching` for a full
explanation of how :py:meth:`~poptorch.Options.deviceIterations`,
:py:meth:`~poptorch.options._TrainingOptions.gradientAccumulation` and
:py:meth:`~poptorch.Options.replicationFactor` interact with a model's input and
output sizes.

You can choose to use the IPU Model instead of IPU hardware
with the :py:meth:`~poptorch.Options.useIpuModel` option.

Setting options via config file
-------------------------------

In addition to setting these options programmatically, you can also set them in a
config text file by using :py:func:`~poptorch.Options.loadFromFile`.

Each line in the file must contain a single command corresponding to setting an option
in :py:class:`~poptorch.Options`. To set an option within the file, write the command as you
would within a Python script but omit the ``options.`` prefix. For example:

.. literalinclude:: poptorch.conf
    :language: python
    :caption: Example contents of a config file used to set options
    :linenos:

Then, instantiate :py:class:`~poptorch.Options` and call :py:func:`~poptorch.Options.loadFromFile`:

.. literalinclude:: api.py
    :language: python
    :caption: Setting options using a config file named "poptorch.conf"
    :linenos:
    :start-after: conf_load_start
    :end-before: conf_load_end
    :emphasize-lines: 2

Model wrapping functions
========================


The basis of PopTorch integration comes from the two model wrapping functions
described in the following sections.

.. note:: PopTorch makes a shallow copy of the model. Changes to the parameters
    in the models returned by these two model wrapping functions affect the
    original model and vice versa. However, primitive variable types will not be
    kept in sync. This includes the ``training`` bool of ``torch.nn.Module``.
    If your PyTorch model is named ``model``, call ``model.eval()`` or
    ``model.train()``, if required, before calling these wrapping functions.


poptorch.trainingModel
----------------------

This function wraps a PyTorch model, yielding a PopTorch model that can
be run on the IPU in training mode. See :py:func:`~poptorch.trainingModel` for
more information.

.. literalinclude:: trainingModel.py
    :language: python
    :caption: An example of the use of trainingModel
    :linenos:
    :emphasize-lines: 22
    :start-after: training_model_start
    :end-before: training_model_end

.. note:: By default, PopTorch will only return the final batch of outputs.
   Please see :numref:`trainingOutputMode` for details on what PopTorch returns
   when using :py:func:`~poptorch.trainingModel` and how you can calculate
   statistics such as training accuracy over all batches.

poptorch.inferenceModel
-----------------------

This function wraps a PyTorch model, yielding a PopTorch model that can
be run on the IPU in inference mode. See :py:func:`~poptorch.inferenceModel` for
more information.

.. literalinclude:: inferenceModel.py
    :language: python
    :caption: An example of the use of inferenceModel
    :linenos:
    :start-after: inference_model_start
    :emphasize-lines: 14


poptorch.PoplarExecutor
-----------------------

You should not create this class directly. It is a wrapper around the model
that was passed into :py:func:`~poptorch.inferenceModel` or :py:func:`~poptorch.trainingModel`.
It has a few methods which you can use to interface with the IPU.

The :py:class:`~poptorch.PoplarExecutor` will implicitly keep in sync the
parameters of the source PyTorch model and the PopTorch model(s). However, you
need to explicitly copy the weights before you run a model on the IPU if you
train the model on the CPU after you have already wrapped it for the IPU. You
also need to explicitly copy the weights if you alter an already wrapped model
parameter by some other means.

See :py:class:`~poptorch.PoplarExecutor` for a complete description of the IPU interface
functionality.

.. literalinclude:: trainingModel.py
    :language: python
    :caption: Example of when explicit copies are needed
    :linenos:
    :start-after: explicit_copy_start
    :end-before: explicit_copy_end

poptorch.isRunningOnIpu
-----------------------

One useful utility function is :py:func:`~poptorch.isRunningOnIpu`. This
returns ``True`` when executing on the IPU and ``False`` when executing
the model outside IPU scope. This allows for different code paths within
the model.

A common use case is executing equivalent code to a PopART custom operator
when running on the CPU. For example:

.. code-block:: python

    class Network(torch.nn.Module):
      def forward(self, x, y):
          if poptorch.isRunningOnIpu():
              # IPU path
              return my_custom_operator(x, y)
          else:
              # CPU path
              return my_torch_implementation(x,y)


Error handling
==============

Recoverable runtime errors
--------------------------

This category of error is likely to be transient.

Exception type raised by PopTorch: `poptorch.RecoverableError` (inherits from `poptorch.Error`)

The exception contains the action required to recover from this error in its `recovery_action` string attribute.

This attribute can contain:

 - `IPU_RESET`: Reset the IPU and reload the IPU memory.
 - `LINK_RESET`: Reset the IPU-Links in a non-Pod system. This retrains the IPU-Links between IPUs.
 - `PARTITION_RESET`: Reset the IPU partition in a Pod system. This retrains the IPU-Links between IPUs.
 - `FULL_RESET`: Power cycle the system.

Unrecoverable runtime errors
----------------------------

These errors are likely to persist. You should take the system out of operation for analysis and repair.

Exception type raised by PopTorch: `poptorch.UnrecoverableError` (inherits from `poptorch.Error`)

Application and other errors
----------------------------

This kind of error is due to an error in the program or a misuse of an API.

Exception type raised by PopTorch: `poptorch.Error` if the error was detected in the C++ backend, or some generic Python `Exception` if it happened in the Python layer.

`poptorch.Error` has the following string attributes:

 - `message` The error message without any of the context.
 - `type` The part of the software stack that raised the exception and the category of the error if available.
 - `location` Where the exception was raised.

Example:

.. literalinclude:: error_handling.py
    :language: python
    :linenos:
    :start-after: error_handling_start
    :end-before: error_handling_end
    :caption: How to handle recoverable / unrecoverable errors

.. _execution_strategies:

Multi-IPU execution strategies
==============================

This section describes strategies to run PopTorch code on more than one IPU.
Some of these allow you to run code in parallel on multiple IPUs.
You will need to use one of these execution strategies for PopTorch code that
does not fit on a single IPU, but if you do not explicitly select one,
PopTorch will use the default execution strategy,
:py:class:`~poptorch.PipelinedExecution`.

.. note:: In general, we advise pipelining over as few IPUs as possible.
  However, you may need to experiment to find the optimal pipeline length.
  In some corner cases, a longer pipeline can lead to faster throughput.

There are four kinds of execution strategies that you can use to run a model on a
multi-IPU device:

* :py:class:`~poptorch.PipelinedExecution`
* :py:class:`~poptorch.ShardedExecution`
* :py:class:`~poptorch.SerialPhasedExecution`
* :py:class:`~poptorch.ParallelPhasedExecution`

You can select this with the
:py:func:`~poptorch.Options.setExecutionStrategy` option.


The following subsections first introduce the general functions which are
relevant to all four parallel execution strategies. Next, they explain the
four strategies with examples.

By default, PopTorch will not let you run the model if the number of IPUs is
not a power of 2.
For this reason, it is preferable to annotate the model so that the number of
IPUs used is a power of 2.
However, you can also enable :py:func:`~poptorch.Options.autoRoundNumIPUs` to
automatically round up the number of IPUs reserved to a power of 2, with the
excess being reserved but idle.
This option is not enabled by default to prevent unintentional overbooking of
IPUs.

.. _annotation_tools:

Annotations
-----------

In PopTorch, you can divide a model into blocks. Blocks are associated with stages and
stages can be grouped into phases. This chapter will describe how to define them
and how to use them to set up different execution modes.

.. _figStages:
.. figure:: stages_summary.png
   :width: 581

   PopTorch model partition summary


Model partitioning using blocks
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

:py:class:`~poptorch.BeginBlock` is a wrapper class, :py:class:`~poptorch.Block`
is a context manager, and :py:func:`~poptorch.BlockFunction` is a function
decorator. You can use one or more of these to partition models into "blocks"
that can be executed on different IPUs.

You can use :py:class:`~poptorch.BeginBlock` to annotate an existing model. Each
call, with example arguments ``(layer_n, ipu_id=m)``, places layers enclosed in
``layer_n`` on IPU ``m``. Note that PopTorch places the first layers on
``ipu_id`` 0 by default. However, layers in between
:py:class:`~poptorch.BeginBlock` annotations will inherit that of the previous
annotated block.

.. literalinclude:: pipeline_simple.py
    :language: python
    :linenos:
    :start-after: annotations_start
    :end-before: annotations_end
    :emphasize-lines: 37-38, 41-42, 45-46
    :caption: Annotating existing layers

.. note:: The :py:class:`~poptorch.BeginBlock` annotations internally use PyTorch hooks.
    If the module passed to :py:func:`~poptorch.BeginBlock` uses hooks, for example with
    `register_forward_pre_hook <https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_forward_pre_hook>`__,
    then the assignment of operations to blocks may depend on the order those hooks are added.
    A concrete example may help to clarify this: consider a layer, and an operation that is defined in a hook function.
    If ``register_forward_pre_hook()`` is called on the layer, followed by a call to :py:func:`~poptorch.BeginBlock` passing the same layer as argument,
    then the operation defined in the hook will be assigned to the preceding block (so not the same block as the layer).
    If instead the call to :py:func:`~poptorch.BeginBlock` happens before ``register_forward_pre_hook()``, then the operation
    will be assigned in the same block as the layer.

You can use :py:class:`~poptorch.Block` to annotate a model from within its
definition. This context manager class defines a scope in the context of
the model. Everything within that scope is placed on the IPU specified (unless
overridden by a :py:class:`~poptorch.Stage`).

.. literalinclude:: pipeline_simple.py
    :language: python
    :linenos:
    :start-after: annotations_inline_start
    :end-before: annotations_inline_end
    :emphasize-lines: 16, 19, 22, 26
    :caption: Annotating a model directly

In addition, you can use the :py:func:`~poptorch.BlockFunction` function decorator
to place functions (containing one or more layers) onto a particular block.
Everything within that function is placed on the IPU specified (unless
overridden by a :py:class:`~poptorch.Stage`).

.. literalinclude:: pipeline_simple.py
    :language: python
    :linenos:
    :start-after: annotations_decorator_start
    :end-before: annotations_decorator_end
    :emphasize-lines: 19, 25
    :caption: Annotating functions

You can use any, or a combination, of these three annotation options.
In the above examples, ``ipu_id`` is used to specify blocks. This alone is
sufficient to enable parallel execution: by default,
:py:class:`~poptorch.AutoStage` will set up a pipeline for which the pipeline
stage is equal to the ``ipu_id`` for each block. However, it would be equally
valid to instead use the ``user_id`` argument to assign names to each block.
Then, using :py:class:`~poptorch.Stage` or :py:class:`~poptorch.Phase` classes,
you can manually assign each block in a pipeline using their names, as outlined
in the next sections.

:py:class:`~poptorch.BeginBlock`, :py:class:`~poptorch.Block` and
:py:func:`~poptorch.BlockFunction`  need to follow a set of rules:

* You must declare all the layers inside a :py:class:`~poptorch.Block` scope,
  using either the context manager or :py:func:`~poptorch.BlockFunction`, to
  avoid missing annotations. :py:class:`~poptorch.BeginBlock` does not have this
  constraint because all the layers called after this will
  automatically be added to the last :py:class:`~poptorch.BeginBlock`.
* Note that PopTorch needs to reserve IPUs in powers of 2. You are
  advised to configure your model accordingly to take full advantage of the IPUs
  available. However, if you need to run with a different number of IPUs, you
  can use ``poptorch.Options().autoRoundNumIPUs(True)`` to allow PopTorch to
  reserve more IPUs than the model specifies.
* You should not include unused or dead layers in any
  :py:class:`~poptorch.BeginBlock` or :py:class:`~poptorch.Block`.
* If layer A happens before layer B inside the model and each layer has
  a :py:class:`~poptorch.BeginBlock` associated with it,
  you need to write :py:class:`~poptorch.BeginBlock` for layer A before
  :py:class:`~poptorch.BeginBlock` for layer B.

Failing to obey the above rules will result in compilation errors.


poptorch.Stage and poptorch.AutoStage
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Conceptually, :py:class:`~poptorch.BeginBlock` and
:py:class:`~poptorch.Block` collect the
layers of a model into a :py:class:`~poptorch.Stage`.
You can combine multiple stages into a :py:class:`~poptorch.Phase`.
Multiple phases form an execution strategy.

poptorch.Stage
""""""""""""""

:py:class:`~poptorch.Stage` defines the layers of the model to run on one IPU.
A stage can consist of one or more blocks created using
:py:class:`~poptorch.BeginBlock` or :py:class:`~poptorch.Block`
and identified by their ``user_id``.

You can define consecutive layers in a model in either the same
stage or consecutive stages.
Whether stages run in parallel or sequentially depends on the specific
execution strategy.

Internally, each operation in a model is assigned a ``stage_id``
through :py:class:`~poptorch.Stage`.

poptorch.AutoStage
""""""""""""""""""

You can use :py:class:`~poptorch.AutoStage` if you don't want to
specify stages by hand.
This will assign one :py:class:`~poptorch.Stage`
per :py:class:`~poptorch.BeginBlock` or :py:class:`~poptorch.Block`.

By default, ``AutoStage.SameAsIpu`` is true, which means the
``stage_id`` of the :py:class:`~poptorch.Stage` will be set to the ``ipu_id``
specified for the :py:class:`~poptorch.BeginBlock` or
:py:class:`~poptorch.Block`.

Note that ``stage_id`` must have ascending values in
:py:class:`~poptorch.PipelinedExecution`.
Let's use the code example above.
If your blocks "0", "1", and "2" are assigned to IPUs 0, 1, and 0 respectively,
then the :py:class:`~poptorch.Block`
"2" will be assigned ``stage_id`` 0. This will cause
the compiler to fail to
schedule the last two stages "1" and "2" due to a conflict:

* The model implies "1" should run earlier than "2"
* Their ``stage_id`` values suggest "2" should run earlier than "1"

When ``AutoStage.AutoIncrement`` is true, each new
:py:class:`~poptorch.BeginBlock` or
:py:class:`~poptorch.Block` will be assigned an automatically incremented
``stage_id``.
In the previous example the last stage would be assigned ``stage_id`` 2 and
the compilation would succeed.

poptorch.Phase
^^^^^^^^^^^^^^

:py:class:`~poptorch.Phase` defines a processing unit of phased execution.
It can contain one or more :py:class:`~poptorch.Stage` stages.

:py:class:`~poptorch.Phase` is only used in
:py:class:`~poptorch.SerialPhasedExecution` and
:py:class:`~poptorch.ParallelPhasedExecution`.
It is not used in
:py:class:`~poptorch.ShardedExecution` and
:py:class:`~poptorch.PipelinedExecution`.

.. literalinclude:: phased_execution.py
    :language: python
    :caption: Example of Stage declaration
    :linenos:
    :start-after: stage_start
    :end-before: stage_end

In the code snippet above, "A" and "B" will run in parallel on IPUs 0 and 1
simultaneously because they are placed in two stages. They will run
sequentially on one IPU if they are placed in a single stage.


Advanced annotation with strings
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

You can use Python strings to represent the ``user_id`` and ``ipu_id`` for a
:py:class:`~poptorch.Block` or
:py:class:`~poptorch.BeginBlock`.
Because strings are evaluated at runtime,
they allow for a dynamic number of stages and phases.

Here is an example showing how to use formatted strings (f-strings) in
:py:class:`~poptorch.ParallelPhasedExecution`.

In :numref:`parallel_phased_example`, there are several places where f-strings are
used:

* Line 25: ``f"phase{phase}_ipu{ipu}"``, where ``phase`` has the values
  0, 1, 1, 2, 3, 3, 4, 5, and 5, and ``ipu`` ranges from 0 to 1.
  The total number of instances for this f-string is 12, from
  6 phases and 2 IPUs.

* Line 32: ``f"phase{N*2-1}_ipu1"``,
  where ``phase`` is 5 and ``ipu`` is 1.

* Lines 46-47 and 50-51: when defining :py:class:`~poptorch.Stage`,
  four f-strings are used where ``n`` ranges from 0 to 2

  * ``f"phase{2*n}_ipu0"``
  * ``f"phase{2*n}_ipu1"``
  * ``f"phase{2*n+1}_ipu0"``
  * ``f"phase{2*n+1}_ipu1"``

  These refer to phases 0, 2, 4 and 1, 3, 5, with ``ipu0`` and ``ipu1``,
  respectively.
  So all these 12 f-strings are defined in :py:class:`~poptorch.BeginBlock`,
  and used in :py:class:`~poptorch.Stage` dynamically. These match exactly.

.. literalinclude:: phased_execution.py
  :caption: An example of parallel phased execution
  :language: python
  :linenos:
  :start-after: annotations_start
  :end-before: annotations_end
  :emphasize-lines: 23, 30, 45-46, 49-50
  :name: parallel_phased_example

With the above functions as building blocks, you can set execution strategies
using the four kinds of execution modes, as shown below.

.. _available_execution_strategies:

Available execution strategies
------------------------------

Note that you can use the same annotation for each execution strategy.
They only differ in the method of parallelisation and tensor locations.

.. _pipelined_execution:

Pipelined execution
^^^^^^^^^^^^^^^^^^^

:py:class:`~poptorch.PipelinedExecution` is the default execution strategy.

When running a model for inference with
:py:class:`~poptorch.PipelinedExecution`, you must set :py:meth:`~poptorch.Options.deviceIterations` to be greater than or equal to
the number of pipeline stages used by the model. You can also do this for
training to improve efficiency.

Each time you switch IPU, PopTorch adds a new pipeline stage.
If two consecutive blocks/stages use the same IPU, PopTorch will merge them
into a single block/stage.
It is usually better not to revisit an IPU, creating more than one pipeline
stage on the same IPU, because the IPU cannot run both stages at the same time.
Hence in most cases, the number of pipeline stages for inference will be equal
to the number of IPUs you have used.

When training, PopTorch doubles the number of pipeline stages in order to run
backpropagation, except for the last forward stage which becomes a combined
forward and backward pipeline stage (:numref:`fig_poptorch_pipelining`).

.. _fig_poptorch_pipelining:
.. figure:: pipelined_execution.png
  :width: 95%

  PopTorch pipelined execution for training. The last forward stage is combined with the first backward stage.

You must set
:py:meth:`~poptorch.options._TrainingOptions.gradientAccumulation` to be greater
than or equal to the number of pipeline stages (forward and backward).
As well as these constraints, you must also consider that the number of batches
obtained each time you call the model will be multiplied (from the conventional
model batch size, known as the micro-batch size) by
:py:meth:`~poptorch.Options.deviceIterations` *
(:py:meth:`~poptorch.Options.replicationFactor` / ``input_group_size``) *
:py:meth:`~poptorch.options._TrainingOptions.gradientAccumulation` during
training and :py:meth:`~poptorch.Options.deviceIterations` *
(:py:meth:`~poptorch.Options.replicationFactor` / ``input_group_size``) during
inference (for details of ``input_group_size`` see
:py:meth:`~poptorch.Options.replicationFactor`). You can use
:py:class:`poptorch.DataLoader` to abstract this calculation but
you should still be aware that this will take place.

.. note:: The effective or conventional batch size for layers which depend on it
  (such as batch normalization) is known as the micro-batch size. If you use
  :py:class:`~poptorch.DataLoader`, the ``batch_size`` which you pass to it is
  the micro-batch size.

After each IPU has finished processing a micro-batch, the same IPU immediately
starts processing the next micro-batch while the next IPU processes the
micro-batch that the same IPU just processed. This creates a pipeline which
processes multiple micro-batches in parallel.

An IPU can only start its own stage of a micro-batch after the previous stage of
that micro-batch has been processed. Hence, not all IPUs will be occupied until
after a "ramp-up" period.

There is also a "ramp-down" period at the end of processing, during which there
are no new micro-batches entering the pipeline for the first IPU to process while
the IPUs down the pipeline still have micro-batches to process. Hence, during
this period, the number of IPUs occupied will reduce each step. For this reason,
you should try using a larger value for
:py:meth:`~poptorch.options._TrainingOptions.gradientAccumulation`. But you
should note that reducing the frequency of parameter updates will also have
an adverse effect on training.

Although you only define the pipeline stages for forward passes,
the corresponding stages for backward passes are also created.

.. _sharded_execution:

Sharded execution
^^^^^^^^^^^^^^^^^

In this strategy, each IPU
will sequentially execute a distinct part of the model.
A single unit of processing in :py:class:`~poptorch.ShardedExecution` is called a
shard.

A shard is specified using :py:class:`~poptorch.Stage`,
or if no :py:class:`~poptorch.Stage` is specified,
the ``user_id`` passed by
:py:class:`~poptorch.BeginBlock` or :py:class:`~poptorch.Block` is used.
Each shard is executed sequentially on a single IPU (:numref:`fig_poptorch_sharded`).
You can place multiple shards on multiple IPUs.
However, only one IPU is used at a time, while
the other IPUs are idle.

.. _fig_poptorch_sharded:
.. figure:: sharded_execution.png
  :width: 95%

  PopTorch sharded execution for training.

If an IPU is allocated to run consecutive stages,
PopART will merge consecutive stages into one on the same IPU.
Weights and activations will use the on-chip memory of the IPUs.
You need to place layers that share weights on the same IPU.

:py:class:`~poptorch.ShardedExecution` can be useful
for processing a single sample or for debugging.
Overall, it has low efficiency because only one IPU is used at a time.


Phased execution
^^^^^^^^^^^^^^^^

:py:class:`~poptorch.ParallelPhasedExecution` and
:py:class:`~poptorch.SerialPhasedExecution` have the following
features in common:

* A portion of the weights and activations are transferred to and from
  Streaming Memory, before and after each phase.
* If the desired weights and activations are already stored in an IPU
  of the same group of IPUs,
  intra-phase cross-IPU copies can replace the copies
  to and from Streaming Memory.
* This specific portion is needed by the layers of the model wrapped in
  :py:class:`~poptorch.BeginBlock` or :py:class:`~poptorch.Block` in the current
  :py:class:`~poptorch.Phase`.
* They both trade off some performance
  for larger models with higher memory needs.
* Any number of phases is allowed.
* The number of stages in each :py:class:`~poptorch.Phase`
  should match the number of IPUs in each group of IPUs.
* Stages inside each :py:class:`~poptorch.Phase` can run in parallel.

Although you only define the :py:class:`~poptorch.Phase` for forward passes,
the corresponding phases for backward passes are also created.
The order of phased execution for backward passes won't change
but you can decide whether a phase is shared by both
forward and backward passes. In other words, you decide whether to avoid
a memory transfer of a portion of the weights and activations.

Serial phased execution
"""""""""""""""""""""""

In :py:class:`~poptorch.SerialPhasedExecution`,
phases execute on a single group of IPUs sequentially.

.. literalinclude:: phased_execution.py
    :language: python
    :caption: How to use SerialPhasedExecution
    :linenos:
    :start-after: serial_start
    :end-before: serial_end

The code above causes all phases to run serially on IPUs 0 and 1
(A, B and C on IPU 0; A2, B2 and C2 on IPU 1).

Parallel phased execution
"""""""""""""""""""""""""

In :py:class:`~poptorch.ParallelPhasedExecution`,
phases are executed in parallel alternating between two groups of IPUs.
Even phases must run on even IPUs and odd phases on odd IPUs.
Inter-phase cross-IPU copies can replace the memory transfers to and from
the Streaming Memory, if the desired weights and activations are already
available in another group of IPUs.

.. literalinclude:: phased_execution.py
    :language: python
    :caption: How to use ParallelPhasedExecution
    :linenos:
    :start-after: parallel_start
    :end-before: parallel_end

In the code example above, there are three phases. Each phase has two stages
and each IPU group has two IPUs, so the number of stages in each phase matches
the number of IPUs in each group. Even phases 0 and 2 run on IPUs 0 and 2,
while odd phase 1 runs on IPUs 1 and 3. This allows for faster cross-IPU
copies, both inter-phase and intra-phase.

poptorch.Liveness
"""""""""""""""""

:py:class:`~poptorch.Liveness` controls the availability of tensors on IPU,
and is only needed for
:py:class:`~poptorch.ParallelPhasedExecution`
and :py:class:`~poptorch.SerialPhasedExecution`.

The default :py:class:`~poptorch.Liveness` is ``AlwaysLive``.
``OffChipAfterFwd``, ``OffChipAfterFwdNoOverlap`` and
``OffChipAfterEachPhase`` may be helpful if you run a large model
with a tight memory budget.

.. _grouping_tensor_weights:

Grouping tensor weights across replicas
---------------------------------------

PopTorch supports configuring weight tensors such that a different value of the
weight tensor is sent to each replica, or to groups of replicas. This
functionality can be used, for instance, to split a weight tensor and process
parts of it on different groups of replicas. This functionality is accessed
using the :py:func:`~replicaGrouping` method on the weight tensor in question.

.. literalinclude:: replica_grouped_weights.py
    :language: python
    :caption: How to use replica grouped weights
    :linenos:
    :start-after: groupedweights_start
    :end-before: groupedweights_end

In the code example above, eight replicas are used. The weight tensor ``W`` is
split four ways between orthogonal groups, each containing two replicas.
Orthogonal groups are organised perpendicularly to the replica ordering, so that
in this example replicas 0 and 4 would form the first group, 1 and 5 the
second, and so on. See :py:class:`~poptorch.CommGroupType` for other replica
group organisation options (also illustrated in :numref:`figCommGroupTypes`),
and :py:class:`~poptorch.VariableRetrievalMode` for options relating to how
many replicas will be involved in value retrieval.

.. figure:: comm-group-types.png
  :name: figCommGroupTypes
  :width: 100%

  Possible CommGroupTypes

Note that in this code example, the input tensor ``X`` is split two ways. This
is achieved using :py:meth:`~poptorch.Options.inputReplicaGrouping`.

.. _optimizers:

Optimizers
==========

PopTorch supports the following optimizers:

#. :py:class:`~poptorch.optim.SGD`
#. :py:class:`~poptorch.optim.Adam`
#. :py:class:`~poptorch.optim.AdamW`
#. :py:class:`~poptorch.optim.RMSprop`
#. :py:class:`~poptorch.optim.LAMB`

In addition, PopTorch has features to support ``float16`` models, such as loss scaling, velocity scaling, bias correction and accumulator types.

.. important:: All of these extra attributes (except ``velocity_scaling``) must have the same values for different ``param_groups`` and therefore you must set them at the optimizer level.

.. literalinclude:: api.py
    :language: python
    :caption: How to update values in an Optimizer
    :linenos:
    :start-after: optim_start
    :end-before: optim_end
    :emphasize-lines: 5-9

.. important:: You must call :py:func:`~poptorch.PoplarExecutor.setOptimizer` to apply the new optimizer values to the model.

.. warning:: PopTorch does not directly use the Python implementation of the optimizers. Built-in implementations are used in their place.
   This means that you cannot currently use custom optimizers. Subclassing a built-in optimizer will generate a warning. Any custom behaviour
   in a custom optimizer is unlikely to take effect, other than simply setting the existing attributes.

Loss scaling
------------

When training models which use ``half`` or ``float16`` values, you can use loss scaling to prevent the gradients from becoming too small and causing underflows.

Before calculating the gradients, PopTorch will scale the loss by the value of the ``loss_scaling`` parameter.
PopTorch will multiply the gradients by the inverse scale prior to updating the optimizer state.
Therefore, beyond improving numerical stability, neither the training nor the hyper-parameters are affected.

Higher ``loss_scaling`` values can improve numerical stability by minimising underflow.
However, too high a value can result in overflow.
The optimal loss scaling factor depends on the model.

You can either set the ``loss_scaling`` factors manually, or you can set :py:func:`~poptorch.options._TrainingOptions.setAutomaticLossScaling` in :py:class:`opts.Training <poptorch.options._TrainingOptions>`,
which will automatically set a global loss scaling factor. If you both set ``loss_scaling`` manually and enable automatic loss scaling, the manually
set factor(s) will be used initially and updated automatically during training.

.. warning:: Automatic loss scaling is a preview feature. It is well tested and enabled in some of our example applications, but may not behave as expected in all models. Recommendation: if your model with automatic loss scaling enabled does not converge or triggers a compilation error, then you will need to set the loss scale manually.

Velocity scaling (SGD combined variant only)
--------------------------------------------

The SGD optimizer, when used with momentum, updates weights based
on the velocity values.
The combined variant uses one tensor per parameter to store the
velocity and the changes to the velocity from accumulated gradients.
Unlike the separate variant, therefore, each gradient accumulation step involves
adding or subtracting values of a different magnitude to the gradients (for
which loss scaling is used). You can therefore use the ``velocity_scaling`` parameter to scale the combined velocity tensor to improve numerical precision when using ``half``/``float16`` values.
(Note that the gradients are, in effect, scaled by ``velocity_scaling/loss_scaling`` so the ``loss_scaling`` has no impact on the effective scaling of velocity parameters.)

As with loss scaling, higher values can minimise underflow of the velocity values but may result in overflow.

Accumulation types
------------------

In order to improve numerical stability some of the optimizers (LAMB, Adam, AdamW, RMSprop) give you the option
to tweak the data type used by the optimizer's accumulators.

``accum_type`` lets you choose the type used for gradient accumulation.
``first_order_momentum_accum_type`` / ``second_order_momentum_accum_type`` give you control over the type used to store the first-order and second-order momentum optimizer states.

Constant attributes
-------------------

In order to improve performance and / or save memory PopTorch will try to embed directly in the program the attributes which are constant.

.. important:: Trying to modify a constant attribute after the model has been compiled will result in an error.

For PopTorch optimizers (those from the ``poptorch.optim`` namespace) by default the attributes explicitly passed to the optimizer's constructor will be considered variables and the others will be considered as constant.

You can override this behaviour using :py:func:`~poptorch.optim.VariableAttributes.markAsConstant` and :py:func:`~poptorch.optim.VariableAttributes.markAsVariable` before compiling the model.

.. literalinclude:: api.py
    :language: python
    :caption: Constant and variable attributes for PopTorch optimizers
    :linenos:
    :start-after: optim_const_start
    :end-before: optim_const_end

For native optimizers (those from the `torch.optim <https://pytorch.org/docs/1.10.0/optim.html>`__ namespace) the attributes which are left to their default value in the constructor will be considered to be constant.

There is no method to override this behaviour which is why we recommend you always use the ``poptorch.optim`` optimizers instead.

.. literalinclude:: api.py
    :language: python
    :caption: Constant and variable attributes for Torch optimizers
    :linenos:
    :start-after: torch_optim_const_start
    :end-before: torch_optim_const_end

.. note:: There is an exception: ``lr`` is always marked as variable.

Reading and writing optimizer state
-----------------------------------

When you use a ``poptorch.optim`` optimizer with a :py:func:`~poptorch.trainingModel`, you can use the optimizer's ``state_dict()`` and ``load_state_dict()`` functions to read/write optimizer state to/from the IPU.
This can be used to restart training from a checkpoint saved previously.

.. literalinclude:: api.py
  :caption: Reading and writing optimizer state
  :start-after: optim_state_dict_start
  :end-before: optim_state_dict_end
  :emphasize-lines: 6,11

.. note:: The structure of the state dictionary, as well as the keys within, will differ from those in PyTorch. As such, you cannot load a state dictionary with PopTorch that was obtained by running native PyTorch.

PopTorch ops
============

This section describes some "helper" operations you can use within a model.

poptorch.ctc_beam_search_decoder
--------------------------------

This function adds a Connectionist Temporal Classification (CTC) beam search
decoder operator to the model.

.. literalinclude:: api.py
    :language: python
    :linenos:
    :start-after: ctc_beam_search_start
    :end-before: ctc_beam_search_end
    :emphasize-lines: 3

For more information see: :py:func:`~poptorch.ctc_beam_search_decoder`.

poptorch.ipu_print_tensor
-------------------------

This function adds an op to print the content of a tensor on the IPU.

.. note::
   To prevent the print operation being optimised out by the graph
   optimiser, you must use the return value of ``ipu_print_tensor()``.

.. literalinclude:: api.py
    :language: python
    :linenos:
    :start-after: print_tensor_start
    :end-before: print_tensor_end
    :emphasize-lines: 10

For more information see: :py:func:`~poptorch.ipu_print_tensor`.

poptorch.identity_loss
----------------------

You can use this function to implement custom losses. It takes a single PyTorch tensor
and will backpropagate a gradient of ones through it.

.. literalinclude:: api.py
  :language: python
  :linenos:
  :start-after: identity_start
  :end-before: identity_end
  :emphasize-lines: 5
  :caption: Example of custom loss.

For more information see: :py:func:`~poptorch.identity_loss`.

poptorch.MultiConv
------------------

Use the :py:class:`~poptorch.MultiConv` wrapper class to define multi-convolutions.

Refer to the `PopLibs documentation for multi-convolutions <https://docs.graphcore.ai/projects/poplar-api/en/latest/poplibs_api.html>`__ for further information.

For more information see: :py:class:`~poptorch.MultiConv` and :py:class:`~poptorch.MultiConvPlanType`.

poptorch.nop
------------

PopTorch includes a "no-op" function for debugging purposes.

For more information see: :py:func:`~poptorch.nop`.

poptorch.dynamic_slice
----------------------

Standard PyTorch slicing syntax cannot currently be used to create dynamic slices.
This function supports dynamic slicing on the IPU.

For more information see: :py:func:`~poptorch.dynamic_slice`.

poptorch.dynamic_update
-----------------------

Standard PyTorch slicing syntax cannot currently be used to dynamically update a slice
of a tensor. ``poptorch.dynamic_update`` allows updating a tensor with a statically sized
slice at a dynamic index.
This function supports dynamic updates on the IPU.

For more information see: :py:func:`~poptorch.dynamic_update`.

poptorch.serializedMatMul
-------------------------

Use this function to create a serialized matrix multiplication, which splits
a larger matrix multiplication into smaller matrix multiplications to reduce
memory requirements.

For more information see: :py:func:`~poptorch.serializedMatMul`.


poptorch.set_available_memory
-----------------------------

Use this function to override the default proportion of tile memory available as
temporary memory for use by operations such as a convolution or matrix
multiplication.  The operators that can be tuned with this setting include:

* convolution
* matrix multiplication
* embedding lookup
* indexing operations

For more information see:

* :py:func:`~poptorch.set_available_memory`
* `technical note <https://docs.graphcore.ai/projects/available-memory/en/latest/>`_ on optimising temporary memory usage

Miscellaneous functions
-----------------------

The following PopTorch functions, not related to model creation, are available:

- :py:func:`~poptorch.ipuHardwareIsAvailable`
- :py:func:`~poptorch.ipuHardwareVersion`
- :py:func:`~poptorch.setLogLevel`


16-bit float support
====================

PopTorch supports the half-precision floating point (``float16``) format.
You can simply input ``float16`` tensors into your model.
(You can convert a tensor to ``float16`` using ``tensor = tensor.half()``)

You can use your models in one of the following ways:

#. Convert all parameters (weights) to ``float16`` by using a ``Module``'s ``.half()`` method. This is the most memory-efficient option; however, small updates to weights may be lost, hindering training.
#. Keep the parameters (weights) as ``float32``, in which case the parameter updates will occur using ``float32``. However, the parameters will be converted to ``float16`` if you call an operation with a ``float16`` input. This is more memory efficient than using ``float32`` tensors (inputs) but less memory efficient than using ``float16`` weights.
#. Use a mix of ``float32`` and ``float16`` parameters by manually specifying parameters as ``float16`` or ``float32``.

.. note::  When PyTorch encounters a mix of ``float16`` and ``float32`` inputs for a given operation, it will usually cast all inputs to ``float32``,
    and PopTorch complies with this convention.

.. literalinclude:: inferenceModel.py
    :language: python
    :caption: How to run a model using half precision
    :linenos:
    :start-after: inference_half_start
    :end-before: inference_half_end
    :emphasize-lines: 1, 2


PyTorch buffers
===============

PopTorch supports PyTorch buffers in some circumstances.
You can use buffers to make tensors persistent,
that is to allow tensors to keep their values from the previous run on each new run,
without making them model parameters.
However, you must make sure that you only make in-place modifications to the
buffer using PyTorch in-place operations (such as ``+=`` or those ending in ``_``).
For example, you can use ``torch.Tensor.copy_`` to copy the contents of another
tensor to the buffer.

Unlike when running on the CPU, the following PyTorch code does not increment
``model.i`` each time, when running on the IPU:

.. literalinclude:: buffers.py
    :language: python
    :caption: The wrong way to have a persistent tensor
    :linenos:
    :start-after: counter_model_wrong_start
    :end-before: counter_model_wrong_end

This is because the PyTorch dispatcher will capture the value for ``model.i`` when
building the graph and freeze the value as a constant.

You can keep the value of a tensor between runs by registering it as a buffer
in PyTorch, as the following example shows:

.. literalinclude:: buffers.py
    :language: python
    :caption: An example showing a tensor which is incremented on each iteration by registering it as a buffer.
    :linenos:
    :start-after: counter_model_correct_start
    :end-before: counter_model_correct_end

.. note:: When running an inference model
  (with :py:func:`~poptorch.inferenceModel`), any buffers which your model
  modifies will not be implicitly copied to the host. You will need to call
  :py:func:`~poptorch.PoplarExecutor.copyWeightsToHost` before reading the value
  of a buffer which has been changed as a result of a model call.

.. note:: PopTorch does not support broadcasting of buffers between replicas.
  You can make each replica use its own buffer by setting the PopTorch option
  :py:func:`~poptorch.Options.broadcastBuffers` to False:
  ``poptorch.Options().broadcastBuffers(False)``

  You need to ensure that your model still works with each replica using a
  separate buffer.


.. _creating_custom_ops:

Creating custom ops
===================

If you need to implement functionality that is not directly
supported in PopTorch, you can create a custom op.

There are two steps to creating a custom op in PopTorch:

#. Implement the op in C++ using the PopART API
#. Make the op available in PopTorch so you can use it in your PyTorch model


Implementing the custom op
--------------------------

You will need to implement the new op as C++ code by creating subclasses of, at
least, the Op and Opx base classes provided by the PopART API.

If you are going to use the custom op for training, then you will also need to
define the classes that implement the gradient operation. For details of how to
do this, see the `Custom operators
<https://docs.graphcore.ai/projects/popart-user-guide/en/latest/custom_ops.html>`__
chapter of the PopART User Guide.

You can find some examples of custom ops in the :tutorials-repo:`Graphcore GitHub examples repository <feature_examples/pytorch/custom_op/>`.

Compiling the PopART custom op will create a dynamic library file, which you can
use with your PyTorch code.

Make the op available to PyTorch
--------------------------------

After you have compiled the C++ implementation of the custom op, you can load
the library file, and call the op from your PyTorch program, using the
:py:class:`~poptorch.custom_op` class.

First, load the dynamic library as shown in :numref:`loading_library_code`.

.. literalinclude:: ../../tests/custom_ops_test.py
    :language: python
    :caption: Loading the library for the custom op
    :linenos:
    :start-after: loading_library_start
    :end-before: loading_library_end
    :name: loading_library_code

You can now call your custom op using the PopTorch class
:py:class:`~poptorch.custom_op`.

Both the forward op and backward op are implemented in the PopART code.
However, in this inference model example, only the forward op is called:

.. literalinclude:: ../../tests/custom_ops_test.py
    :language: python
    :caption: Calling a custom op in a PopTorch inference model
    :linenos:
    :emphasize-lines: 4-8
    :start-after: inference_start
    :end-before: inference_end

In this example ``[x, x]`` is assigned to ``example_outputs``, where ``x``
is one of the input tensors which is used as a template for the output tensors.
The custom op code will need to create the tensors that it returns.

You can also call this custom op inside a training model using
:py:class:`~poptorch.custom_op` and the backward op will be called automatically.

The Graphcore examples repository contains a feature example demonstrating
how to load and use a custom op in a PopTorch model: :tutorials-repo:`PopTorch example: Custom op <feature_examples/pytorch/custom_op>`.

Passing attributes to the custom op
-----------------------------------

You can pass attributes to the custom op using a Python dictionary, as shown in
:numref:`inference_with_attribute_code`.

.. literalinclude:: ../../tests/custom_ops_test.py
    :language: python
    :caption: Passing an attribute to a custom op from PopTorch
    :linenos:
    :emphasize-lines: 8
    :start-after: inference_with_attribute_start
    :end-before: inference_with_attribute_end
    :name: inference_with_attribute_code

You can then access these attributes within the C++ custom op code. The above
example passes a ``Float`` attribute with the name ``alpha`` to the LeakyRELU
implementation. See the `Custom operators
<https://docs.graphcore.ai/projects/popart-user-guide/en/latest/custom_ops.html>`__
chapter of the PopART User Guide for more information.

:numref:`popart-attribute-types` and the code example in
:numref:`many_attribtes_examples_code` show how to pass other attribute types
to a custom op. PopTorch supports all attributes supported in PopART except for
``Graph``.

.. list-table:: Python types to use to pass attributes to PopART
   :widths: 35 65
   :header-rows: 1
   :align: center
   :name: popart-attribute-types

   * - PopART attribute type
     - Python equivalent
   * - ``Float``
     - Python float (converted to ``float32``)
   * - ``Floats``
     - List or tuple of Python float
   * - ``Int``
     - Python int (converted to 64-bit signed int)
   * - ``Ints``
     - List or tuple of Python int
   * - ``String``
     - Python str (converted to ASCII)
   * - ``Strings``
     - List or tuple of Python str
   * - ``Graph``
     - Not supported

.. literalinclude:: ../../tests/custom_ops_attributes_test.py
    :language: python
    :caption: Passing different attribute types from PopTorch
    :linenos:
    :start-after: many_attribtes_examples_start
    :end-before: many_attribtes_examples_end
    :name: many_attribtes_examples_code


Precompilation and caching
==========================

.. _caching:

Caching
-------

By default PopTorch will re-compile the model every time you instantiate a model.
However if you often run the same models you might want to enable executable caching to save time.

You can do this either by setting the ``POPTORCH_CACHE_DIR`` environment variable or by calling :py:meth:`~poptorch.Options.enableExecutableCaching`.

.. warning:: The cache directory might grow large quickly because PopTorch doesn't delete old models from the cache and, depending on the number and size of your models and the number of IPUs used, the executables might be quite large. It is your responsibility to delete the unwanted cache files.

Precompilation
--------------

PopTorch supports precompilation: This means you can compile your model on a machine which doesn't have an IPU
and export the executable to a file. You can then reload and execute it on a different machine which does have an IPU.

.. important:: The PopTorch versions on both machines must be an exact match.

To precompile your model you need to wrap it using either :py:func:`~poptorch.trainingModel` or :py:func:`~poptorch.inferenceModel` then call :py:meth:`~poptorch.PoplarExecutor.compileAndExport` on the wrapper.

.. literalinclude:: precompilation.py
    :language: python
    :caption: How to precompile a model using an offline IPU target.
    :linenos:
    :start-after: precomp_start
    :end-before: precomp_end
    :emphasize-lines: 22-23,32

.. note:: If you don't know the IPU version on your system you can use :py:func:`~poptorch.ipuHardwareVersion`.

The exported file by default will contain your original PyTorch model (including the weights), and enough information to re-create the PopTorch wrapper and reload the executable.

.. important:: For your model and weights to be exported, your model must be picklable. See https://docs.python.org/3/library/pickle.html for more information.
  If your model is not picklable then use ``export_model=False``, as shown in :numref:`export_no_python`.

Now the torch model, PopTorch wrapper and executable can all be restored on the target machine using :py:func:`~poptorch.load`:

.. literalinclude:: precompilation.py
    :language: python
    :caption: How to load a precompiled model
    :linenos:
    :start-after: load_start
    :end-before: load_end
    :emphasize-lines: 1

In some cases you might want to provide some runtime information to select the device: you can do this
using the ``edit_opts_fn`` argument of :py:func:`~poptorch.load`:

.. literalinclude:: precompilation.py
    :language: python
    :caption: How to load a precompiled model and run on a specific IPU
    :linenos:
    :start-after: load_setIpu_start
    :end-before: load_setIpu_end
    :emphasize-lines: 1-2,5

.. note:: When loading a precompiled model, only run-time options will be applied; all others will be ignored.

Going back to the precompilation step: in some cases you might want to export only the
executable and not the python wrapper or torch model (for example if your model cannot be pickled).

.. literalinclude:: precompilation.py
    :language: python
    :caption: How to export only the executable
    :linenos:
    :start-after: precomp_no_python_start
    :end-before: precomp_no_python_end
    :name: export_no_python

It means you will need to re-create and wrap the model yourself before loading the executable:

.. literalinclude:: precompilation.py
    :language: python
    :caption: How to load a precompiled executable
    :linenos:
    :start-after: load_exe_start
    :end-before: load_exe_end
    :emphasize-lines: 1,6-7

.. important:: Exported models lose their connections to other models.

  For example, if you have a :py:func:`~poptorch.trainingModel` and a :py:func:`~poptorch.inferenceModel` based
  on the same PyTorch model, you wouldn't usually need to keep the weights synchronised between the two;
  PopTorch would take care of it for you, implicitly.

In the following example, PopTorch automatically copies the weights from the training model to the inference model:

.. literalinclude:: precompilation.py
    :language: python
    :caption: PopTorch implicit copies
    :linenos:
    :start-after: implicit_cp_start
    :end-before: implicit_cp_end
    :emphasize-lines: 16,18-20

If you were to export these models:

.. literalinclude:: precompilation.py
    :language: python
    :caption: Precompilation of both a training and a validation model
    :linenos:
    :start-after: precomp_train_val_start
    :end-before: precomp_train_val_end
    :emphasize-lines: 11-12,14

.. note:: Don't forget to call ``model.eval()`` or ``model.train()``, as required, before calling :py:func:`~poptorch.PoplarExecutor.compileAndExport`.

You could then insert explicit copy operations:

.. literalinclude:: precompilation.py
    :language: python
    :caption: Loading precompiled training and validation models with explicit weight copies
    :linenos:
    :start-after: load_train_val_start
    :end-before: load_train_val_end
    :emphasize-lines: 9,10

Or you would need to re-connect the two models by creating the second one from the first one
and then loading the executable:

.. literalinclude:: precompilation.py
    :language: python
    :caption: Loading precompiled training and validation models and re-connecting them
    :linenos:
    :start-after: load_train_val_connected_start
    :end-before: load_train_val_connected_end
    :emphasize-lines: 2-6


Environment variables
=====================

Logging level
-------------
PopTorch uses the following levels of logging:

  * ``OFF``: No logging
  * ``ERR``: Errors only
  * ``WARN``: Warnings and errors only
  * ``INFO``: Info, warnings and errors (default)
  * ``DEBUG``: Adds some extra debugging information
  * ``TRACE`` and ``TRACE_ALL``: Trace everything inside PopTorch

You can use the ``POPTORCH_LOG_LEVEL`` environment variable to set the logging level:

.. code-block:: bash

  export POPTORCH_LOG_LEVEL=DEBUG

.. _profiling_env:

Profiling
---------

When running programs using PopTorch, you can enable profiling by using the ``POPLAR_ENGINE_OPTIONS`` environment variable used by Poplar.

In order to capture the reports needed for the PopVision Graph Analyser you only need to set ``POPLAR_ENGINE_OPTIONS='{"autoReport.all":"true"}'``:

.. code-block:: bash

  export POPLAR_ENGINE_OPTIONS='{"autoReport.all":"true"}'

By default, report files are output to the current working directory. You can specify a different output directory by setting ``autoReport.directory``, for example:

.. code-block:: bash

  export POPLAR_ENGINE_OPTIONS='{"autoReport.all":"true", "autoReport.directory":"./tommyFlowers"}'

For more options, refer to the `PopVision Graph Analyser User Guide <https://docs.graphcore.ai/projects/graph-analyser-userguide/en/latest/index.html>`__.

In order to capture the ``pvti`` reports needed for the `PopVision System Analyser <https://docs.graphcore.ai/projects/system-analyser-userguide/en/latest/index.html>`__
you need to enable the `PopVision Trace Instrumentation library (PVTI) <https://docs.graphcore.ai/projects/libpvti/en/latest/index.html>`__.
To do so, set ``PVTI_OPTIONS='{"enable":"true"}'``.

.. important:: By default, PopVision will display multiple trace files using relative time. This is because most of the time we want to compare two executions of the same model, for example. However, in this case we want the traces to be aligned on absolute time: this can be done by selecting "Absolute Timing" in the PopVision options.

You can also add extra tracepoints in your own code by using :py:class:`~poptorch.profiling.Channel`.

IPU Model
---------

By default PopTorch will try to attach to a physical IPU.
If instead you want to use the IPU Model, you can do so by setting ``POPTORCH_IPU_MODEL`` to 1:

.. code-block:: bash

  export POPTORCH_IPU_MODEL=1

See the `Poplar and PopLibs User Guide <https://docs.graphcore.ai/projects/poplar-user-guide>`__ for the limitations of the IPU Model.

Wait for an IPU to become available
-----------------------------------

By default, attempting to attach to an IPU when all IPUs are
already in use will raise an exception.
If you would rather wait for an IPU to become available, you can do so by setting ``POPTORCH_WAIT_FOR_IPU`` to 1.

.. code-block:: bash

  export POPTORCH_WAIT_FOR_IPU=1

Enable executable caching
-------------------------

You can enable executable caching either by setting the ``POPTORCH_CACHE_DIR`` environment variable or by calling :py:meth:`~poptorch.Options.enableExecutableCaching`.

.. code-block:: bash

  export POPTORCH_CACHE_DIR=/tmp/poptorch_cache

For more information, see :numref:`caching`.


================================================
FILE: docs/user_guide/phased_execution.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import torch
import torch.nn.functional as F
import poptorch

# pylint: disable=function-redefined, too-many-function-args
# annotations_start
poptorch.setLogLevel("DEBUG")  # Force debug logging
N = 3  # Scale factor: N * 6 weights, N * 3 loop steps, N * 2 phases
size = 10  # Side length of each square weight matrix


class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.weights = torch.nn.ParameterList([
            torch.nn.Parameter(torch.rand(size, size), requires_grad=True)
            for n in range(N * 6)  # One weight per matmul: N * 3 steps x 2
        ])

    def forward(self, in0, target=None):
        phase = 0
        weight = iter(self.weights)  # Consumed one per matmul below
        with poptorch.Block("phase0_ipu0"):
            ins = torch.split(in0, size)  # `size`-row chunks along dim 0
        for n in range(N * 3):
            out = []
            for ipu in range(2):
                x = ins[ipu]
                with poptorch.Block(f"phase{phase}_ipu{ipu}"):
                    x = torch.matmul(next(weight), x)
                    out.append(F.relu(x))
            ins = out[1], out[0]  # Swap: each side reads the other's output
            # We want 2 matmuls in the same phase: hold it when n % 3 == 1
            if n % 3 != 1:
                phase += 1
        with poptorch.Block(f"phase{N*2-1}_ipu1"):
            res = ins[0] + ins[1]
            if target is None:
                return res  # Inference: output only
            return res, torch.nn.L1Loss(reduction="mean")(res, target)


input = torch.rand(size * 2, 1)  # 2 * size rows: forward() splits it in two
target = torch.rand(size, 1)
model = Model()
opts = poptorch.Options()
phases = []  # One [stage, stage] pair per phase, in phase order
# Alternate between IPUs 0-2 (even phases) and IPUs 1-3 (odd phases)
for n in range(N):
    phases.append([
        poptorch.Stage(f"phase{2*n}_ipu0").ipu(0),
        poptorch.Stage(f"phase{2*n}_ipu1").ipu(2)
    ])
    phases.append([
        poptorch.Stage(f"phase{2*n+1}_ipu0").ipu(1),
        poptorch.Stage(f"phase{2*n+1}_ipu1").ipu(3)
    ])
opts.setExecutionStrategy(poptorch.ParallelPhasedExecution(*phases))
poptorch_model = poptorch.trainingModel(model, opts)
poptorch_model.compile(input, target)  # Compile only; no step is run here

# annotations_end


# stage_start
class Model(torch.nn.Module):
    def forward(self, x, y):
        with poptorch.Block("A"):  # Ids "A"/"B"/"C" are referenced by Stage()
            c = x + x
        with poptorch.Block("B"):
            d = y + y
        with poptorch.Block("C"):
            e = x * 3

        return c, d, e  # One output per block


first = poptorch.Phase(poptorch.Stage("A").ipu(0))
# Option 1: blocks "B" and "C" regrouped in a single stage on IPU 1
second = poptorch.Phase(poptorch.Stage("B", "C").ipu(1))
# Option 2: 2 separate stages on IPUs 1 and 3 (replaces option 1 above)
second = poptorch.Phase(poptorch.Stage("B").ipu(1), poptorch.Stage("C").ipu(3))
# stage_end

# Compile the Stage example with the phases currently bound to `first` and
# `second` (the last assignment to `second` wins).
opts = poptorch.Options()
opts.autoRoundNumIPUs(True)
strategy = poptorch.ParallelPhasedExecution(first, second)
opts.setExecutionStrategy(strategy)
m = poptorch.inferenceModel(Model(), opts)
m.compile(input, input)
# Tear the executor down before the next example is compiled.
m.destroy()


class Model(torch.nn.Module):
    """Toy two-input model split into six named blocks.

    Blocks "A", "B" and "C" form one chain of additions and "A2", "B2" and
    "C2" a second one; the execution strategy defined after this class
    refers to the blocks through these ids.
    """

    def forward(self, x, y):
        """Return the outputs of blocks "C" and "C2" as a tuple."""
        # First pair of blocks: double each input.
        with poptorch.Block("A"):
            x_doubled = x + x
        with poptorch.Block("A2"):
            y_doubled = y + y

        # Second pair: combine the results of the first pair.
        with poptorch.Block("B"):
            mixed = x_doubled + y_doubled
        with poptorch.Block("B2"):
            y_tripled = y + y_doubled

        # Third pair: produce the two outputs.
        with poptorch.Block("C"):
            first_out = mixed + y_tripled
        with poptorch.Block("C2"):
            second_out = y_tripled + y
        return first_out, second_out


opts = poptorch.Options()
# serial_start
# Each phase pairs one block of the first chain with one of the second.
phase_a = poptorch.Phase(poptorch.Stage("A"), poptorch.Stage("A2"))
phase_b = poptorch.Phase(poptorch.Stage("B"), poptorch.Stage("B2"))
phase_c = poptorch.Phase(poptorch.Stage("C"), poptorch.Stage("C2"))
strategy = poptorch.SerialPhasedExecution(phase_a, phase_b, phase_c)

# All three phases are given the same pair of IPUs: 0 and 1.
for phase_index in range(3):
    strategy.phase(phase_index).ipus(0, 1)

opts.setExecutionStrategy(strategy)
# serial_end

# Compile the example, then tear the executor down before the next one.
m = poptorch.inferenceModel(Model(), opts)
m.compile(input, input)
m.destroy()


class Model(torch.nn.Module):
    """Toy model with six unnamed blocks.

    ``poptorch.Block.useAutoId()`` is called before any block is opened, so
    none of the blocks is given an explicit name.
    """

    def forward(self, x, y):
        poptorch.Block.useAutoId()
        with poptorch.Block():
            first_out = x + x
        with poptorch.Block():
            second_out = y + y

        with poptorch.Block():
            third_out = first_out + second_out
        with poptorch.Block():
            fourth_out = y + second_out

        with poptorch.Block():
            fifth_out = third_out + fourth_out
        with poptorch.Block():
            sixth_out = fourth_out + y
        return fifth_out, sixth_out


# Fresh options for the parallel phased execution example.
opts = poptorch.Options()
# parallel_start
strategy = poptorch.ParallelPhasedExecution(
    poptorch.Phase(poptorch.Stage("0"), poptorch.Stage("1")),
    poptorch.Phase(poptorch.Stage("2"), poptorch.Stage("3")),
    poptorch.Phase(poptorch.Stage("4"), poptorch.Stage("5")))

strategy.phase(0).ipus(0, 2)
strategy.phase(1).ipus(1, 3)
strategy.phase(2).ipus(0, 2)

opts.setExecutionStrategy(strategy)
# parallel_end
# Build an inference model with this strategy, compile it and then destroy it.
m = poptorch.inferenceModel(Model(), opts)
m.compile(input, input)
m.destroy()


================================================
FILE: docs/user_guide/pipeline_simple.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
# annotations_start
import transformers
import torch
import poptorch

# A bert model from hugging face. See the packaged BERT example for actual usage.
pretrained_weights = 'mrm8488/bert-medium-finetuned-squadv2'


# For later versions of transformers, we need to wrap the model and set
# return_dict to False
class WrappedModel(torch.nn.Module):
    """Wrap the Hugging Face BERT question-answering model.

    The wrapper forces ``return_dict=False`` on every forward call and
    forwards any attribute it does not have itself to the wrapped model.
    """

    def __init__(self):
        super().__init__()
        self.wrapped = transformers.BertForQuestionAnswering.from_pretrained(
            pretrained_weights)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Run the wrapped model with ``return_dict=False``."""
        return self.wrapped.forward(input_ids,
                                    attention_mask,
                                    token_type_ids,
                                    return_dict=False)

    def __getattr__(self, attr):
        """Look up ``attr`` on this module first, then on the wrapped model.

        Raises:
            AttributeError: if neither this module nor the wrapped model has
                the attribute, or if ``wrapped`` itself is not set yet.
        """
        try:
            return torch.nn.Module.__getattr__(self, attr)
        except AttributeError:
            # If "wrapped" itself is missing (for example before __init__ has
            # run), evaluating self.wrapped below would re-enter this method
            # forever; re-raise instead of recursing.
            if attr == "wrapped":
                raise
            return getattr(self.wrapped, attr)


model = WrappedModel()

# A handy way of seeing the names of all the layers in the network.
print(model)

# All layers before "model.bert.encoder.layer[0]" will be on IPU 0 and all layers from
# "model.bert.encoder.layer[0]" onwards (inclusive) will be on IPU 1.
model.bert.encoder.layer[0] = poptorch.BeginBlock(model.bert.encoder.layer[0],
                                                  ipu_id=1)

# Now encoder layers 0 and 1 are on IPU 1 and layer 2 onward is on IPU 2.
model.bert.encoder.layer[2] = poptorch.BeginBlock(model.bert.encoder.layer[2],
                                                  ipu_id=2)

# Finally, all layers from layer 4 until the end of the network are on IPU 3.
model.bert.encoder.layer[4] = poptorch.BeginBlock(model.bert.encoder.layer[4],
                                                  ipu_id=3)

# We must batch the data by at least the number of IPUs. Each IPU will still execute
# whatever the model batch size is.
data_batch_size = 4

# Create a poptorch.Options instance to override default options
opts = poptorch.Options()
opts.deviceIterations(data_batch_size)
# annotations_end

# Model is now passed to the wrapper as usual.
inference_model = poptorch.inferenceModel(model, opts)

tokenizer = transformers.BertTokenizer.from_pretrained(
    "mrm8488/bert-medium-finetuned-squadv2", return_token_type_ids=True)

# Make use of the model
contexts = [
    """Edinburgh is Scotland's compact, hilly capital.""",
    """The oldest cat recorded was Cream Puff at 38 years.""",
    """The largest litter of kittens produced 19 kittens.""",
    """The first webcam was used to check the status of a coffee pot."""
]
questions = [
    "What is the capital of Scotland?", "How old was the oldest cat ever?",
    "How many kittens in the largest litter?",
    "What was the first webcam used for?"
]

# Tokenize each question together with its context, with padding enabled.
encoding = tokenizer(questions, contexts, padding=True)

input_ids = encoding["input_ids"]

# One call processes the four question/context pairs.
start_scores, end_scores = inference_model(
    torch.tensor(encoding["input_ids"]),
    torch.tensor(encoding["attention_mask"]),
    torch.tensor(encoding["token_type_ids"]))

# For each pair, the answer is the span of input tokens running from the
# highest-scoring start position to the highest-scoring end position
# (inclusive), converted back to a string.
answer_string = []
for batch_id in range(len(contexts)):
    ans_tokens = input_ids[batch_id][torch.argmax(start_scores[batch_id]):torch
                                     .argmax(end_scores[batch_id]) + 1]

    answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens)
    answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)
    answer_string.append(answer_tokens_to_string)

# Check the decoded answers against the expected ones.
print(answer_string)
assert answer_string[0] == 'edinburgh'
assert answer_string[1] == '38 years'
assert answer_string[2] == '19'
assert answer_string[3] == 'to check the status of a coffee pot'


# annotations_inline_start
class Network(torch.nn.Module):
    """Small four-layer network placed on IPUs 0 to 3 with ``poptorch.Block``."""

    def __init__(self):
        super().__init__()
        self.layer1 = torch.nn.Linear(5, 10)
        self.layer2 = torch.nn.Linear(10, 5)
        self.layer3 = torch.nn.Linear(5, 5)
        self.layer4 = torch.nn.Linear(5, 5)

        self.act = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x):
        # The blocks below are created without names.
        poptorch.Block.useAutoId()

        # Explicit layers on a certain IPU
        with poptorch.Block(ipu_id=0):
            hidden = self.layer1(x)
            hidden = self.act(hidden)

        with poptorch.Block(ipu_id=1):
            hidden = self.layer2(hidden)
            hidden = self.act(hidden)

        with poptorch.Block(ipu_id=2):
            hidden = self.act(self.layer3(hidden))
            hidden = self.act(self.layer4(hidden))

        with poptorch.Block(ipu_id=3):
            probabilities = self.softmax(hidden)
        return probabilities


# Run the network for inference with 4 device iterations, feeding it a
# random input of 4 rows of 5 features.
model = Network()
opts = poptorch.Options()
opts.deviceIterations(4)
poptorch_model = poptorch.inferenceModel(model, options=opts)
print(poptorch_model(torch.rand((4, 5))))

# annotations_inline_end


# pylint: disable=function-redefined
# annotations_decorator_start
class Network(torch.nn.Module):
    """Small four-layer network whose IPU placement is declared by decorating
    its helper methods with ``poptorch.BlockFunction``."""

    def __init__(self):
        super().__init__()
        self.layer1 = torch.nn.Linear(5, 10)
        self.layer2 = torch.nn.Linear(10, 5)
        self.layer3 = torch.nn.Linear(5, 5)
        self.layer4 = torch.nn.Linear(5, 5)

        self.act = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x):
        poptorch.Block.useAutoId()
        hidden = self.block_one(x)
        hidden = self.block_two(hidden)
        return self.final_activation(hidden)

    @poptorch.BlockFunction(ipu_id=0)
    def block_one(self, x):
        """First two linear layers, each followed by the activation."""
        hidden = self.act(self.layer1(x))
        return self.act(self.layer2(hidden))

    @poptorch.BlockFunction(ipu_id=1)
    def block_two(self, x):
        """Last two linear layers, each followed by the activation."""
        hidden = self.act(self.layer3(x))
        return self.act(self.layer4(hidden))

    @poptorch.BlockFunction(ipu_id=1)
    def final_activation(self, x):
        """Softmax applied to the output of the last layer."""
        return self.softmax(x)


# Run the network for inference with 4 device iterations, feeding it a
# random input of 4 rows of 5 features.
model = Network()
opts = poptorch.Options()
opts.deviceIterations(4)
poptorch_model = poptorch.inferenceModel(model, options=opts)
print(poptorch_model(torch.rand((4, 5))))
# annotations_decorator_end


================================================
FILE: docs/user_guide/poptorch.conf
================================================
deviceIterations(1)
setExecutionStrategy(poptorch.ShardedExecution())
replicationFactor(1)
enableSyntheticData(True)


================================================
FILE: docs/user_guide/poptorch_training_simple.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import torch
import poptorch


class ExampleModel(torch.nn.Module):
    """Two-output model with a single trainable scalar ``bias``.

    For an input ``x`` the outputs are ``100 * LeakyReLU(bias - x)`` and
    ``100 * LeakyReLU(x - bias)``, concatenated along the last dimension.
    """

    def __init__(self):
        super().__init__()
        self.bias = torch.nn.Parameter(torch.zeros(()))

    def forward(self, x):
        activation = torch.nn.LeakyReLU()
        below_bias = 100 * activation(-x + self.bias)
        above_bias = 100 * activation(x - self.bias)
        return torch.cat([below_bias, above_bias], dim=-1)


# model_with_loss_start
class ExampleModelWithLoss(torch.nn.Module):
    """Pair ``ExampleModel`` with its loss.

    ``forward`` returns a tuple of two elements: the outputs of the model
    (as softmax probabilities) and the loss.
    """

    def __init__(self):
        super().__init__()
        self.model = ExampleModel()

    def forward(self, input, target):
        """Return ``(probabilities, loss)`` for ``input`` and ``target``.

        The loss is the mean cross-entropy computed on the raw model output,
        not on the softmax probabilities.
        """
        out = self.model(input)

        # The class scores are concatenated along the last dimension, so the
        # softmax is taken over it. Passing ``dim`` explicitly avoids the
        # deprecated implicit-dimension choice and its warning.
        return (torch.nn.functional.softmax(out, dim=-1),
                torch.nn.CrossEntropyLoss(reduction="mean")(out, target))


# model_with_loss_end


class ExampleDataset(torch.utils.data.Dataset):
    """Reproducible random dataset of ``(data, label)`` pairs.

    The generator is seeded with 0 on construction. Each label is 0 or 1 and
    the matching data tensor, of the requested shape, is
    ``(uniform_random + label) * 0.5``.
    """

    def __init__(self, shape, length):
        super().__init__()
        self._shape = shape
        self._length = length

        torch.manual_seed(0)
        samples = []
        for _ in range(length):
            # Draw the label first, then the data, to keep the random
            # sequence stable.
            label = int(torch.rand(()) > 0.5)
            data = (torch.rand(shape) + label) * 0.5
            samples.append((data, label))

        self._all_data = [data for data, _ in samples]
        self._all_labels = [label for _, label in samples]

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        data = self._all_data[index]
        label = self._all_labels[index]
        return data, label


def run_examples():
    """Train ``ExampleModelWithLoss`` with PopTorch, then with plain PyTorch.

    Both runs use the same ``ExampleDataset`` settings and AdamW learning
    rates, and each one asserts that the learned bias ends up strictly
    between 0.4 and 0.6.

    NOTE: the code between the ``simple_*_start`` / ``simple_*_end`` markers
    is included in ``pytorch_to_poptorch.rst`` by line offset relative to the
    start marker, so adding or removing lines there requires updating the
    ``:lines:`` / ``:emphasize-lines:`` options in that document.
    """
    # simple_ipu_start
    # Set up the PyTorch DataLoader to load that much data at each iteration
    opts = poptorch.Options()
    opts.deviceIterations(10)
    training_data = poptorch.DataLoader(options=opts,
                                        dataset=ExampleDataset(shape=[1],
                                                               length=20000),
                                        batch_size=10,
                                        shuffle=True,
                                        drop_last=True)

    model = ExampleModelWithLoss()
    model.train()

    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

    # Wrap the model in a PopTorch training wrapper
    poptorch_model = poptorch.trainingModel(model,
                                            options=opts,
                                            optimizer=optimizer)

    momentum_loss = None

    for batch, target in training_data:
        # Performs forward pass, loss function evaluation,
        # backward pass and weight update in one go on the device.
        _, loss = poptorch_model(batch, target)

        if momentum_loss is None:
            momentum_loss = loss
        else:
            momentum_loss = momentum_loss * 0.95 + loss * 0.05

        # Optimizer can be updated via setOptimizer.
        if momentum_loss < 0.1:
            poptorch_model.setOptimizer(
                torch.optim.AdamW(model.parameters(), lr=0.0001))
    # simple_ipu_end

    # Check the bias learned by the PopTorch run.
    print(model.model.bias)
    assert (model.model.bias > 0.4 and model.model.bias < 0.6)

    # simple_cpu_start
    training_data = torch.utils.data.DataLoader(ExampleDataset(shape=[1],
                                                               length=20000),
                                                batch_size=10,
                                                shuffle=True,
                                                drop_last=True)

    model = ExampleModelWithLoss()
    model.train()

    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

    momentum_loss = None

    for batch, target in training_data:
        # Zero gradients
        optimizer.zero_grad()

        # Run model.
        _, loss = model(batch, target)

        # Back propagate the gradients.
        loss.backward()

        # Update the weights.
        optimizer.step()

        if momentum_loss is None:
            momentum_loss = loss
        else:
            momentum_loss = momentum_loss * 0.95 + loss * 0.05

        if momentum_loss < 0.1:
            optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)
    # simple_cpu_end

    # Check the bias learned by the plain PyTorch run.
    print(model.model.bias)
    assert (model.model.bias > 0.4 and model.model.bias < 0.6)


if __name__ == "__main__":
    run_examples()


================================================
FILE: docs/user_guide/precompilation.py
================================================
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
import sys
import poptorch

# Nothing to do without IPU hardware: exit with a success status.
if not poptorch.ipuHardwareIsAvailable():
    sys.exit(0)

# Version reported for the available hardware; used below as the offline
# compilation target.
ipu_target_version = poptorch.ipuHardwareVersion()
# File the training model is exported to and later loaded from.
filename = "training.poptorch"

# pylint: disable=unused-variable, wrong-import-position, reimported, ungrouped-imports, wrong-import-order
# precomp_start
import torch
import poptorch


class ExampleModelWithLoss(torch.nn.Module):
    """Linear layer that also returns its MSE loss when in training mode.

    In training mode ``forward`` returns ``(output, loss)``; otherwise it
    returns only the output and ``target`` is not used.
    """

    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(10, 10)
        self.loss = torch.nn.MSELoss()

    def forward(self, x, target=None):
        prediction = self.fc(x)
        if not self.training:
            return prediction
        return prediction, self.loss(prediction, target)


torch.manual_seed(0)
model = ExampleModelWithLoss()

opts = poptorch.Options()
# You don't need a real IPU to compile the executable.
opts.useOfflineIpuTarget(ipu_target_version)

# Wrap the model in our PopTorch annotation wrapper.
poptorch_model = poptorch.trainingModel(model, opts)

# Some dummy inputs matching the 10 features expected by the model.
input = torch.randn(10)
target = torch.randn(10)

poptorch_model.compileAndExport(filename, input, target)
# precomp_end
# Destroy the exporting model before loading the exported file back.
poptorch_model.destroy()

# load_start
poptorch_model = poptorch.load(filename)

# That's all: your model is ready to be used.
poptorch_model(input, target)  # Run on IPU
# load_end
poptorch_model.destroy()


# load_setIpu_start
def setIpuDevice(opts):
    """Select IPU 1 on ``opts``; passed to ``poptorch.load`` as ``edit_opts_fn``."""
    ipu_id = 1  # always use IPU 1
    opts.useIpuId(ipu_id)


poptorch_model = poptorch.load(filename, edit_opts_fn=setIpuDevice)
poptorch_model(input, target)  # Run on IPU 1
# load_setIpu_end

# Export to the same file again, this time with export_model=False.
# precomp_no_python_start
poptorch_model.compileAndExport(filename, input, target, export_model=False)
# precomp_no_python_end

# Destroy the model; a new one is built below and loads the file just written.
poptorch_model.destroy()

# load_exe_start
model = ExampleModelWithLoss()

opts = poptorch.Options()

# Wrap the model in our PopTorch annotation wrapper.
poptorch_model = poptorch.trainingModel(model, opts)
poptorch_model.loadExecutable(filename)

# Some dummy inputs.
input = torch.randn(10)
target = torch.randn(10)

poptorch_model(input, target)  # Run on IPU
# load_exe_end
poptorch_model.destroy()

# precomp_train_val_start
model = ExampleModelWithLoss()

opts = poptorch.Options()

# Some dummy inputs.
input = torch.randn(10)
target = torch.randn(10)

# Export a training model, then an inference model wrapping the same model in eval mode.
training_model = poptorch.trainingModel(model, opts)
training_model.compileAndExport("training.poptorch", input, target)
model.eval()
validation_model = poptorch.inferenceModel(model, opts)
validation_model.compileAndExport("validation.poptorch", input)
# precomp_train_val_end

# Epochs iterated over by the training/validation loops below.
epochs = range(2)


def run_training(_):
    """Placeholder for a training step: ignores its argument and does nothing."""


def run_validation(_):
    """Placeholder for a validation step: ignores its argument and does nothing."""


# implicit_cp_start
model = ExampleModelWithLoss()

opts = poptorch.Options()

# Wrap the model in our PopTorch annotation wrapper.
training_model = poptorch.trainingModel(model, opts)
model.eval()
validation_model = poptorch.inferenceModel(model, opts)

# Some dummy inputs.
input = torch.randn(10)
target = torch.randn(10)

# Train the model:
for epoch in epochs:
    training_model(input, target)

# Weights are implicitly copied from the training model
# to the validation model
prediction = validation_model(input)
# implicit_cp_end
# Destroy both models before loading the exported ones.
training_model.destroy()
validation_model.destroy()

# load_train_val_start
training_model = poptorch.load("training.poptorch")
validation_model = poptorch.load("validation.poptorch")

for epoch in epochs:
    print("Epoch ", epoch)
    run_training(training_model)
    # Need to explicitly copy weights between the two models
    # because they're not connected anymore.
    training_model.copyWeightsToHost()
    validation_model.copyWeightsToDevice()
    run_validation(validation_model)
# load_train_val_end
# Destroy both models before the connected variant below.
training_model.destroy()
validation_model.destroy()

# load_train_val_connected_start
training_model = poptorch.load("training.poptorch")
# Create a validation python model based on the training model
validation_model = poptorch.inferenceModel(training_model)
validation_model.model.eval()
# Load the executable for that model:
validation_model.loadExecutable("validation.poptorch")

for epoch in epochs:
    print("Epoch ", epoch)
    run_training(training_model)
    # Nothing to do: training_model and validation_model are now connected
    # and PopTorch will implicitly keep the weights in sync between them.
    run_validation(validation_model)
# load_train_val_connected_end
training_model.destroy()
validation_model.destroy()


================================================
FILE: docs/user_guide/pytorch_to_poptorch.rst
================================================
========================
From PyTorch to PopTorch
========================

This page will introduce the key features that enable training on the IPU, and how they differ from native PyTorch.

.. note::
   PopTorch compiles a ``torch.nn.Module`` model for the IPU when it is wrapped in either a :py:func:`~poptorch.trainingModel` or :py:func:`~poptorch.inferenceModel`, as appropriate.
   This provides similar functionality to ``torch.compile`` but with more flexibility to generate optimal code for the IPU.
   Also, ``torch.compile`` does not pass options to a custom compiler backend.

   For these reasons, we do not currently support ``torch.compile``. Any calls to ``torch.compile`` should be replaced by wrapping the model with either :py:func:`~poptorch.trainingModel` or :py:func:`~poptorch.inferenceModel`. These functions perform static `compilation of the whole graph <https://docs.graphcore.ai/projects/ipu-programmers-guide/en/latest/programming_tools.html#compilation>`__ to produce optimized code to run on the IPU. The compilation of multiple partial graphs is not supported.


Preparing your data
===================

Data loading in PyTorch is typically handled using `torch.utils.data.DataLoader <https://pytorch.org/docs/1.10.0/data.html#torch.utils.data.DataLoader>`_.

PopTorch extends PyTorch's DataLoader with a :py:class:`~poptorch.DataLoader` to enable efficient data batching with respect to PopTorch's underlying machine learning framework, PopART.
Instantiation is almost identical to PyTorch, but you must remember to pass an instance of :py:class:`~poptorch.Options`.

+-------------------------------------------------+
| PyTorch                                         |
+-------------------------------------------------+
| .. literalinclude:: poptorch_training_simple.py |
|   :lines: 1-5                                   |
|   :start-after: simple_cpu_start                |
|   :end-before: simple_cpu_end                   |
|   :dedent: 4                                    |
+-------------------------------------------------+
| PopTorch                                        |
+-------------------------------------------------+
| .. literalinclude:: poptorch_training_simple.py |
|   :lines: 1-9                                   |
|   :start-after: simple_ipu_start                |
|   :end-before: simple_ipu_end                   |
|   :dedent: 4                                    |
+-------------------------------------------------+

For more information about how to set :py:class:`~poptorch.Options`, see :numref:`efficient_data_batching`.

Creating your model
===================

Training
--------

If you want to create a model for training on the IPU, you first need to wrap your model
in a PyTorch model that returns a tuple containing two elements: the outputs of the model
and the loss.

+-------------------------------------------------+
| PopTorch                                        |
+-------------------------------------------------+
| .. literalinclude:: poptorch_training_simple.py |
|   :start-after: model_with_loss_start           |
|   :end-before: model_with_loss_end              |
+-------------------------------------------------+

Then all you need to do is instantiate a :py:func:`~poptorch.trainingModel`,
by passing your new PyTorch model, :py:class:`~poptorch.Options`, and optimizer.

+-------------------------------------------------+
| PyTorch                                         |
+-------------------------------------------------+
| .. literalinclude:: poptorch_training_simple.py |
|   :lines: 7-10                                  |
|   :start-after: simple_cpu_start                |
|   :end-before: simple_cpu_end                   |
|   :dedent: 4                                    |
+-------------------------------------------------+
| PopTorch                                        |
+-------------------------------------------------+
| .. literalinclude:: poptorch_training_simple.py |
|   :lines: 11-19                                 |
|   :start-after: simple_ipu_start                |
|   :end-before: simple_ipu_end                   |
|   :dedent: 4                                    |
|   :emphasize-lines: 7-9                         |
+-------------------------------------------------+

Inference
---------

For inference, it's even easier. Just instantiate an :py:func:`~poptorch.inferenceModel` by passing your PyTorch model.

.. code-block:: python

  poptorch_model = poptorch.inferenceModel(model)

The training loop
=================

A simple training loop in PyTorch will typically consist of:

- Setting gradients to zero
- Performing a forwards pass with the model (and obtaining the loss)
- Performing the backwards pass with respect to the loss, and updating weights
- Updating the optimizer

In PopTorch, these steps are combined into a single step:

+-------------------------------------------------+
| PyTorch                                         |
+-------------------------------------------------+
| .. literalinclude:: poptorch_training_simple.py |
|   :lines: 14-25                                 |
|   :start-after: simple_cpu_start                |
|   :end-before: simple_cpu_end                   |
|   :dedent: 4                                    |
+-------------------------------------------------+
| PopTorch                                        |
+-------------------------------------------------+
| .. literalinclude:: poptorch_training_simple.py |
|   :lines: 23-26                                 |
|   :start-after: simple_ipu_start                |
|   :end-before: simple_ipu_end                   |
|   :dedent: 4                                    |
+-------------------------------------------------+

Multiple/custom losses
======================

If using multiple losses, or when creating a custom loss, the final loss must be marked explicitly using :py:func:`~poptorch.identity_loss`.

+----------------------------------------------------------------------+
| PyTorch                                                              |
+----------------------------------------------------------------------+
| .. code-block:: python                                               |
|                                                                      |
|   def custom_loss(output, target):                                   |
|       loss1 = torch.nn.functional.nll_loss(output, target)           |
|       loss2 = torch.nn.functional.nll_loss(output, target) * 5.0     |
|       return loss1 + loss2                                           |
+----------------------------------------------------------------------+
| PopTorch                                                             |
+----------------------------------------------------------------------+
| .. code-block:: python                                               |
|   :emphasize-lines: 4                                                |
|                                                                      |
|   def custom_loss(output, target):                                   |
|       loss1 = torch.nn.functional.nll_loss(output, target)           |
|       loss2 = torch.nn.functional.nll_loss(output, target) * 5.0     |
|       return poptorch.identity_loss(loss1 + loss2, reduction='none') |
+----------------------------------------------------------------------+

Optimizers
==========

One important thing to note about using optimizers in PopTorch is that the optimizer state is encapsulated within the PopTorch model.
As such, any change made to the optimizer outside of the model must be followed by a call to :py:meth:`poptorch_model.setOptimizer <poptorch.PoplarExecutor.setOptimizer>`,
passing in the updated optimizer.

.. warning:: PopTorch does not directly use the Python implementation of the optimizers. Built-in implementations are used in their place.
   This means that you cannot currently use custom optimizers. Subclassing a built-in optimizer will generate a warning. Any custom behaviour
   in a custom optimizer is unlikely to take effect, other than simply setting the existing attributes.

+-------------------------------------------------+
| PyTorch                                         |
+-------------------------------------------------+
| .. literalinclude:: poptorch_training_simple.py |
|   :lines: 27-33                                 |
|   :start-after: simple_cpu_start                |
|   :end-before: simple_cpu_end                   |
|   :dedent: 4                                    |
|   :emphasize-lines: 7                           |
+-------------------------------------------------+
| PopTorch                                        |
+-------------------------------------------------+
| .. literalinclude:: poptorch_training_simple.py |
|   :lines: 28-36                                 |
|   :start-after: simple_ipu_start                |
|   :end-before: simple_ipu_end                   |
|   :dedent: 4                                    |
|   :emphasize-lines: 8-9                         |
+-------------------------------------------------+

.. note:: PopTorch also provides its own set of optimizers that can be accessed via ``poptorch.optim``.
   These are wrapper classes which have several advantages over the native PyTorch optimizers. They embed constant attributes
   for performance/memory savings and allow you to specify additional parameters such as loss scaling and velocity scaling.
   See :numref:`optimizers` for more information.

Going further
=============

For a more detailed example of getting started with PopTorch, see the :tutorials-repo:`PyTorch basics tutorial <tutorials/pytorch/basics>` which walks through training an MNIST model on the IPU.


================================================
FILE: docs/user_guide/reference.rst
================================================
.. _reference:

=============
API reference
=============

.. _api_options:

Options
=======

.. autoclass:: poptorch.Options
   :members:

.. autoclass:: poptorch.options._DistributedOptions
   :members:

.. autoclass:: poptorch.options._PrecisionOptions
   :members:

.. autoclass:: poptorch.options._JitOptions
   :members:

.. autoclass:: poptorch.options._TensorLocationOptions
   :members:

.. autoclass:: poptorch.TensorLocationSettings
   :members:

.. autoclass:: poptorch.options._TrainingOptions
   :members:

Helpers
=======

.. autofunction:: poptorch.ipuHardwareIsAvailable

.. autofunction:: poptorch.ipuHardwareVersion

.. autofunction:: poptorch.setLogLevel

.. autoclass:: poptorch.profiling.Channel
   :members:

PopTorch ops
============

.. autofunction:: poptorch.ctc_beam_search_decoder

.. autofunction:: poptorch.ipu_print_tensor

.. autofunction:: poptorch.for_loop

.. autofunction:: poptorch.recomputationCheckpoint

.. autofunction:: poptorch.identity_loss

.. autoclass:: poptorch.MultiConv
   :members:

.. autoclass:: poptorch.CPU
   :special-members: __init__

.. autoclass:: poptorch.NameScope
   :members:

.. autoclass:: poptorch.MultiConvPlanType

.. autoclass:: poptorch.custom_op

.. autofunction:: poptorch.nop

.. autofunction:: poptorch.dynamic_slice

.. autofunction:: poptorch.dynamic_update

.. autofunction:: poptorch.serializedMatMul

.. autofunction:: poptorch.set_available_memory

.. autofunction:: poptorch.set_overlap_for_input

.. autofunction:: poptorch.set_overlap_for_output

.. autofunction:: poptorch.nearest

.. autofunction:: poptorch.fps

.. autofunction:: poptorch.cond

Model wrapping functions
========================

.. autofunction:: poptorch.trainingModel

.. autofunction:: poptorch.inferenceModel

.. autoclass:: poptorch.PoplarExecutor
   :special-members: __call__
   :members:

.. autofunction:: poptorch.isRunningOnIpu

.. autofunction:: poptorch.load

Parallel execution
==================

.. autoclass:: poptorch.Block
   :special-members: __init__, useAutoId

.. autoclass:: poptorch.BeginBlock
   :special-members: __init__

.. autofunction:: poptorch.BlockFunction

.. autofunction:: poptorch.removeBlocks

.. autoclass:: poptorch.Stage
   :special-members: __init__

.. autoclass:: poptorch.AutoStage

.. autoclass:: poptorch.Phase
   :special-members: __init__

.. autoclass:: poptorch.ShardedExecution
   :inherited-members:

.. autoclass:: poptorch.PipelinedExecution
   :special-members: __init__
   :inherited-members:

.. autoclass:: poptorch.SerialPhasedExecution
   :special-members: __init__
   :inherited-members:

.. autoclass:: poptorch.ParallelPhasedExecution
   :special-members: __init__
   :inherited-members:

.. autoclass:: poptorch.Liveness

.. autoclass:: poptorch.CommGroupType

.. autoclass:: poptorch.VariableRetrievalMode

.. py:function:: replicaGrouping

   Call this function on a weight tensor (after applying a PopTorch wrapper with
   :py:func:`~poptorch.inferenceModel` or :py:func:`~poptorch.trainingModel`)
   to configure replica groups which each receive a different value of the
   weight tensor. For details and a code example see
   :numref:`grouping_tensor_weights`.

   :param comm_group_type: The replica group arrangement to use for this tensor.
   :type comm_group_type: poptorch.CommGroupType
   :param shards: The number of replicas in each replica group.
   :type shards: int
   :param variable_retrieval_mode: The method to use when retrieving the value
                                   of this tensor from the replicas.
   :type variable_retrieval_mode: poptorch.VariableRetrievalMode

Optimizers
==========

.. autoclass:: poptorch.optim.VariableAttributes
   :members:

.. autoclass:: poptorch.optim.SGD
   :special-members: __init__
   :members:

.. autoclass:: poptorch.optim.Adam
   :special-members: __init__
   :members:

.. autoclass:: poptorch.optim.AdamW
   :special-members: __init__
   :members:

.. autoclass:: poptorch.optim.RMSprop
   :special-members: __init__
   :members:

.. autoclass:: poptorch.optim.LAMB
   :special-members: __init__
   :members:

Data batching
=============

.. autoclass:: poptorch.DataLoader
   :special-members: __init__
   :members: terminate

.. autoclass:: poptorch.AsynchronousDataAccessor
   :special-members: __init__, __len__
   :members: terminate

.. autoclass:: poptorch.DataLoaderMode
   :members:

Enumerations
============

.. autoclass:: poptorch.SharingStrategy
   :members:

.. autoclass:: poptorch.OverlapMode
   :members:

.. autoclass:: poptorch.MatMulSerializationMode
   :members:

.. autoclass:: poptorch.SyncPattern
   :members:

.. autoclass:: poptorch.ReductionType
   :members:

.. autoclass:: poptorch.ConnectionType
   :members:

.. autoclass:: poptorch.OutputMode
   :members:

.. autoclass:: poptorch.MeanReductionStrategy
   :members:


================================================
FILE: docs/user_guide/replica_grouped_weights.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.

import numpy
import torch
import poptorch


# groupedweights_start
class ModelWithLoss(torch.nn.Module):
    """A single matrix multiplication whose loss is the mean squared output."""

    def __init__(self, W_init):
        super().__init__()
        # The weight matrix is the only trainable parameter of the model.
        self.W = torch.nn.Parameter(W_init)

    def forward(self, X):
        """Return ``X @ W`` and the mean of its element-wise square."""
        product = X @ self.W
        squared = product**2
        loss = poptorch.identity_loss(squared, reduction="mean")
        return product, loss


# Split the weight tensor into 4, and the input data tensor into 2.
tensor_shards = 4
data_shards = 2

# Set up the problem: a fixed seed makes the input and the initial weights
# identical on every run.
random = numpy.random.RandomState(seed=100)
prob_X = random.normal(size=(24, 40)).astype(numpy.float32)
prob_W_init = random.normal(size=(40, 56)).astype(
    numpy.float32) * (5 * 8)**-0.5
prob_steps = 4

X = torch.tensor(prob_X)

# Run on 8 IPUs (data_shards * tensor_shards replicas).
# The (40, 56) weight matrix is split column-wise and rearranged to
# (tensor_shards, 40, 56 // tensor_shards), so the leading dimension selects
# the shard.
W_init = torch.tensor(
    prob_W_init.reshape(prob_W_init.shape[0], tensor_shards,
                        prob_W_init.shape[1] // tensor_shards).transpose(
                            1, 0, 2)).contiguous()
m = ModelWithLoss(W_init)
optim = torch.optim.SGD(m.parameters(), lr=0.01)

pt_opts = poptorch.Options()
pt_opts.replicationFactor(data_shards * tensor_shards)
pt_opts.inputReplicaGrouping(tensor_shards,
                             poptorch.enums.CommGroupType.Consecutive)
# Return the outputs of every replica rather than only the final one.
pt_opts.outputMode(poptorch.OutputMode.All)
pt_m = poptorch.trainingModel(m, optimizer=optim, options=pt_opts)
# Arguments: the replica group arrangement, the number of replicas in each
# replica group, and how the value is retrieved from the replicas.
pt_m.W.replicaGrouping(poptorch.enums.CommGroupType.Orthogonal, data_shards,
                       poptorch.enums.VariableRetrievalMode.OnePerGroup)
pt_losses = []
if data_shards > 1:
    # Give X a leading dimension of size data_shards: one slice of the rows
    # per data shard.
    X = X.reshape(data_shards, X.shape[0] // data_shards, *X.shape[1:])
for _ in range(prob_steps):
    _, loss = pt_m(X)
    # Each replica takes the mean over only its own part of the tensor, so
    # sum the per-replica losses and divide by the number of replicas to get
    # the mean over the whole tensor.
    pt_losses.append(torch.sum(loss.detach()) / (data_shards * tensor_shards))
pt_losses = numpy.array(pt_losses)
# Undo the transpose/reshape applied to W_init to recover a matrix with the
# original (40, 56) layout.
pt_W_final = m.W.detach().numpy().transpose(1, 0, 2) \
              .reshape(prob_W_init.shape)

# groupedweights_end


================================================
FILE: docs/user_guide/sumAnchorReturnType.py
================================================
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import random
import torch
import poptorch

RAND_SEED = 8549


class ExampleClassDataset(torch.utils.data.Dataset):
    """ A dummy dataset with classes for emulating a classification task.

    Every instance of a class, C, is the vector R*V where
    R is a randomly generated orthogonal matrix, fixed for the whole dataset
    V is a vector of vec_length whose elements are sampled independently
    from a normal distribution with mean 0 and standard deviation 0.2, with
    1.0 added to the element at index C (the class label), so that
    V[x] ~ N(1, 0.2), if x = C
         ~ N(0, 0.2), otherwise

    The class labels are drawn once, at construction time, from a fixed
    seed, and every item is generated from a seed derived from its index,
    so the dataset is reproducible.

    :param num_classes: The number of distinct class labels.
    :param vec_length: The length of each vector; must be at least
                       num_classes.
    :param num_examples: The number of items in the dataset.
    """

    def __init__(self, num_classes, vec_length, num_examples):
        super().__init__()
        assert vec_length >= num_classes, (
            "vec_length must be at least num_classes")

        random.seed(RAND_SEED)

        # Generate the class label of every example up front
        self.targets = [
            random.randrange(num_classes) for _ in range(num_examples)
        ]

        # To get R, make a random symmetric matrix and use eigenvalue
        # decomposition. eigh is the decomposition for symmetric matrices:
        # unlike eig it returns real, orthonormal eigenvectors, so no complex
        # result has to be cast back to float (which would silently discard
        # the imaginary part).
        torch.manual_seed(RAND_SEED)
        R = torch.rand([vec_length, vec_length])
        R = R + R.transpose(0, 1)
        _, eigenvectors = torch.linalg.eigh(R)
        self._R = eigenvectors.to(torch.float)

        self._dist = torch.distributions.normal.Normal(0, 0.2)
        self._dist = self._dist.expand([vec_length])

        self._vec_length = vec_length

    def __getitem__(self, idx):
        """Return the ``(vector, class label)`` pair of item ``idx``."""
        # Reseed from the index so that an item is identical every time it is
        # fetched. Note that this reseeds PyTorch's global generator.
        torch.manual_seed(idx + RAND_SEED)
        v = self._dist.sample()
        item_cls = self.targets[idx]
        # Shift the mean of the element matching the class label to 1.
        v[item_cls] += 1.0

        v = torch.matmul(self._R, v)

        return v, item_cls

    def __len__(self):
        return len(self.targets)


# yapf: disable
#model_returning_accuracy_start
class MulticlassPerceptron(torch.nn.Module):
    """A single linear layer classifier that also reports its accuracy."""

    def __init__(self, vec_length, num_classes):
        super().__init__()
        self.fc = torch.nn.Linear(vec_length, num_classes)
        self.loss = torch.nn.CrossEntropyLoss()

    def forward(self, x, target):
        """Return ``(loss, accuracy)`` when training, otherwise
        ``(predicted classes, accuracy)``."""
        logits = self.fc(x)

        # The predicted class is the index of the largest logit.
        predicted = torch.argmax(logits, dim=-1)
        num_correct = torch.sum((predicted == target).to(torch.float))
        accuracy = num_correct / float(predicted.numel())

        if not self.training:
            return predicted, accuracy

        return self.loss(logits, target), accuracy
# model_returning_accuracy_end
# yapf: enable

# ExampleClassDataset requires the vector length to be at least the number
# of classes.
NUM_CLASSES = 10
VEC_LENGTH = NUM_CLASSES * 2

# yapf: disable
#sum_accuracy_start
opts = poptorch.Options()

opts.deviceIterations(5)
opts.Training.gradientAccumulation(10)
# Return the sum of each output over all device iterations and gradient
# accumulations.
opts.outputMode(poptorch.OutputMode.Sum)

training_data = poptorch.DataLoader(opts,
                                    dataset=ExampleClassDataset(
                                        NUM_CLASSES, VEC_LENGTH, 2000),
                                    batch_size=5,
                                    shuffle=True,
                                    drop_last=True)


model = MulticlassPerceptron(VEC_LENGTH, NUM_CLASSES)
model.train()

# Wrap the model in a PopTorch training wrapper
poptorch_model = poptorch.trainingModel(model,
                                        options=opts,
                                        optimizer=torch.optim.Adam(
                                            model.parameters()))

# Run over the training data, 5 batches at a time.
for batch_number, (data, labels) in enumerate(training_data):
    # Execute the device with a 5 iteration loop of batchsize 5 with 10
    # gradient accumulations (global batchsize = 5 * 10 = 50). "loss" and
    # "accuracy" will be the sum across all device iterations and gradient
    # accumulations but not across the model batch size.
    _, accuracy = poptorch_model(data, labels)

    # Correct for iterations: turn the summed accuracy back into a mean.
    # Do not divide by batch here, as this is already accounted for in the
    # PyTorch Model.
    accuracy /= (opts.device_iterations * opts.Training.gradient_accumulation)
    print(f"Accuracy: {float(accuracy)*100:.2f}%")
#sum_accuracy_end
# yapf: enable


================================================
FILE: docs/user_guide/supported_ops.rst
================================================
.. _supported_ops:

IPU supported operations
************************

Below is a list of currently supported operations that can be
executed on IPU hardware. This list will be expanded over time
as we add more support. Some overloads and modes of operation
for ops are not supported and we've tried to list all the caveats
but some may have been missed.


Torch operations
================

Tensor operations
-----------------

Many of the tensor operations will be executed before even reaching the IPU
so we can consider them supported anyway. Some, like ``contiguous()``, make
no sense on a distributed memory system like the IPU so are ignored. There
are no constraints on the memory format of how operations should be called
other than the constraint that all graph inputs should be contiguous.

We will also create tensor views. However, the aliasing property of views
with respect to in-place operations should not be relied on as we may have slightly different view behaviour.

Additionally, some PyTorch operations are implemented by composition of
the listed ops. These are supported even though they may not be explicitly listed.


Creation ops
''''''''''''

* ``torch.arange``
* ``tensor.fill``
* ``torch.full``
* ``torch.full_like``
* ``torch.Tensor.new_ones``
* ``torch.Tensor.new_zeros``
* ``torch.ones``
* ``torch.ones_like``
* ``torch.zeros``
* ``torch.zeros_like``

Indexing, slicing, joining and mutating ops
'''''''''''''''''''''''''''''''''''''''''''

In PyTorch, slicing a tensor is accessing a subset of the tensor by providing the start and end indices, such as ``tensor[1:5]``.

With a PopTorch model, you may take a slice of a tensor only if one of two conditions is met:

* The start and end are constants, or can be resolved to be constants (for example, a function of the shape of a tensor which does not change between runs).
* The start and end of the slice are related by a constant, for example ``tensor[x:x+5]``. Please note that this will produce different results to PyTorch if the end value exceeds the length of the tensor: PyTorch will output a smaller size tensor but PopTorch will allow the slice to wrap round to the start of the relevant dimension.

PyTorch functions

* ``torch.bincount``
* ``torch.bucketize``
* ``torch.cat``
* ``torch.chunk``
* ``torch.gather``
* ``torch.index_select``
* ``torch.index_reduce``
* ``torch.reshape``
* ``torch.roll``
* ``torch.scatter``
* ``torch.scatter_add``
* ``torch.scatter_reduce``
* ``torch.stack``
* ``torch.split``
* ``torch.squeeze``
* ``torch.t``
* ``torch.take_along_dim``
* ``torch.transpose``
* ``torch.unbind``
* ``torch.unsqueeze``
* ``torch.where``

Tensor methods

* ``tensor.expand``
* ``tensor.expand_as``
* ``tensor.masked_fill``
* ``tensor.index_fill_``

Random samplers
'''''''''''''''
To set the random state, use ``poptorch.Options.randomSeed``.

* ``torch.bernoulli``
* ``torch.distributions.Bernoulli``
* ``torch.randn``
* ``torch.normal``
* ``torch.distributions.Normal``
* ``torch.rand``
* ``torch.uniform``
* ``torch.distributions.Uniform``
* ``torch.exponential``
* ``torch.distributions.Exponential``

Math operations
---------------

Pointwise ops
'''''''''''''

* ``torch.abs``
* ``torch.acos``
* ``torch.acosh``
* ``torch.add``
* ``torch.addcdiv``
* ``torch.amax``
* ``torch.amin``
* ``torch.asin``
* ``torch.asinh``
* ``torch.atan``
* ``torch.atanh``
* ``torch.bitwise_and``
* ``torch.bitwise_not``
* ``torch.bitwise_or``
* ``torch.bitwise_xor``
* ``torch.cdist``
* ``torch.ceil``
* ``torch.clamp``
* ``torch.clamp_max``
* ``torch.clamp_min``
* ``torch.cos``
* ``torch.cosh``
* ``torch.div``
* ``torch.exp``
* ``torch.expm1``
* ``torch.floor``
* ``torch.floor_divide``
* ``torch.fmod``
* ``torch.frac``
* ``torch.log``
* ``torch.log10``
* ``torch.log1p``
* ``torch.log2``
* ``torch.logical_and``
* ``torch.logical_or``
* ``torch.mul``
* ``torch.norm``
* ``torch.neg``
* ``torch.pow``
* ``torch.reciprocal``
* ``torch.remainder``
* ``torch.round``
* ``torch.rsqrt``
* ``torch.sigmoid``
* ``torch.sign``
* ``torch.sin``
* ``torch.sinh``
* ``torch.sqrt``
* ``torch.square``
* ``torch.sub``
* ``torch.tan``
* ``torch.tanh``
* ``torch.true_divide``
* ``torch.trunc``


Reduction ops
'''''''''''''

* ``torch.all``
* ``torch.any``
* ``torch.argmax``
* ``torch.argmin``
* ``torch.count_nonzero``
* ``torch.mean``
* ``torch.median``
* ``torch.prod``
* ``torch.logsumexp``
* ``torch.std``
* ``torch.std_mean``
* ``torch.sum``
* ``torch.var``
* ``torch.var_mean``


Comparison ops
''''''''''''''

* ``torch.eq``
* ``torch.ge``
* ``torch.gt``
* ``torch.le``
* ``torch.lt``
* ``torch.max``
* ``torch.min``
* ``torch.ne``
* ``torch.isnan``
* ``torch.topk``

  * The option ``sorted=False`` is not supported for ``torch.topk``.

* ``torch.argsort``
* ``torch.randperm``
* ``torch.sort``


torch.linalg ops
''''''''''''''''

* ``torch.linalg.norm``

    2-norm and nuclear norm are unsupported for matrices.

* ``torch.linalg.matrix_norm``

    2-norm and nuclear norm are unsupported.

* ``torch.linalg.vector_norm``


Other ops
'''''''''

* ``torch.cumsum``
* ``torch.cumprod``
* ``torch.cross``
* ``torch.meshgrid``
* ``torch.cartesian_prod``
* ``torch.tensordot``


BLAS and LAPACK Operations
''''''''''''''''''''''''''

* ``torch.addmm``
* ``torch.matmul``
* ``torch.bmm``


Torch.nn operations
===================

Containers
----------

``torch.nn.Module`` and ``torch.nn.Sequential`` can be passed into our
compiler wrappers and just work.


Convolution layers
------------------

Conv transpose operations do not yet support dilations.

* ``torch.nn.Conv1d``
* ``torch.nn.Conv2d``
* ``torch.nn.Conv3d``
* ``torch.nn.ConvTranspose1d``
* ``torch.nn.ConvTranspose2d``
* ``torch.nn.ConvTranspose3d``


Pooling layers
--------------

Currently the max pool layers do not return the indices
so only the variants with ``return_indices=False`` are supported.

* ``torch.nn.MaxPool1d``
* ``torch.nn.MaxPool2d``
* ``torch.nn.MaxPool3d``
* ``torch.nn.AvgPool1d``
* ``torch.nn.AvgPool2d``
* ``torch.nn.AvgPool3d``
* ``torch.nn.AdaptiveAvgPool1d``
* ``torch.nn.AdaptiveAvgPool2d``
* ``torch.nn.AdaptiveAvgPool3d``

Padding layers
--------------

All padding layers are supported.

* ``torch.nn.ReflectionPad1d``
* ``torch.nn.ReflectionPad2d``
* ``torch.nn.ReplicationPad1d``
* ``torch.nn.ReplicationPad2d``
* ``torch.nn.ReplicationPad3d``
* ``torch.nn.ZeroPad2d``
* ``torch.nn.ConstantPad1d``
* ``torch.nn.ConstantPad2d``
* ``torch.nn.ConstantPad3d``


Activations
-----------

* ``torch.nn.ELU``
* ``torch.nn.CELU``
* ``torch.nn.GELU``
* ``torch.nn.Hardshrink``
* ``torch.nn.LeakyReLU``
* ``torch.nn.LogSoftmax``
* ``torch.nn.Mish``
* ``torch.nn.ReLU``
* ``torch.nn.SELU``
* ``torch.nn.SiLU``
* ``torch.nn.Sigmoid``
* ``torch.nn.Softmax``
* ``torch.nn.Softplus``
* ``torch.nn.Softsign``
* ``torch.nn.Softshrink``
* ``torch.nn.Tanh``
* ``torch.nn.PReLU``
* ``torch.nn.RReLU``
* ``torch.nn.Hardtanh``
* ``torch.nn.functional.glu``
* ``torch.nn.Threshold``


Normalization layers
--------------------

Currently only ``affine=True`` is supported as a parameter. That is to say, only the variants with trainable parameters are supported.

* ``torch.nn.BatchNorm1d``
* ``torch.nn.BatchNorm2d``
* ``torch.nn.BatchNorm3d``
* ``torch.nn.LayerNorm``
* ``torch.nn.GroupNorm``
* ``torch.nn.InstanceNorm1d``
* ``torch.nn.InstanceNorm2d``
* ``torch.nn.InstanceNorm3d``

* ``torch.nn.utils.weight_norm``

Recurrent layers
----------------

Bidirectional layers, non-zero dropout probabilities,
and setting ``num_layers`` to a value greater than 1
are not currently supported for any recurrent layer. In addition,
setting ``bias=False`` is currently only supported for ``torch.nn.GRU``.

* ``torch.nn.RNN``
* ``torch.nn.GRU``
* ``torch.nn.LSTM``

Linear layers
-------------

* ``torch.nn.Identity``
* ``torch.nn.Linear``
* ``torch.nn.Bilinear``

Dropout
-------

* ``torch.nn.Dropout``

Sparse layers
-------------

Embedding and EmbeddingBag are supported with the exception of the ``padding_idx`` parameter
being unsupported.

* ``torch.nn.Embedding``
* ``torch.nn.EmbeddingBag``
* ``torch.nn.functional.one_hot``

Loss functions
--------------

This version supports a limited subset of loss functions. However, we support
:py:func:`~poptorch.identity_loss` which gives you the ability to implement any arbitrary
loss function.

.. seealso:: :py:func:`~poptorch.identity_loss`

One caveat for the following loss functions is that, if they are used, they will always be
included in the back propagation and will always receive a gradient. This is a slight deviation
from normal PyTorch operations, where they have to opt in to the gradient pass.

* ``torch.nn.L1Loss``
* ``torch.nn.MSELoss``
* ``torch.nn.CrossEntropyLoss``
* ``torch.nn.NLLLoss``
* ``torch.nn.BCELoss``
* ``torch.nn.KLDivLoss``
* ``torch.nn.PoissonNLLLoss``
* ``torch.nn.HingeEmbeddingLoss``
* ``torch.nn.BCEWithLogitsLoss``
* ``torch.nn.SmoothL1Loss``
* ``torch.nn.SoftMarginLoss``
* ``torch.nn.CosineEmbeddingLoss``
* ``torch.nn.MarginRankingLoss``
* ``torch.nn.TripletMarginLoss``
* ``torch.nn.CTCLoss``

Vision Layers
-------------
The nearest and bicubic modes are supported.

* ``torch.nn.Upsample``


PyTorch Scatter functions

* ``torch_scatter.scatter``
* ``torch_scatter.composite.scatter_log_softmax``
* ``torch_scatter.composite.scatter_softmax``
* ``torch_scatter.composite.scatter_std``
* ``torch_scatter.composite.scatter_logsumexp``

PyTorch Spline Convolution functions

* ``torch_spline_conv.spline_basis``
* ``torch_spline_conv.spline_weighting``

.. _float_16_op_support:

16-bit float operations
=======================

.. warning::
   Handling of ``float16`` operations has been greatly simplified since PopTorch version 3.0. Please read this section
   carefully if you are used to the way this worked prior to version 3.0.

In PopTorch version 3.0 and later, ``float16`` operations are handled straightforwardly by the dispatcher frontend.
Tensors and models can be freely cast to and from ``float16``, and normalization running
statistics can also be retyped by simple casting.

If you have PopTorch code created with a previous version of PopTorch, see :numref:`float_16_migration`.

.. _float_16_migration:

16-bit float migration
======================

Legacy PopTorch code using ``float16`` can be updated for the dispatcher frontend by considering the following points:

* Casts were not well supported by the tracing frontend. They are fully supported by the dispatcher frontend.

* ``opts.Precision.halfFloatCasting()`` was used to switch between ways of resolving ops with
  both ``float32`` and ``float16`` inputs (mixed-precision inputs), either by upcasting the inputs to ``float32``, or by
  downcasting them to ``float16``. This option is not supported under the dispatcher frontend: mixed precision ops are
  now always upcast to ``float32``, in accordance with normal PyTorch behaviour. To recreate the effect of
  ``opts.Precision.halfFloatCasting(poptorch.HalfFloatCastingBehavior.FloatDowncastToHalf)``,
  which was the default behaviour with the tracing frontend, ``float32`` inputs to mixed-precision ops should be
  explicitly cast to ``float16`` before being passed to the op.

* ``opts.Precision.runningStatisticsAlwaysFloat()`` was used to cause the running mean and variance of certain
  normalization ops to be calculated in ``float32`` precision, even though the normalization module itself had been cast
  to ``float16``. This option is not supported in the dispatcher frontend, as the same effect can be achieved by simply
  casting the running statistic tensors back to ``float32`` before running the model.

Gradient computation control
============================

``torch.no_grad`` is supported as a context manager as well as a decorator to suppress the
computation of gradients locally.


================================================
FILE: docs/user_guide/trainingModel.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
# training_model_start
import torch
import poptorch


class ExampleModelWithLoss(torch.nn.Module):
    """A 10-to-10 linear layer which also computes an MSE loss in training."""

    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(10, 10)
        self.loss = torch.nn.MSELoss()

    def forward(self, x, target=None):
        """Return ``(output, loss)`` in training mode, else just ``output``."""
        prediction = self.fc(x)
        if not self.training:
            return prediction
        return prediction, self.loss(prediction, target)


# Fix the seed so that the initial weights and the dummy inputs are the same
# on every run.
torch.manual_seed(0)
model = ExampleModelWithLoss()

# Wrap the model in our PopTorch annotation wrapper.
poptorch_model = poptorch.trainingModel(model)

# Some dummy inputs.
# NOTE: "input" shadows the Python builtin of the same name; the helper
# functions defined later in this file read these module-level variables.
input = torch.randn(10)
target = torch.randn(10)
ones = torch.ones(10)

# Train on IPU.
for i in range(0, 800):
    # Each call here executes the forward pass, loss calculation, and backward
    # pass in one step.
    # Model input and loss function input are provided together.
    poptorch_out, loss = poptorch_model(input, target)
    print(f"{i}: {loss}")

# Copy the trained weights from the IPU back into the host model.
poptorch_model.copyWeightsToHost()

# Execute the trained weights on host.
model.eval()
native_out = model(input)

# The native output should be very close to the PopTorch output, although some
# operations are numerically different and floating point differences can
# accumulate.
torch.testing.assert_close(native_out, poptorch_out, rtol=1e-04, atol=1e-04)
# training_model_end
Model = ExampleModelWithLoss


def train(model):
    """Run a single dummy training step on the IPU.

    The step uses the module-level ``input`` and ``target`` tensors.
    """
    model(input, target)


def train_on_cpu(model):
    """Run a single dummy SGD training step on the CPU.

    The step uses the module-level ``input`` and ``target`` tensors.
    """
    sgd = torch.optim.SGD(model.parameters(), lr=0.1)
    # In training mode the model returns (output, loss); only the loss is
    # needed here.
    step_loss = model(input, target)[1]
    step_loss.backward()
    sgd.step()


def validate(model):
    """Run a dummy validation step: print the model output for ``ones``."""
    prediction = model(ones)
    print(prediction)


# explicit_copy_start
model = Model()
model.eval()

poptorch_inf = poptorch.inferenceModel(model)

# Switch for "poptorch.trainingModel": poptorch_inf will remain in "eval" mode
model.train()
poptorch_train = poptorch.trainingModel(model)

# train on IPU
train(poptorch_train)
torch.save(model.state_dict(), "model.save")  # OK

# Already in "eval" mode
validate(poptorch_inf)  # OK

# switch to "eval" mode for CPU
model.eval()
validate(model)  # OK

# train on CPU
model.train()
train_on_cpu(model)

# Explicit copy needed: the weights were changed on the host by the CPU
# training step above.
poptorch_inf.copyWeightsToDevice()
validate(poptorch_inf)
# explicit_copy_end


================================================
FILE: examples/CMakeLists.txt
================================================
# Register the Python example "${path}/${name}.py" as the CTest test
# "${name}_example", run with python3 from the project binary directory.
#   name: the example's file name without the ".py" extension.
#   path: the directory containing the example file.
# Every example is labelled "examples"; "bert_ipu" is additionally labelled
# "external_data" and every other example "short".
function(add_poptorch_py_example name path)
  message(STATUS "Adding python example '${name}'")

  # The leading ";" makes the value a second list element once it is appended
  # to "examples" below.
  set(extra_labels "")
  if("${name}" STREQUAL "bert_ipu")
    set(extra_labels ";external_data")
  else()
    set(extra_labels ";short")
  endif()
  add_test(NAME "${name}_example"
           COMMAND python3 ${path}/${name}.py
           WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
  set_tests_properties("${name}_example" PROPERTIES LABELS "examples${extra_labels}")
endfunction()

# Every Python file in this directory is an example.
file(GLOB EXAMPLES "${CMAKE_CURRENT_SOURCE_DIR}/*.py")
# With COPY_TESTS set, the examples are installed into the current binary
# directory and the tests run them from there; otherwise the tests run them
# straight from the source directory.
if(COPY_TESTS)
  install(FILES ${EXAMPLES} DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
  set(EXAMPLES_PATH "${CMAKE_CURRENT_BINARY_DIR}")
else()
  set(EXAMPLES_PATH "${CMAKE_CURRENT_SOURCE_DIR}")
endif()

# Register one test per example, named after the file without its extension.
foreach(EXAMPLE ${EXAMPLES})
  get_filename_component(NAME ${EXAMPLE} NAME_WE)
  add_poptorch_py_example(${NAME} ${EXAMPLES_PATH})
endforeach()


================================================
FILE: examples/bert_ipu.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import os
import transformers
import torch
import poptorch

if not poptorch.ipuHardwareIsAvailable():
    os.environ["POPTORCH_IPU_MODEL"] = "1"

tokenizer = transformers.BertTokenizer.from_pretrained(
    'mrm8488/bert-medium-finetuned-squadv2', return_token_type_ids=True)


# For later versions of transformers, we need to wrap the model and set
# return_dict to False
class WrappedModel(torch.nn.Module):
    """Wrapper which always calls the pretrained BERT question answering
    model with ``return_dict=False``."""

    def __init__(self):
        super().__init__()
        pretrained = transformers.BertForQuestionAnswering.from_pretrained(
            'mrm8488/bert-medium-finetuned-squadv2')
        self.wrapped = pretrained

    def forward(self, input_ids, attention_mask):
        outputs = self.wrapped.forward(input_ids,
                                       attention_mask,
                                       return_dict=False)
        return outputs

    def __getattr__(self, attr):
        # Look the attribute up on this module first and fall back to the
        # wrapped model for anything this module does not have.
        try:
            return super().__getattr__(attr)
        except AttributeError:
            return getattr(self.wrapped, attr)


model = WrappedModel()

context = """Scotland is a country that is part of the United Kingdom. Covering the northern third of
            the island of Great Britain, mainland Scotland has a 96 mile (154 km) border with England
            to the southeast and is otherwise surrounded by the Atlantic Ocean to the north and west,
            the North Sea to the northeast and the Irish Sea to the south. In addition, Scotland includes
            more than 790 islands; principally within the Northern Isles and the Hebrides archipelagos."""

questions = [
    "How many islands are there in Scotland?",
    "What sea is to the south of Scotland",
    "Where is England in relation to Scotland?",
    "How long is the border between England and Scotland?"
]

# One batch per question.
batches = len(questions)

# Pipeline the model over two IPUs. You must have at least as many batches (questions) as you have IPUs.
model.wrapped.bert.embeddings.position_embeddings = poptorch.BeginBlock(
    model.wrapped.bert.embeddings.position_embeddings, ipu_id=1)

# Mark model for inference.
opts = poptorch.Options().deviceIterations(batches)
inference_model = poptorch.inferenceModel(model, opts)

# Batch by the number of iterations so we fill the pipeline.
# These are placeholders, one slot per question, filled in by the loop below.
encoding, input_ids, attention_mask = [None] * batches, [[None]] * batches, [
    None
] * batches

# Encode the query and context.
batch_list, atten_list = [], []

# Encode each question for the IPU.
# NOTE(review): "pad_to_max_length" is reported as deprecated in recent
# transformers releases in favour of padding="max_length" - confirm against
# the transformers version this example is run with.
for i in range(0, batches):
    encoding[i] = tokenizer.encode_plus(questions[i],
                                        context,
                                        max_length=110,
                                        pad_to_max_length='right')
    input_ids[i], attention_mask[i] = encoding[i]["input_ids"], encoding[i][
        "attention_mask"]
    batch_list.append(input_ids[i])
    atten_list.append(attention_mask[i])

# Stack the per-question lists into one tensor each.
input_batch = torch.tensor(batch_list)
attention_batch = torch.tensor(atten_list)

print(input_batch.size())
# Execute on IPU.
start_score_pop, end_scores_pop = inference_model(input_batch, attention_batch)

print("Context: " + context)
index = 0
for start_score, end_score in zip(start_score_pop, end_scores_pop):
    # The answer is the span of input tokens from the highest start score to
    # the highest end score (inclusive).
    answer_ids = input_ids[index][torch.argmax(start_score
                                               ):torch.argmax(end_score) + 1]
    answer_tokens = tokenizer.convert_ids_to_tokens(answer_ids,
                                                    skip_special_tokens=True)
    answer = tokenizer.convert_tokens_to_string(answer_tokens)

    print("Question : " + questions[index])
    print("Answer : " + answer)

    index += 1


================================================
FILE: examples/lstm.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import torch
import torch.nn as nn
import poptorch


class SimpleLSTM(nn.Module):
    """A single LSTM layer with 3 input features and 3 hidden features."""

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(3, 3)

    def forward(self, input_tensors, hidden):
        """Return the LSTM output and its final ``(hidden, cell)`` state."""
        output, final_state = self.lstm(input_tensors, hidden)
        final_hidden, final_cell = final_state
        return output, (final_hidden, final_cell)


# A sequence of 5 random steps with 3 features each.
inputs = [torch.randn(1, 3) for _ in range(5)]
# Add the extra 2nd dimension
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))  # random initial state

inference_lstm = poptorch.inferenceModel(SimpleLSTM())
# "hidden" is rebound to the state returned by the LSTM.
out, hidden = inference_lstm(inputs, hidden)

print(out)
print(hidden)


================================================
FILE: examples/mnist.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.


# pylint: disable=too-many-statements
def example():
    """Train a small convolutional network on MNIST on the IPU, then
    validate it.

    The accuracy of the current batch is printed every 10 training batches
    and the overall validation accuracy is printed at the end. Exits with
    status 0 without doing anything if no IPU hardware is available.
    """
    # pylint: disable=import-outside-toplevel
    import sys
    import poptorch
    if not poptorch.ipuHardwareIsAvailable():
        poptorch.logger.warn("This examples requires IPU hardware to run")
        sys.exit(0)

    # pylint: disable=unused-variable, wrong-import-position, reimported, ungrouped-imports, wrong-import-order, import-outside-toplevel
    # mnist_start
    import torch
    import torch.nn as nn
    import torchvision
    import poptorch

    # Normal pytorch batch size
    training_batch_size = 20
    validation_batch_size = 100

    opts = poptorch.Options()
    # Device "step"
    opts.deviceIterations(20)

    # How many IPUs to replicate over.
    opts.replicationFactor(4)

    opts.randomSeed(42)

    # Load MNIST normally.
    training_data = poptorch.DataLoader(
        opts,
        torchvision.datasets.MNIST('mnist_data/',
                                   train=True,
                                   download=True,
                                   transform=torchvision.transforms.Compose([
                                       torchvision.transforms.ToTensor(),
                                       torchvision.transforms.Normalize(
                                           (0.1307, ), (0.3081, ))
                                   ])),
        batch_size=training_batch_size,
        shuffle=True)

    # Load MNIST normally.
    # NOTE(review): the validation loader also uses train=True, so validation
    # runs on the training split - confirm whether train=False was intended.
    val_options = poptorch.Options()
    validation_data = poptorch.DataLoader(
        val_options,
        torchvision.datasets.MNIST('mnist_data/',
                                   train=True,
                                   download=True,
                                   transform=torchvision.transforms.Compose([
                                       torchvision.transforms.ToTensor(),
                                       torchvision.transforms.Normalize(
                                           (0.1307, ), (0.3081, ))
                                   ])),
        batch_size=validation_batch_size,
        shuffle=True,
        drop_last=True)

    # A helper block to build convolution-pool-relu blocks.
    class Block(nn.Module):
        def __init__(self, in_channels, num_filters, kernel_size, pool_size):
            super().__init__()
            self.conv = nn.Conv2d(in_channels,
                                  num_filters,
                                  kernel_size=kernel_size)
            self.pool = nn.MaxPool2d(kernel_size=pool_size)
            self.relu = nn.ReLU()

        def forward(self, x):
            x = self.conv(x)
            x = self.pool(x)
            x = self.relu(x)
            return x

    # Define the network using the above blocks.
    class Network(nn.Module):
        def __init__(self):
            super().__init__()
            self.layer1 = Block(1, 10, 5, 2)
            self.layer2 = Block(10, 20, 5, 2)
            self.layer3 = nn.Linear(320, 256)
            self.layer3_act = nn.ReLU()
            self.layer4 = nn.Linear(256, 10)

            self.softmax = nn.LogSoftmax(1)
            self.loss = nn.NLLLoss(reduction="mean")

        def forward(self, x, target=None):
            x = self.layer1(x)
            x = self.layer2(x)
            # Flatten the feature maps for the linear layers.
            x = x.view(-1, 320)

            x = self.layer3_act(self.layer3(x))
            x = self.layer4(x)
            x = self.softmax(x)

            # The loss is only computed when a target is provided.
            if target is not None:
                loss = self.loss(x, target)
                return x, loss
            return x

    # Create our model.
    model = Network()

    # Create model for training which will run on IPU.
    training_model = poptorch.trainingModel(model, training_data.options)

    # Same model as above, they will share weights (in 'model') which once training is finished can be copied back.
    inference_model = poptorch.inferenceModel(model, validation_data.options)

    def train():
        for batch_number, (data, labels) in enumerate(training_data):
            output, losses = training_model(data, labels)

            if batch_number % 10 == 0:
                print(f"PoptorchIPU loss at batch: {batch_number} is {losses}")

                # Pick the highest probability.
                _, ind = torch.max(output, 1)
                assert training_model.options.output_mode in (
                    poptorch.OutputMode.All, poptorch.OutputMode.Final
                ), "Only 'Final' and 'All' OutputMode supported"
                # If we're using Final: only keep the last labels, no-op if using All
                num_labels = ind.shape[0]
                labels = labels[-num_labels:]
                eq = torch.eq(ind, labels)

                # Count the correct predictions directly: this also gives
                # the right answer when every prediction in the batch is
                # correct (or every prediction is wrong).
                acc = (eq.sum().item() / num_labels) * 100.0

                print(
                    f"Training accuracy: {acc}% from batch of size {num_labels}"
                )
        print("Done training")

    def test():
        correct = 0
        total = 0
        with torch.no_grad():
            for (data, labels) in validation_data:
                output = inference_model(data)

                # Argmax the probabilities to get the highest.
                _, ind = torch.max(output, 1)

                # Compare it against the ground truth for this batch.
                eq = torch.eq(ind, labels)

                # Count the number which are True.
                correct += eq.sum().item()
                total += eq.numel()
        print("Validation: of " + str(total) + " samples we got: " +
              str((correct / total) * 100.0) + "% correct")

    # Train on IPU.
    train()

    test()
    # mnist_end


# AsynchronousDataAccessor must run in the main process
if __name__ == "__main__":
    example()


================================================
FILE: examples/simple_adder.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import torch
import torch.nn as nn
import poptorch

# This simple example demonstrates compiling a model to add
# two tensors together using the IPU.


class SimpleAdder(nn.Module):
    """Module whose forward pass adds its two inputs together."""

    def forward(self, x, y):
        """Return the sum of ``x`` and ``y``."""
        total = x + y
        return total


# Wrap the plain PyTorch module with PopTorch to get the inference model.
model = SimpleAdder()
inference_model = poptorch.inferenceModel(model)

# Two single-element inputs whose sum is known in advance.
t1 = torch.tensor([1.])
t2 = torch.tensor([2.])

# Run the wrapped model once and check the result before reporting success.
result = inference_model(t1, t2)
assert result == 3.0
print("Success")


================================================
FILE: popart_compiler/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(popart_compiler)

# popef is mandatory: configuration stops here if the package cannot be found.
find_package(popef REQUIRED)

# INTERFACE target: it builds nothing itself and only adds types/include to
# the include path of whatever links against it.
add_library(popart_compiler_types INTERFACE)
target_include_directories(popart_compiler_types INTERFACE types/include)

# The compiler itself, built as a shared library from the sources below.
add_library(popart_compiler SHARED
  "source/CodeletsCompilation.cpp"
  "source/Compiler.cpp"
  "source/CompilerImpl.cpp"
  "source/Utils.cpp"
  "source/SessionOptions.cpp"
  "source/custom_operations/Embedding.cpp"
  "source/custom_operations/FastGatherLastDim.cpp"
  "source/custom_operations/HostOp.cpp"
  "source/custom_operations/TorchSoftplus.cpp"
  "source/custom_operations/UpsampleBilinear2d.cpp"
)

# Every header found under include/ and types/include/ is registered as a
# public header of the target, so that the PUBLIC_HEADER rule of
# install(TARGETS popart_compiler ...) picks them up.
file(GLOB_RECURSE popart_compiler_public_headers "${CMAKE_CURRENT_SOURCE_DIR}/include/*.hpp*" "${CMAKE_CURRENT_SOURCE_DIR}/types/include/*.hpp*")
set_target_properties(popart_compiler PROPERTIES
  CXX_STANDARD 14
  PUBLIC_HEADER "${popart_compiler_public_headers}")

# popart_compiler_types is PUBLIC and therefore propagates to targets linking
# against popart_compiler; the remaining libraries are PRIVATE and are only
# used to build this target.
target_link_libraries(popart_compiler PUBLIC
                                      popart_compiler_types
                                      PRIVATE
                                      popef
                                      popart-only
                                      poptorch_logging
                                      poptorch_exception_info
                                      poprithms)

# include/ is exposed to consumers, from the source tree when building and
# from <prefix>/include once installed; source/include is only visible when
# compiling this target.
target_include_directories(popart_compiler
                           PUBLIC
                           $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
                           $<INSTALL_INTERFACE:include>
                           PRIVATE
                           source/include)

# Copy custom codelet sources so that we can install and later pre-compile them
# on-demand, configure_file keeps track of changes and always copies on new
# version. Custom codelets are also copied into the python package during wheel
# creation.
set(CUSTOM_CODELETS
  "UpsampleBilinear2dCodelets.inc.cpp"
  "FastGatherLastDimFwdCodelets.inc.cpp"
  "FastGatherLastDimBwdCodelets.inc.cpp"
)

# COPYONLY: the files are copied verbatim into the build directory, without
# any variable substitution.
foreach(SRC ${CUSTOM_CODELETS})
  configure_file(source/custom_operations/${SRC} ${SRC} COPYONLY)
endforeach()

# Install the shared library and the public headers registered on the target.
install(TARGETS popart_compiler
  LIBRARY
    DESTINATION ${CMAKE_INSTALL_LIBDIR}
  PUBLIC_HEADER
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/popart_compiler
  )

# Install the copies of the codelet sources made above.
# NOTE(review): INSTALL_PYDIR is not set in this file; presumably it is
# defined by the parent project - confirm.
foreach(SRC ${CUSTOM_CODELETS})
  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${SRC}
  DESTINATION ${INSTALL_PYDIR})
endforeach()


================================================
FILE: popart_compiler/include/popart_compiler/CodeletsCompilation.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#ifndef POPART_COMPILER_CODELETS_COMPILATION_HPP
#define POPART_COMPILER_CODELETS_COMPILATION_HPP

#include <memory>

namespace poptorch {
namespace popart_compiler {

// Called from python on each 'import poptorch'. Cache path is expected to be
// a true filesystem path of the installed python package where codelet sources
// are stored.
void setCustomCodeletsPath(const char *cache_path);

// Compile a custom codelet (if not already compiled) and store the output
// file to the path specified with 'setCustomCodeletsPath' above. This can
// safely be called from multiple threads/processes.
// NOTE(review): the returned buffer is presumably the path of the compiled
// codelet as a C string, and 'hw_only_codelet' presumably marks codelets that
// can only be built for hardware targets - confirm both against the
// implementation.
std::unique_ptr<char[]> compileCustomCodeletIfNeeded(const char *src_file_name,
                                                     bool hw_only_codelet);

} // namespace popart_compiler
} // namespace poptorch

#endif // POPART_COMPILER_CODELETS_COMPILATION_HPP


================================================
FILE: popart_compiler/include/popart_compiler/Compiler.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#pragma once

#include <map>
#include <memory>
#include <set>
#include <unordered_map>
#include <utility>
#include <vector>

#include "popart_compiler/CompilerTypes.hpp"
#include "poptorch_logging/LoggingLight.hpp"

namespace popart {
class any;
enum class DataType;
class ConstVoidData;
} // namespace popart

namespace poptorch {
namespace popart_compiler {

namespace detail {
struct CompilerImpl;
struct SessionOptionsImpl;
} // namespace detail

// NOTE(review): presumably throws an error of the requested kind so that the
// error handling paths can be exercised from tests - confirm against the
// implementation.
void throwTestError(TestErrorType type);

// Examines the supplied exception. If it is a popart or poplar exception,
// rethrow it as an ExceptionInfo subclass (which gives easy access to the
// exception detail)
void rethrowPopartOrPoplarException(const std::exception_ptr &eptr,
                                    const char *filename, uint64_t line);

// Set the log level used by PopART.
void setPopartLogLevel(logging::Level level);

// Copies the value and constness of one parameter to another:
// 'source' names the parameter read from 'source_optim' and 'dest' names the
// parameter written in 'dest_optim'.
void copyParam(Optimizer &dest_optim, const Optimizer &source_optim,
               const char *source, const char *dest);

class Compiler;
// Move-only set of options handed to the Compiler constructor. The actual
// storage is hidden behind the opaque _impl pointer.
class SessionOptions {
public:
  SessionOptions();
  SessionOptions(SessionOptions &&);
  ~SessionOptions();
  // Disable copy: Move only
  SessionOptions(const SessionOptions &) = delete;
  SessionOptions &operator=(const SessionOptions &) = delete;

  void setMemoryProportion(std::uint32_t ipu, float memory);
  void setPatternsLevel(std::uint64_t level);
  // Enable or disable a single pattern, identified by name.
  void addPattern(const char *pattern, bool enabled);
  void setTensorLocation(const char *tensor, const char *option,
                         std::uint64_t value);
  // Register a callback taking two ints.
  // NOTE(review): presumably (current progress, total) - confirm.
  void
  setCompilationProgressLogger(const std::function<void(int, int)> &logger);

  // Set the named option to a value of the corresponding type.
  void addStringOption(const char *option, const char *value);
  void addUint64Option(const char *option, std::uint64_t value);
  void addBoolOption(const char *option, bool value);
  void addDoubleOption(const char *option, double value);
  // Insert a string option in an option container (set / list / vector)
  void insertStringOption(const char *option, const char *value);
  // Insert a key / value pair in an option map
  void insertStringPairOption(const char *option, const char *key,
                              const char *value);

  bool broadcastBuffers() const;
  bool hasInputReplication() const;

private:
  std::unique_ptr<detail::SessionOptionsImpl> _impl;
  // Gives Compiler access to the private _impl.
  friend Compiler;
};

// Represents a named attribute used in a custom operation: popart uses
// popart::any to store the different values.
class PopartAttribute {
public:
  // One constructor per supported value type.
  // Templating works with g++ but not clang++
  PopartAttribute(const char *name, const int64_t &value);
  PopartAttribute(const char *name, const std::vector<int64_t> &values);
  PopartAttribute(const char *name, const float &value);
  PopartAttribute(const char *name, const std::vector<float> &values);
  PopartAttribute(const char *name, const std::unique_ptr<char[]> &str);
  PopartAttribute(const char *name,
                  const std::vector<std::unique_ptr<char[]>> &strs);

  // Required for opaque pointer
  PopartAttribute(PopartAttribute &&);
  PopartAttribute &operator=(PopartAttribute &&);
  ~PopartAttribute();

  // Access to the stored value.
  popart::any *getValue();

  // The attribute name, as a C string owned by this object.
  const char *name() const { return _name.get(); }

private:
  // Convert a "const char *" to a std::unique_ptr<const char[]>
  static std::unique_ptr<const char[]> cStrToUP(const char *name);

  // Use a pointer to circumvent the C++ ABI problems with std::string
  std::unique_ptr<const char[]> _name;

  // Use an opaque pointer to avoid the need for popart headers
  std::unique_ptr<popart::any> _any;
};

// A class to store all the data and info required to create a constant in the
// popart builder for convenience. Internally, it is a simple wrapper around
// popart::ConstVoidData.
class PopartConstant {
public:
  // Describe a constant of the given type and shape whose values are at
  // 'data'.
  // NOTE(review): whether 'data' is copied or must outlive this object is not
  // visible from this header - confirm against the implementation.
  PopartConstant(const PopartType &popart_type, const void *data,
                 const std::vector<std::int64_t> &shape);

  ~PopartConstant(); // Required for opaque pointer

  // The wrapped popart::ConstVoidData.
  const popart::ConstVoidData &getPopartData() const { return *_data; }

private:
  // Use an opaque pointer to avoid the need for popart headers
  std::unique_ptr<popart::ConstVoidData> _data;
};

// A class to store a constant which is simply returned (possibly in a tuple
// or list) and is not inserted into Popart.
class HostSideConstant {
public:
  // NOTE(review): the 'data_size' bytes at 'data' are presumably copied into
  // the internal byte buffer - confirm against the implementation.
  HostSideConstant(const PopartType &popart_type, void *data, size_t data_size,
                   std::vector<std::int64_t> shape);

  // Type of the constant, as given at construction.
  PopartType popartType() const { return _popart_type; }

  // Shape of the constant.
  const std::vector<std::int64_t> &shape() const { return _shape; }

  // Copy the stored data to 'ptr'.
  // NOTE(review): the destination presumably has to be large enough for the
  // whole constant; no size is passed in - confirm.
  void copyDataTo(void *ptr) const;

private:
  const PopartType _popart_type;
  // Raw bytes of the constant.
  std::vector<uint8_t> _data;
  std::vector<std::int64_t> _shape;
};

// Interface used to describe a model (inputs, operations, outputs), compile
// it and run it. The implementation is hidden behind the opaque _impl
// pointer.
class Compiler {
public:
  Compiler(bool is_training, const SessionOptions &options);
  ~Compiler();
  Compiler(Compiler &&compiler);

  TensorId addInputTensor(const char *type,
                          const std::vector<std::int64_t> &dims,
                          const char *overlap = "no_overlap");

  TensorId createTensorId(const char *name);

  void setCurrentPythonCodeLocation(const char *torch_node,
                                    const char *filename, std::uint64_t line,
                                    std::uint64_t col);

// The macros below give a meaning to the OP_DECL / OP_DECL_NO_RETURN entries
// reached through the include of SupportedOperations.inc.hpp: each entry
// expands to one member function declaration taking the input tensor ids
// followed by the parameters listed in its ARG(...) / *_ARG(...) items.
// BodyArgs is not used by these declarations (and BODY_ARG expands to
// nothing). All the helper macros are undefined again after the include.
#define INT_VEC std::vector<std::int64_t>
#define FLOAT_VEC std::vector<float>
#define FLOAT float
#define INT std::int64_t
#define BOOL bool
#define CHAR char
#define STRING const char *
#define STRING_VEC std::vector<const char *>
#define NONE
#define ARG(Type, Name) , Type Name
#define POPART_CONST_ARG(Name) , const PopartConstant &Name
#define HOST_SIDE_CONST_ARG(Name) , const HostSideConstant &Name
#define POPART_ATTRIB_VEC_ARG(Name)                                            \
  , std::shared_ptr<std::vector<PopartAttribute>> Name
#define BODY_ARG(Name) NONE

// Create a function decl with the given call and arguments.
#define OP_DECL(Namespace, FuncName, function, OnnxImpl, Args, BodyArgs)       \
  TensorId function(const std::vector<TensorId> &inputs Args);

// Create a function decl with the given call and arguments which returns void.
#define OP_DECL_NO_RETURN(Namespace, FuncName, function, OnnxImpl, Args,       \
                          BodyArgs)                                            \
  void function(const std::vector<TensorId> &inputs Args);

#include "SupportedOperations.inc.hpp"

#undef OP_DECL
#undef OP_DECL_NO_RETURN
#undef BODY_ARG
#undef POPART_ATTRIB_VEC_ARG
#undef HOST_SIDE_CONST_ARG
#undef POPART_CONST_ARG
#undef ARG
#undef NONE
#undef STRING_VEC
#undef STRING
#undef CHAR
#undef BOOL
#undef INT
#undef FLOAT
#undef FLOAT_VEC
#undef INT_VEC

  TensorId addInitializedInputTensor(const char *name, const char *type,
                                     const std::vector<std::int64_t> &dims,
                                     void *data);
  // Same as above with three extra int settings (comm_group_type, shards,
  // variable_retrieval_mode).
  TensorId addInitializedInputTensor(const char *name, const char *type,
                                     const std::vector<std::int64_t> &dims,
                                     void *data, int comm_group_type,
                                     int shards, int variable_retrieval_mode);

  bool tensorIdIsValid(TensorId id) const;
  const char *tensorName(TensorId id) const;

  // NOTE(review): presumably the value returned by getSize() when the size of
  // a tensor cannot be determined - confirm against the implementation.
  static const std::vector<std::int64_t> invalid_size;

  std::vector<std::int64_t> getSize(TensorId id) const;

  std::unique_ptr<char[]> getTensorDTypeString(TensorId id) const;

  bool isHostSideConstant(TensorId id) const;

  void addOutputType(OutputTypeShape type);

  // This function marks |output| as being read back from the device by the
  // host. |output_mode| determines how frequently that should happen.
  // clang-format off
  // "ALL":  Will return all popart batches.
  // "SUM": Will return the sum of all popart batches (i.e. device iterations)
  // "EVERYN": Will return every N batch
  // "FINAL": Will return the last batch only
  // clang-format on
  void addOutputTensor(TensorId output,
                       PopartOutputMode output_mode = PopartOutputMode::N,
                       size_t output_return_period = 1,
                       const char *overlap = "no_overlap");

  // setUpInputOp / setUpOutputOp: one overload per host element type.
  void setUpInputOp(TensorId id, float *ptr,
                    const std::vector<std::int64_t> &dims);

  void setUpInputOp(TensorId id, std::int32_t *ptr,
                    const std::vector<std::int64_t> &dims);

  void setUpInputOp(TensorId id, bool *ptr,
                    const std::vector<std::int64_t> &dims);

  // NOTE(review): 'float16' presumably selects whether the 16-bit values at
  // 'ptr' are half-precision floats rather than int16 - confirm.
  void setUpInputOp(TensorId id, std::int16_t *ptr,
                    const std::vector<std::int64_t> &dims,
                    bool float16 = false);

  // at::ScalarType::Byte
  void setUpInputOp(TensorId id, std::uint8_t *ptr,
                    const std::vector<std::int64_t> &dims);

  // at::ScalarType::Char
  void setUpInputOp(TensorId id, std::int8_t *ptr,
                    const std::vector<std::int64_t> &dims);

  // at::ScalarType::Byte
  void setUpOutputOp(TensorId id, std::uint8_t *ptr,
                     const std::vector<std::int64_t> &dims);

  // at::ScalarType::Char
  void setUpOutputOp(TensorId id, std::int8_t *ptr,
                     const std::vector<std::int64_t> &dims);

  void setUpOutputOp(TensorId id, float *ptr,
                     const std::vector<std::int64_t> &dims);

  void setUpOutputOp(TensorId id, std::int32_t *ptr,
                     const std::vector<std::int64_t> &dims);

  void setUpOutputOp(TensorId id, bool *ptr,
                     const std::vector<std::int64_t> &dims);

  void setUpOutputOp(TensorId id, std::int16_t *ptr,
                     const std::vector<std::int64_t> &dims);

  // Each std::set of tensors represents all the outputs of a node to set
  // the available memory proportion on. This function loops over the outer
  // vector, so the total number of nodes it will set the proportion on
  // will be inputs.size().
  void
  setAvailableMemoryProportion(const std::vector<std::set<TensorId>> &inputs,
                               float availableMemoryProportion);

  void setMatMulSerialization(TensorId matmul, const char *mode,
                              std::uint64_t factor,
                              std::uint64_t keep_precision);
  void clearActiveIpu();
  void setActiveIpu(std::uint64_t stage_id, std::int64_t phase_id,
                    std::int64_t ipu_id);

  void initSession(const std::vector<Optimizer> &opt,
                   const char *export_proto_filename);
  void setRngState(std::uint64_t seed,
                   const std::vector<std::uint32_t> &rng_state);

  std::vector<std::uint32_t> getRngState() const;
  std::uint64_t getRandomSeed() const;

  void saveExecutableToFile(const char *export_filename) const;
  void compileAndPrepareDevice();
  void loadEngineAndConnectStreams();
  void loadExecutableAndPrepareDevice(const char *import_filename);

  // Static helpers: they work on the given file and need no Compiler
  // instance.
  static void
  appendPoptorchMetadataToFile(const char *serialized_poptorch_metadata,
                               size_t metadata_length,
                               const char *export_filename);
  static std::vector<char>
  importPoptorchMetadataFromFile(const char *import_filename);

  TensorId addCPUCallback(const std::vector<TensorId> &inputs,
                          const CallbackMetadata &callback,
                          std::vector<PopartType> input_types,
                          std::vector<std::vector<std::size_t>> input_shapes,
                          std::vector<PopartType> output_types,
                          std::vector<std::vector<std::size_t>> output_shapes);

  void startSubgraph();

  TensorId endForLoop(std::int32_t trip_count, std::int64_t num_outputs,
                      const std::vector<TensorId> &inputs);

  void startIfBlock();

  void startElseBlock();

  TensorId endIfBlock(const TensorId &condition, std::size_t num_outputs);

  void pushNameScope(const char *name);

  void popNameScope();

  TensorId addUntypedInputTensor();
  // Write the weights into IPU memory from the pytorch tensor buffers in the
  // model.
  void copyWeightsToDevice(const std::vector<void *> &host_buffers);

  // Write the named buffers into IPU memory from the pytorch tensor buffers
  // in the model.
  void copyNamedBuffersToDevice(const std::vector<void *> &host_buffers);

  // Read the weights from IPU memory into the pytorch tensor buffers.
  void copyWeightsToHost(const std::vector<void *> &host_buffers);

  // Return the type of the given tensor.
  PopartType getPopartType(TensorId id) const;

  // Execute the compiled popart graph using poplar.
  void run();

  // Update the optimizers currently being run by the graph.
  void updateOptimizers(const std::vector<Optimizer> &optimizers);

  std::uint64_t batchPerStep() const;

  // Return the PopART batch dimensions [DeviceIterations * ReplicationFactor *
  // GradientAccumulation]
  std::uint64_t popartBatchDim() const;

  // Take the above and work out how much of it is being returned. ID must be
  // an anchor. The batch dim will be mutated depending on what the anchor is
  // returning.
  std::uint64_t popartBatchDimForAnchor(TensorId id) const;

  // Return a flat representation of the output types
  // For example: ( T0, T2, (T3, T4)) is represented as:
  // [ Tuple3, Tensor, Tensor, Tuple2, Tensor, Tensor ]
  const std::vector<OutputTypeShape> &outputTypes() const;

  // We return this as a unique char pointer to avoid leaking memory while
  // protecting the ABI boundary.
  std::unique_ptr<char[]> getPopartIR() const;

  // We return the names as unique char pointers to avoid leaking memory while
  // protecting the ABI boundary.
  std::set<std::unique_ptr<char[]>> getTensorNames() const;

  void optimizerGroup(const std::vector<TensorId> &inputs, int64_t group);

  std::vector<TensorMetadata> optimizerTensorMetadataList() const;

  void
  fillHostOptimizerStateTensorData(const std::vector<void *> &host_buffers);

  void
  writeDeviceOptimizerStateTensorData(const std::vector<void *> &host_buffers);

  std::unique_ptr<char[]> getExecutionInfo() const;

  // Multi-convolution: parts are added with addMultiConvPart(), optionally
  // configured with the setMultiConv* functions, and endMultiConv() returns
  // the resulting tensor ids.
  void addMultiConvPart(const std::vector<TensorId> &inputs,
                        const std::vector<int64_t> &dilations,
                        const std::vector<int64_t> &kernel_shape,
                        const std::vector<int64_t> &pads,
                        const std::vector<int64_t> &strides);

  void setMultiConvAvailableMemoryProportions(const std::vector<double> &v);

  void setMultiConvPartialsTypes(const std::vector<int64_t> &partials_types);
  void
  setMultiConvEnableConvDithering(const std::vector<int64_t> &conv_dithering);

  void setMultiConvPlanType(int64_t plan_type);

  void setMultiConvPerConvReservedTiles(int64_t v);

  void setMultiConvCycleBackOff(double c);

  std::vector<TensorId> endMultiConv();

  void setAttribute(const char *attribute, const char *key, const char *value);
  void clearAttribute(const char *attribute, const char *key);

  void detachFromDevice();
  void attachToDevice();
  bool isAttachedToDevice() const;

  Timestamps getTimestamps() const;

  // Returns the number of cycles (on replica 0) run by the IPU for the last
  // model run.
  uint64_t getCycleCount() const;

  size_t getNumInputs() const;
  size_t getNumOutputs() const;

  // Mark named buffer as updatable
  void registerUpdatableNamedBuffer(const TensorId &id);

private:
  void assertTensorIs(PopartType dataType, TensorId id) const;

  // Make sure no overlap is specified for pipelined mode and that the output
  // mode is supported by PopART.
  void verifySettingsForOverlappedIO(PopartOutputMode output_mode);

  std::unique_ptr<detail::CompilerImpl> _impl;

  // Store the cycle count of the last run, if the relevant option is enabled,
  // otherwise no_cycles
  int64_t _cycle_count;
  static constexpr int64_t no_cycles = -1;
  static constexpr const char *poptorch_opaque_name = "poptorch";
};

} // namespace popart_compiler
} // namespace poptorch


================================================
FILE: popart_compiler/include/popart_compiler/CompilerOperationMacros.inc.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
// Auto generated file, do not modify
// Run `python3 scripts/PopParse.py` to regenerate
// clang-format off

// Ops from AiGraphcoreOpset1
OP_DECL(popart, copyvarupdate, copyvarupdate, AiGraphcoreOpset1.copyvarupdate, NONE, BODY_ARG(DEBUG_CONTEXT("Copyvarupdate")))
OP_DECL(popart, batchnormalization, batchnormalization, AiGraphcoreOpset1.batchnormalization, ARG(INT,num_outputs) ARG(FLOAT,epsilon) ARG(FLOAT,momentum) , BODY_ARG(num_outputs) BODY_ARG(epsilon) BODY_ARG(momentum) BODY_ARG(DEBUG_CONTEXT("Batchnormalization")))
OP_DECL(popart, groupnormalization, groupnormalization, AiGraphcoreOpset1.groupnormalization, ARG(INT,num_groups) ARG(FLOAT,epsilon) , BODY_ARG(num_groups) BODY_ARG(epsilon) BODY_ARG(DEBUG_CONTEXT("Groupnormalization")))
OP_DECL(popart, subsample, subsample, AiGraphcoreOpset1.subsample, ARG(INT_VEC,strides) , BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Subsample")))
OP_DECL(popart, printtensor, printtensor, AiGraphcoreOpset1.printtensor, ARG(INT,print_gradient) ARG(STRING,title) ARG(INT,summariseThreshold) ARG(INT,edgeItems) ARG(INT,maxLineWidth) ARG(INT,digits) ARG(INT,floatFormat) ARG(CHAR,separator) ARG(CHAR,openBracket) ARG(CHAR,closeBracket) , BODY_ARG(print_gradient) BODY_ARG(DEBUG_CONTEXT("Printtensor"))BODY_ARG(title) BODY_ARG(summariseThreshold) BODY_ARG(edgeItems) BODY_ARG(maxLineWidth) BODY_ARG(digits) BODY_ARG(floatFormat) BODY_ARG(separator) BODY_ARG(openBracket) BODY_ARG(closeBracket) )
OP_DECL(popart, nop, nop, AiGraphcoreOpset1.nop, NONE, BODY_ARG(DEBUG_CONTEXT("Nop")))
OP_DECL(popart, scale, scale, AiGraphcoreOpset1.scale, ARG(FLOAT,scale) , BODY_ARG(scale) BODY_ARG(DEBUG_CONTEXT("Scale")))
OP_DECL(popart, scaledadd, scaledadd, AiGraphcoreOpset1.scaledadd, ARG(FLOAT,scale0) ARG(FLOAT,scale1) , BODY_ARG(scale0) BODY_ARG(scale1) BODY_ARG(DEBUG_CONTEXT("Scaledadd")))
OP_DECL(popart, lstm, lstm, AiGraphcoreOpset1.lstm, ARG(INT,outputFullSequence) , BODY_ARG(outputFullSequence) BODY_ARG(DEBUG_CONTEXT("Lstm")))
OP_DECL(popart, gelu, gelu, AiGraphcoreOpset1.gelu, NONE, BODY_ARG(DEBUG_CONTEXT("Gelu")))
OP_DECL(popart, geluerf, geluerf, AiGraphcoreOpset1.geluerf, NONE, BODY_ARG(DEBUG_CONTEXT("GeluErf")))
OP_DECL(popart, detach, detach, AiGraphcoreOpset1.detach, NONE, BODY_ARG(DEBUG_CONTEXT("Detach")))
OP_DECL(popart, depthtospace, depthtospace, AiGraphcoreOpset1.depthtospace, ARG(INT,blocksize) ARG(STRING,mode) , BODY_ARG(blocksize) BODY_ARG(mode) BODY_ARG(DEBUG_CONTEXT("Depthtospace")))
OP_DECL(popart, round, round, AiGraphcoreOpset1.round, NONE, BODY_ARG(DEBUG_CONTEXT("Round")))
OP_DECL(popart, dynamicslice, dynamicslice, AiGraphcoreOpset1.dynamicslice, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ARG(INT,noOverlap) , BODY_ARG(axes) BODY_ARG(sizes) BODY_ARG(noOverlap) BODY_ARG(DEBUG_CONTEXT("Dynamicslice")))
OP_DECL(popart, dynamicupdate, dynamicupdate, AiGraphcoreOpset1.dynamicupdate, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ARG(INT,noOverlap) , BODY_ARG(axes) BODY_ARG(sizes) BODY_ARG(noOverlap) BODY_ARG(DEBUG_CONTEXT("Dynamicupdate")))
OP_DECL(popart, dynamiczero, dynamiczero, AiGraphcoreOpset1.dynamiczero, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) , BODY_ARG(axes) BODY_ARG(sizes) BODY_ARG(DEBUG_CONTEXT("Dynamiczero")))
OP_DECL(popart, dynamicadd, dynamicadd, AiGraphcoreOpset1.dynamicadd, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) , BODY_ARG(axes) BODY_ARG(sizes) BODY_ARG(DEBUG_CONTEXT("Dynamicadd")))
OP_DECL(popart, sequenceslice, sequenceslice, AiGraphcoreOpset1.sequenceslice, ARG(INT,zeroUnused) , BODY_ARG(zeroUnused) BODY_ARG(DEBUG_CONTEXT("Sequenceslice")))
OP_DECL(popart, l1loss, l1loss, AiGraphcoreOpset1.l1loss, ARG(FLOAT,lambda) ARG(INT,reduction) , BODY_ARG(lambda) BODY_ARG(static_cast<popart::ReductionType>(reduction)) BODY_ARG(DEBUG_CONTEXT("L1loss")))
OP_DECL(popart, nllloss, nllloss, AiGraphcoreOpset1.nllloss, ARG(INT,reduction) ARG(INT,ignoreIndex) ARG(INT,inputIsLogProbability) , BODY_ARG(static_cast<popart::ReductionType>(reduction)) BODY_ARG(ignoreIndex) BODY_ARG(inputIsLogProbability) BODY_ARG(DEBUG_CONTEXT("Nllloss")))
OP_DECL(popart, identityloss, identityloss, AiGraphcoreOpset1.identityloss, ARG(INT,reduction) , BODY_ARG(static_cast<popart::ReductionType>(reduction)) BODY_ARG(DEBUG_CONTEXT("Identityloss")))
OP_DECL(popart, _ctcloss, _ctcloss, AiGraphcoreOpset1._ctcloss, ARG(INT,reduction) ARG(INT,blank) ARG(STRING,outDataType) ARG(INT,zeroInfinity) , BODY_ARG(static_cast<popart::ReductionType>(reduction)) BODY_ARG(blank) BODY_ARG(outDataType) BODY_ARG(zeroInfinity) BODY_ARG(DEBUG_CONTEXT("_ctcloss")))
OP_DECL(popart, ctcbeamsearchdecoder, ctcbeamsearchdecoder, AiGraphcoreOpset1.ctcbeamsearchdecoder, ARG(INT,blank) ARG(INT,beamWidth) ARG(INT,topPaths) , BODY_ARG(blank) BODY_ARG(beamWidth) BODY_ARG(topPaths) BODY_ARG(DEBUG_CONTEXT("Ctcbeamsearchdecoder")))
OP_DECL(popart, shapeddropout, shapeddropout, AiGraphcoreOpset1.shapeddropout, ARG(INT_VEC,shape) ARG(FLOAT,ratio) , BODY_ARG(shape) BODY_ARG(ratio) BODY_ARG(DEBUG_CONTEXT("Shapeddropout")))
OP_DECL(popart, atan2, atan2, AiGraphcoreOpset1.atan2, NONE, BODY_ARG(DEBUG_CONTEXT("Atan2")))
OP_DECL(popart, expm1, expm1, AiGraphcoreOpset1.expm1, NONE, BODY_ARG(DEBUG_CONTEXT("Expm1")))
OP_DECL(popart, log1p, log1p, AiGraphcoreOpset1.log1p, NONE, BODY_ARG(DEBUG_CONTEXT("Log1p")))
OP_DECL(popart, fmod, fmod, AiGraphcoreOpset1.fmod, NONE, BODY_ARG(DEBUG_CONTEXT("Fmod")))
OP_DECL(popart, remainder, remainder, AiGraphcoreOpset1.remainder, NONE, BODY_ARG(DEBUG_CONTEXT("Remainder")))
OP_DECL(popart, reverse, reverse, AiGraphcoreOpset1.reverse, ARG(INT_VEC,dimensions) , BODY_ARG(dimensions) BODY_ARG(DEBUG_CONTEXT("Reverse")))
OP_DECL(popart, slice, slice, AiGraphcoreOpset1.slice, ARG(INT_VEC,ends) ARG(INT_VEC,starts) ARG(INT_VEC,axes) , BODY_ARG(ends) BODY_ARG(starts) BODY_ARG(axes) BODY_ARG(DEBUG_CONTEXT("Slice")))
OP_DECL(popart, bitwisenot, bitwisenot, AiGraphcoreOpset1.bitwisenot, NONE, BODY_ARG(DEBUG_CONTEXT("Bitwisenot")))
OP_DECL(popart, bitwiseand, bitwiseand, AiGraphcoreOpset1.bitwiseand, NONE, BODY_ARG(DEBUG_CONTEXT("Bitwiseand")))
OP_DECL(popart, bitwiseor, bitwiseor, AiGraphcoreOpset1.bitwiseor, NONE, BODY_ARG(DEBUG_CONTEXT("Bitwiseor")))
OP_DECL(popart, bitwisexor, bitwisexor, AiGraphcoreOpset1.bitwisexor, NONE, BODY_ARG(DEBUG_CONTEXT("Bitwisexor")))
OP_DECL(popart, bitwisexnor, bitwisexnor, AiGraphcoreOpset1.bitwisexnor, NONE, BODY_ARG(DEBUG_CONTEXT("Bitwisexnor")))
OP_DECL(popart, reducemedian, reducemedian, AiGraphcoreOpset1.reducemedian, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducemedian")))
OP_DECL(popart, scatterreduce, scatterreduce, AiGraphcoreOpset1.scatterreduce, ARG(INT,axis_size) ARG(INT,axis) ARG(INT,reduction) ARG(INT, enable_index_broadcast), BODY_ARG(axis_size) BODY_ARG(axis) BODY_ARG(static_cast<popart::ScatterReduction>(reduction)) BODY_ARG(enable_index_broadcast) BODY_ARG(DEBUG_CONTEXT("Scatterreduce")))
OP_DECL(popart, groupedscatterreduce, groupedscatterreduce, AiGraphcoreOpset1.groupedscatterreduce, ARG(INT,axis_size) ARG(INT,axis) ARG(INT,reduction) ARG(INT,group_size) ARG(INT, enable_index_broadcast), BODY_ARG(axis_size) BODY_ARG(axis) BODY_ARG(static_cast<popart::ScatterReduction>(reduction)) BODY_ARG(group_size) BODY_ARG(enable_index_broadcast) BODY_ARG(DEBUG_CONTEXT("Scatterreduce")))
OP_DECL(popart, groupedgather, groupedgather, AiGraphcoreOpset1.groupedgather, ARG(INT,axis) ARG(INT,group_size) , BODY_ARG(axis)  BODY_ARG(group_size) BODY_ARG(DEBUG_CONTEXT("GroupedGather")))
OP_DECL(popart, swish, swish, AiGraphcoreOpset1.swish, NONE, BODY_ARG(DEBUG_CONTEXT("Swish")))
OP_DECL(popart, bucketize, bucketize, AiGraphcoreOpset1.bucketize, ARG(INT,right) , BODY_ARG(right) BODY_ARG(DEBUG_CONTEXT("Bucketize")))
OP_DECL(popart, sort, sort, AiGraphcoreOpset1.sort,  ARG(INT,axis) ARG(INT,descending) ARG(INT,stable) , BODY_ARG(axis)  BODY_ARG(descending)  BODY_ARG(stable) BODY_ARG(DEBUG_CONTEXT("Sort")))
OP_DECL(popart, nearbyint, nearbyint, AiGraphcoreOpset1.nearbyint, NONE, BODY_ARG(DEBUG_CONTEXT("NearbyInt")))
OP_DECL(popart, splinebasis, splinebasis, AiGraphcoreOpset1.splinebasis, ARG(INT,degree) , BODY_ARG(degree) BODY_ARG(DEBUG_CONTEXT("SplineBasis")))
OP_DECL(popart, splineweighting, splineweighting, AiGraphcoreOpset1.splineweighting, NONE, BODY_ARG(DEBUG_CONTEXT("SplineWeighting")))

// Ops from AiOnnxOpset11
OP_DECL(popart, topk, topk, AiOnnxOpset11.topk, ARG(INT,axis) ARG(INT,largest) ARG(INT,sorted), BODY_ARG(axis) BODY_ARG(largest) BODY_ARG(sorted) BODY_ARG(DEBUG_CONTEXT("Topk")))

// Ops from AiOnnxOpset11
OP_DECL(popart, averagepool, averagepool, AiOnnxOpset11.averagepool, ARG(INT_VEC,kernel_shape) ARG(INT,ceil_mode) ARG(INT,count_include_pad) ARG(INT_VEC,pads) ARG(INT_VEC,strides) , BODY_ARG(kernel_shape) BODY_ARG(ceil_mode) BODY_ARG(count_include_pad) BODY_ARG(pads) BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Averagepool")))
OP_DECL(popart, convinteger, convinteger, AiOnnxOpset11.convinteger, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) , BODY_ARG(dilations) BODY_ARG(group) BODY_ARG(kernel_shape) BODY_ARG(pads) BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Convinteger")))
OP_DECL(popart, dequantizelinear, dequantizelinear, AiOnnxOpset11.dequantizelinear, NONE, BODY_ARG(DEBUG_CONTEXT("Dequantizelinear")))
OP_DECL(popart, dropout, dropout, AiOnnxOpset11.dropout, ARG(INT,num_outputs) ARG(FLOAT,ratio) , BODY_ARG(num_outputs) BODY_ARG(ratio) BODY_ARG(DEBUG_CONTEXT("Dropout")))
OP_DECL(popart, isinf, isinf, AiOnnxOpset11.isinf, ARG(INT,detect_negative) ARG(INT,detect_positive) , BODY_ARG(detect_negative) BODY_ARG(detect_positive) BODY_ARG(DEBUG_CONTEXT("Isinf")))
OP_DECL(popart, matmulinteger, matmulinteger, AiOnnxOpset11.matmulinteger, NONE, BODY_ARG(DEBUG_CONTEXT("Matmulinteger")))
OP_DECL(popart, maxpool, maxpool, AiOnnxOpset11.maxpool, ARG(INT,num_outputs) ARG(INT_VEC,kernel_shape) ARG(INT,ceil_mode) ARG(INT_VEC,dilations) ARG(INT_VEC,pads) ARG(INT,storage_order) ARG(INT_VEC,strides) , BODY_ARG(num_outputs) BODY_ARG(kernel_shape) BODY_ARG(ceil_mode) BODY_ARG(dilations) BODY_ARG(pads) BODY_ARG(storage_order) BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Maxpool")))
OP_DECL(popart, mod, mod, AiOnnxOpset11.mod, ARG(INT,fmod) , BODY_ARG(fmod) BODY_ARG(DEBUG_CONTEXT("Mod")))
OP_DECL(popart, nonmaxsuppression, nonmaxsuppression, AiOnnxOpset11.nonmaxsuppression, ARG(INT,center_point_box) , BODY_ARG(center_point_box) BODY_ARG(DEBUG_CONTEXT("Nonmaxsuppression")))
OP_DECL(popart, qlinearconv, qlinearconv, AiOnnxOpset11.qlinearconv, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) , BODY_ARG(dilations) BODY_ARG(group) BODY_ARG(kernel_shape) BODY_ARG(pads) BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Qlinearconv")))
OP_DECL(popart, qlinearmatmul, qlinearmatmul, AiOnnxOpset11.qlinearmatmul, NONE, BODY_ARG(DEBUG_CONTEXT("Qlinearmatmul")))
OP_DECL(popart, quantizelinear, quantizelinear, AiOnnxOpset11.quantizelinear, NONE, BODY_ARG(DEBUG_CONTEXT("Quantizelinear")))
OP_DECL(popart, resize, resize, AiOnnxOpset11.resize, ARG(STRING,coordinate_transformation_mode) ARG(FLOAT,cubic_coeff_a) ARG(INT,exclude_outside) ARG(FLOAT,extrapolation_value) ARG(STRING,mode) ARG(STRING,nearest_mode), BODY_ARG(coordinate_transformation_mode) BODY_ARG(cubic_coeff_a) BODY_ARG(exclude_outside) BODY_ARG(extrapolation_value) BODY_ARG(mode) BODY_ARG(nearest_mode) BODY_ARG(DEBUG_CONTEXT("Resize")))
OP_DECL(popart, reversesequence, reversesequence, AiOnnxOpset11.reversesequence, ARG(INT,batch_axis) ARG(INT,time_axis) , BODY_ARG(batch_axis) BODY_ARG(time_axis) BODY_ARG(DEBUG_CONTEXT("Reversesequence")))
OP_DECL(popart, roialign, roialign, AiOnnxOpset11.roialign, ARG(STRING,mode) ARG(INT,output_height) ARG(INT,output_width) ARG(INT,sampling_ratio) ARG(FLOAT,spatial_scale) , BODY_ARG(mode) BODY_ARG(output_height) BODY_ARG(output_width) BODY_ARG(sampling_ratio) BODY_ARG(spatial_scale) BODY_ARG(DEBUG_CONTEXT("Roialign")))
OP_DECL(popart, thresholdedrelu, thresholdedrelu, AiOnnxOpset11.thresholdedrelu, ARG(FLOAT,alpha) , BODY_ARG(alpha) BODY_ARG(DEBUG_CONTEXT("Thresholdedrelu")))
OP_DECL(popart, upsample, upsample, AiOnnxOpset11.upsample, ARG(STRING,mode) , BODY_ARG(mode) BODY_ARG(DEBUG_CONTEXT("Upsample")))
// Ops from AiOnnxOpset9
OP_DECL(popart, acosh, acosh, AiOnnxOpset11.acosh, NONE, BODY_ARG(DEBUG_CONTEXT("Acosh")))
OP_DECL(popart, asinh, asinh, AiOnnxOpset11.asinh, NONE, BODY_ARG(DEBUG_CONTEXT("Asinh")))
OP_DECL(popart, atanh, atanh, AiOnnxOpset11.atanh, NONE, BODY_ARG(DEBUG_CONTEXT("Atanh")))
OP_DECL(popart, cast, cast, AiOnnxOpset11.cast, ARG(STRING,to) , BODY_ARG(to) BODY_ARG(DEBUG_CONTEXT("Cast")))
OP_DECL(popart, compress, compress, AiOnnxOpset11.compress, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Compress")))
OP_DECL(popart, cosh, cosh, AiOnnxOpset11.cosh, NONE, BODY_ARG(DEBUG_CONTEXT("Cosh")))
OP_DECL(popart, erf, erf, AiOnnxOpset11.erf, NONE, BODY_ARG(DEBUG_CONTEXT("Erf")))
OP_DECL(popart, eyelike, eyelike, AiOnnxOpset11.eyelike, ARG(INT,dtype) ARG(INT,k) , BODY_ARG(dtype) BODY_ARG(k) BODY_ARG(DEBUG_CONTEXT("Eyelike")))
OP_DECL(popart, flatten, flatten, AiOnnxOpset11.flatten, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Flatten")))
OP_DECL(popart, gemm, gemm, AiOnnxOpset11.gemm, ARG(FLOAT,alpha) ARG(FLOAT,beta) ARG(INT,transA) ARG(INT,transB) , BODY_ARG(alpha) BODY_ARG(beta) BODY_ARG(transA) BODY_ARG(transB) BODY_ARG(DEBUG_CONTEXT("Gemm")))
OP_DECL(popart, greater, greater, AiOnnxOpset11.greater, NONE, BODY_ARG(DEBUG_CONTEXT("Greater")))
OP_DECL(popart, isnan, isnan, AiOnnxOpset11.isnan, NONE, BODY_ARG(DEBUG_CONTEXT("Isnan")))
OP_DECL(popart, less, less, AiOnnxOpset11.less, NONE, BODY_ARG(DEBUG_CONTEXT("Less")))
OP_DECL(popart, matmul, matmul, AiOnnxOpset11.matmul, NONE, BODY_ARG(DEBUG_CONTEXT("Matmul")))
OP_DECL(popart, maxunpool, maxunpool, AiOnnxOpset11.maxunpool, ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) , BODY_ARG(kernel_shape) BODY_ARG(pads) BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Maxunpool")))
OP_DECL(popart, meanvariancenormalization, meanvariancenormalization, AiOnnxOpset11.meanvariancenormalization, ARG(INT_VEC,axes) , BODY_ARG(axes) BODY_ARG(DEBUG_CONTEXT("Meanvariancenormalization")))
OP_DECL(popart, nonzero, nonzero, AiOnnxOpset11.nonzero, NONE, BODY_ARG(DEBUG_CONTEXT("Nonzero")))
OP_DECL(popart, onehot, onehot, AiOnnxOpset11.onehot, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Onehot")))
OP_DECL(popart, scatter, scatter, AiOnnxOpset11.scatter, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Scatter")))
OP_DECL(popart, scatterelements, scatterelements, AiOnnxOpset11.scatterelements, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("ScatterElements")))
OP_DECL(popart, shrink, shrink, AiOnnxOpset11.shrink, ARG(FLOAT,bias) ARG(FLOAT,lambd) , BODY_ARG(bias) BODY_ARG(lambd) BODY_ARG(DEBUG_CONTEXT("Shrink")))
OP_DECL(popart, sign, sign, AiOnnxOpset11.sign, NONE, BODY_ARG(DEBUG_CONTEXT("Sign")))
OP_DECL(popart, sinh, sinh, AiOnnxOpset11.sinh, NONE, BODY_ARG(DEBUG_CONTEXT("Sinh")))
OP_DECL(popart, tfidfvectorizer, tfidfvectorizer, AiOnnxOpset11.tfidfvectorizer, ARG(INT,max_gram_length) ARG(INT,max_skip_count) ARG(INT,min_gram_length) ARG(STRING,mode) ARG(INT_VEC,ngram_counts) ARG(INT_VEC,ngram_indexes) ARG(INT_VEC,pool_int64s) ARG(STRING_VEC,pool_strings) ARG(FLOAT_VEC,weights) , BODY_ARG(max_gram_length) BODY_ARG(max_skip_count) BODY_ARG(min_gram_length) BODY_ARG(mode) BODY_ARG(ngram_counts) BODY_ARG(ngram_indexes) BODY_ARG(pool_int64s) BODY_ARG(pool_strings) BODY_ARG(weights) BODY_ARG(DEBUG_CONTEXT("Tfidfvectorizer")))
OP_DECL(popart, where, where, AiOnnxOpset11.where, NONE, BODY_ARG(DEBUG_CONTEXT("Where")))
// Ops from AiOnnxOpset8
OP_DECL(popart, expand, expand, AiOnnxOpset11.expand, NONE, BODY_ARG(DEBUG_CONTEXT("Expand")))
OP_DECL(popart, max, max, AiOnnxOpset11.max, NONE, BODY_ARG(DEBUG_CONTEXT("Max")))
OP_DECL(popart, mean, mean, AiOnnxOpset11.mean, NONE, BODY_ARG(DEBUG_CONTEXT("Mean")))
OP_DECL(popart, min, min, AiOnnxOpset11.min, NONE, BODY_ARG(DEBUG_CONTEXT("Min")))
OP_DECL(popart, sum, sum, AiOnnxOpset11.sum, NONE, BODY_ARG(DEBUG_CONTEXT("Sum")))
// Ops from AiOnnxOpset7
OP_DECL(popart, acos, acos, AiOnnxOpset11.acos, NONE, BODY_ARG(DEBUG_CONTEXT("Acos")))
OP_DECL(popart, add, add, AiOnnxOpset11.add, NONE, BODY_ARG(DEBUG_CONTEXT("Add")))
OP_DECL(popart, logical_and, logical_and, AiOnnxOpset11.logical_and, NONE, BODY_ARG(DEBUG_CONTEXT("Logical_and")))
OP_DECL(popart, asin, asin, AiOnnxOpset11.asin, NONE, BODY_ARG(DEBUG_CONTEXT("Asin")))
OP_DECL(popart, atan, atan, AiOnnxOpset11.atan, NONE, BODY_ARG(DEBUG_CONTEXT("Atan")))
OP_DECL(popart, cos, cos, AiOnnxOpset11.cos, NONE, BODY_ARG(DEBUG_CONTEXT("Cos")))
OP_DECL(popart, div, div, AiOnnxOpset11.div, NONE, BODY_ARG(DEBUG_CONTEXT("Div")))
OP_DECL(popart, equal, equal, AiOnnxOpset11.equal, NONE, BODY_ARG(DEBUG_CONTEXT("Equal")))
OP_DECL(popart, mul, mul, AiOnnxOpset11.mul, NONE, BODY_ARG(DEBUG_CONTEXT("Mul")))
OP_DECL(popart, multinomial, multinomial, AiOnnxOpset11.multinomial, ARG(INT,dtype) ARG(INT,sample_size) ARG(FLOAT,seed) , BODY_ARG(dtype) BODY_ARG(sample_size) BODY_ARG(seed) BODY_ARG(DEBUG_CONTEXT("Multinomial")))
OP_DECL(popart, logical_or, logical_or, AiOnnxOpset11.logical_or, NONE, BODY_ARG(DEBUG_CONTEXT("Logical_or")))
OP_DECL(popart, pow, pow, AiOnnxOpset11.pow, NONE, BODY_ARG(DEBUG_CONTEXT("Pow")))
OP_DECL(popart, sin, sin, AiOnnxOpset11.sin, NONE, BODY_ARG(DEBUG_CONTEXT("Sin")))
OP_DECL(popart, sub, sub, AiOnnxOpset11.sub, NONE, BODY_ARG(DEBUG_CONTEXT("Sub")))
OP_DECL(popart, tan, tan, AiOnnxOpset11.tan, NONE, BODY_ARG(DEBUG_CONTEXT("Tan")))
OP_DECL(popart, logical_xor, logical_xor, AiOnnxOpset11.logical_xor, NONE, BODY_ARG(DEBUG_CONTEXT("Logical_xor")))
// Ops from AiOnnxOpset6
OP_DECL(popart, abs, abs, AiOnnxOpset11.abs, NONE, BODY_ARG(DEBUG_CONTEXT("Abs")))
OP_DECL(popart, argmax, argmax, AiOnnxOpset11.argmax, ARG(INT,axis) ARG(INT,keepdims) , BODY_ARG(axis) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Argmax")))
OP_DECL(popart, argmin, argmin, AiOnnxOpset11.argmin, ARG(INT,axis) ARG(INT,keepdims) , BODY_ARG(axis) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Argmin")))
OP_DECL(popart, ceil, ceil, AiOnnxOpset11.ceil, NONE, BODY_ARG(DEBUG_CONTEXT("Ceil")))
OP_DECL(popart, clip, clip, AiOnnxOpset11.clip, NONE, BODY_ARG(DEBUG_CONTEXT("Clip")))
OP_DECL(popart, concat, concat, AiOnnxOpset11.concat, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Concat")))
OP_DECL(popart, conv, conv, AiOnnxOpset11.conv, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) , BODY_ARG(dilations) BODY_ARG(group) BODY_ARG(kernel_shape) BODY_ARG(pads) BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Conv")))
OP_DECL(popart, convtranspose, convtranspose, AiOnnxOpset11.convtranspose, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,output_padding) ARG(INT_VEC,output_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) , BODY_ARG(dilations) BODY_ARG(group) BODY_ARG(kernel_shape) BODY_ARG(output_padding) BODY_ARG(output_shape) BODY_ARG(pads) BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Convtranspose")))
OP_DECL(popart, elu, elu, AiOnnxOpset11.elu, ARG(FLOAT,alpha) , BODY_ARG(alpha) BODY_ARG(DEBUG_CONTEXT("Elu")))
OP_DECL(popart, exp, exp, AiOnnxOpset11.exp, NONE, BODY_ARG(DEBUG_CONTEXT("Exp")))
OP_DECL(popart, floor, floor, AiOnnxOpset11.floor, NONE, BODY_ARG(DEBUG_CONTEXT("Floor")))
OP_DECL(popart, gather, gather, AiOnnxOpset11.gather, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Gather")))
OP_DECL(popart, globalaveragepool, globalaveragepool, AiOnnxOpset11.globalaveragepool, NONE, BODY_ARG(DEBUG_CONTEXT("Globalaveragepool")))
OP_DECL(popart, globallppool, globallppool, AiOnnxOpset11.globallppool, ARG(INT,p) , BODY_ARG(p) BODY_ARG(DEBUG_CONTEXT("Globallppool")))
OP_DECL(popart, globalmaxpool, globalmaxpool, AiOnnxOpset11.globalmaxpool, NONE, BODY_ARG(DEBUG_CONTEXT("Globalmaxpool")))
OP_DECL(popart, hardsigmoid, hardsigmoid, AiOnnxOpset11.hardsigmoid, ARG(FLOAT,alpha) ARG(FLOAT,beta) , BODY_ARG(alpha) BODY_ARG(beta) BODY_ARG(DEBUG_CONTEXT("Hardsigmoid")))
OP_DECL(popart, hardmax, hardmax, AiOnnxOpset11.hardmax, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Hardmax")))
OP_DECL(popart, identity, identity, AiOnnxOpset11.identity, NONE, BODY_ARG(DEBUG_CONTEXT("Identity")))
OP_DECL(popart, instancenormalization, instancenormalization, AiOnnxOpset11.instancenormalization, ARG(FLOAT,epsilon) , BODY_ARG(epsilon) BODY_ARG(DEBUG_CONTEXT("Instancenormalization")))
OP_DECL(popart, lrn, lrn, AiOnnxOpset11.lrn, ARG(INT,size) ARG(FLOAT,alpha) ARG(FLOAT,beta) ARG(FLOAT,bias) , BODY_ARG(size) BODY_ARG(alpha) BODY_ARG(beta) BODY_ARG(bias) BODY_ARG(DEBUG_CONTEXT("Lrn")))
OP_DECL(popart, leakyrelu, leakyrelu, AiOnnxOpset11.leakyrelu, ARG(FLOAT,alpha) , BODY_ARG(alpha) BODY_ARG(DEBUG_CONTEXT("Leakyrelu")))
OP_DECL(popart, log, log, AiOnnxOpset11.log, NONE, BODY_ARG(DEBUG_CONTEXT("Log")))
OP_DECL(popart, logsoftmax, logsoftmax, AiOnnxOpset11.logsoftmax, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Logsoftmax")))
OP_DECL(popart, lpnormalization, lpnormalization, AiOnnxOpset11.lpnormalization, ARG(INT,axis) ARG(INT,p) , BODY_ARG(axis) BODY_ARG(p) BODY_ARG(DEBUG_CONTEXT("Lpnormalization")))
OP_DECL(popart, lppool, lppool, AiOnnxOpset11.lppool, ARG(INT_VEC,kernel_shape) ARG(INT,p) ARG(INT_VEC,pads) ARG(INT_VEC,strides) , BODY_ARG(kernel_shape) BODY_ARG(p) BODY_ARG(pads) BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Lppool")))
OP_DECL(popart, maxroipool, maxroipool, AiOnnxOpset11.maxroipool, ARG(INT_VEC,pooled_shape) ARG(FLOAT,spatial_scale) , BODY_ARG(pooled_shape) BODY_ARG(spatial_scale) BODY_ARG(DEBUG_CONTEXT("Maxroipool")))
OP_DECL(popart, neg, neg, AiOnnxOpset11.neg, NONE, BODY_ARG(DEBUG_CONTEXT("Neg")))
OP_DECL(popart, logical_not, logical_not, AiOnnxOpset11.logical_not, NONE, BODY_ARG(DEBUG_CONTEXT("Logical_not")))
OP_DECL(popart, pad, pad, AiOnnxOpset11.pad, ARG(STRING,mode), BODY_ARG(mode) BODY_ARG(DEBUG_CONTEXT("Pad")))
OP_DECL(popart, randomnormallike, randomnormallike, AiOnnxOpset11.randomnormallike, ARG(INT,dtype) ARG(FLOAT,mean) ARG(FLOAT,scale) ARG(FLOAT,seed) , BODY_ARG(dtype) BODY_ARG(mean) BODY_ARG(scale) BODY_ARG(seed) BODY_ARG(DEBUG_CONTEXT("Randomnormallike")))
OP_DECL(popart, randomuniformlike, randomuniformlike, AiOnnxOpset11.randomuniformlike, ARG(INT,dtype) ARG(FLOAT,high) ARG(FLOAT,low) ARG(FLOAT,seed) , BODY_ARG(dtype) BODY_ARG(high) BODY_ARG(low) BODY_ARG(seed) BODY_ARG(DEBUG_CONTEXT("Randomuniformlike")))
OP_DECL(popart, reciprocal, reciprocal, AiOnnxOpset11.reciprocal, NONE, BODY_ARG(DEBUG_CONTEXT("Reciprocal")))
OP_DECL(popart, reducel1, reducel1, AiOnnxOpset11.reducel1, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducel1")))
OP_DECL(popart, reducel2, reducel2, AiOnnxOpset11.reducel2, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducel2")))
OP_DECL(popart, reducelogsum, reducelogsum, AiOnnxOpset11.reducelogsum, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducelogsum")))
OP_DECL(popart, reducelogsumexp, reducelogsumexp, AiOnnxOpset11.reducelogsumexp, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducelogsumexp")))
OP_DECL(popart, reducemax, reducemax, AiOnnxOpset11.reducemax, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducemax")))
OP_DECL(popart, reducemean, reducemean, AiOnnxOpset11.reducemean, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducemean")))
OP_DECL(popart, reducemin, reducemin, AiOnnxOpset11.reducemin, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducemin")))
OP_DECL(popart, reduceprod, reduceprod, AiOnnxOpset11.reduceprod, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reduceprod")))
OP_DECL(popart, reducesum, reducesum, AiOnnxOpset11.reducesum, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducesum")))
OP_DECL(popart, reducesumsquare, reducesumsquare, AiOnnxOpset11.reducesumsquare, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducesumsquare")))
OP_DECL(popart, relu, relu, AiOnnxOpset11.relu, NONE, BODY_ARG(DEBUG_CONTEXT("Relu")))
OP_DECL(popart, selu, selu, AiOnnxOpset11.selu, ARG(FLOAT,alpha) ARG(FLOAT,gamma) , BODY_ARG(alpha) BODY_ARG(gamma) BODY_ARG(DEBUG_CONTEXT("Selu")))
OP_DECL(popart, shape, shape, AiOnnxOpset11.shape, NONE, BODY_ARG(DEBUG_CONTEXT("Shape")))
OP_DECL(popart, sigmoid, sigmoid, AiOnnxOpset11.sigmoid, NONE, BODY_ARG(DEBUG_CONTEXT("Sigmoid")))
OP_DECL(popart, size, size, AiOnnxOpset11.size, NONE, BODY_ARG(DEBUG_CONTEXT("Size")))
OP_DECL(popart, softmax, softmax, AiOnnxOpset11.softmax, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Softmax")))
OP_DECL(popart, softplus, softplus, AiOnnxOpset11.softplus, NONE, BODY_ARG(DEBUG_CONTEXT("Softplus")))
OP_DECL(popart, softsign, softsign, AiOnnxOpset11.softsign, NONE, BODY_ARG(DEBUG_CONTEXT("Softsign")))
OP_DECL(popart, spacetodepth, spacetodepth, AiOnnxOpset11.spacetodepth, ARG(INT,blocksize) , BODY_ARG(blocksize) BODY_ARG(DEBUG_CONTEXT("Spacetodepth")))
OP_DECL(popart, split, split, AiOnnxOpset11.split, ARG(INT,num_outputs) ARG(INT,axis) ARG(INT_VEC,split) , BODY_ARG(num_outputs) BODY_ARG(axis) BODY_ARG(split) BODY_ARG(DEBUG_CONTEXT("Split")))
OP_DECL(popart, sqrt, sqrt, AiOnnxOpset11.sqrt, NONE, BODY_ARG(DEBUG_CONTEXT("Sqrt")))
OP_DECL(popart, squeeze, squeeze, AiOnnxOpset11.squeeze, ARG(INT_VEC,axes) , BODY_ARG(axes) BODY_ARG(DEBUG_CONTEXT("Squeeze")))
OP_DECL(popart, tanh, tanh, AiOnnxOpset11.tanh, NONE, BODY_ARG(DEBUG_CONTEXT("Tanh")))
OP_DECL(popart, tile, tile, AiOnnxOpset11.tile, NONE, BODY_ARG(DEBUG_CONTEXT("Tile")))
OP_DECL(popart, transpose, transpose, AiOnnxOpset11.transpose, ARG(INT_VEC,perm) , BODY_ARG(perm) BODY_ARG(DEBUG_CONTEXT("Transpose")))
OP_DECL(popart, unsqueeze, unsqueeze, AiOnnxOpset11.unsqueeze, ARG(INT_VEC,axes) , BODY_ARG(axes) BODY_ARG(DEBUG_CONTEXT("Unsqueeze")))


================================================
FILE: popart_compiler/include/popart_compiler/ManuallyAddedOperations.inc.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
// Hand-written operation declarations. They use the same
// OP_DECL(namespace, funcName, function, implementation, arguments,
// body arguments) layout as the generated table.

// Reshape to a shape given as an integer vector attribute; implemented by
// _impl->reshape rather than by a builder opset call.
OP_DECL(popart, reshape_static_shape, reshape, _impl->reshape,
        ARG(INT_VEC, shape), BODY_ARG(shape))
// Tensor printing through AiGraphcoreOpset1.printtensor. Note that the debug
// context is passed right after print_gradient and before title, not last as
// in the generated declarations.
// NOTE(review): presumably this mirrors printtensor's parameter order —
// confirm against the PopART builder API.
OP_DECL(poptorch, ipu_print_tensor, ipu_print_tensor, AiGraphcoreOpset1.printtensor,
        ARG(INT,print_gradient) ARG(STRING,title) ARG(INT,summariseThreshold) ARG(INT,edgeItems)
        ARG(INT,maxLineWidth) ARG(INT,digits) ARG(INT,floatFormat) ARG(CHAR,separator) ARG(CHAR,openBracket) ARG(CHAR,closeBracket) ,
        BODY_ARG(print_gradient) BODY_ARG(DEBUG_CONTEXT("Printtensor"))BODY_ARG(title) BODY_ARG(summariseThreshold) BODY_ARG(edgeItems)
        BODY_ARG(maxLineWidth) BODY_ARG(digits) BODY_ARG(floatFormat) BODY_ARG(separator) BODY_ARG(openBracket) BODY_ARG(closeBracket))
// Constant taking a single POPART_CONST_ARG argument, forwarded to
// _impl->tensorConstant.
OP_DECL(poptorch, tensor_constant, tensor_constant, _impl->tensorConstant,
        POPART_CONST_ARG(popartConstant), BODY_ARG(popartConstant))
// Constant taking a single HOST_SIDE_CONST_ARG argument, forwarded to
// _impl->hostSideTensorConstant.
OP_DECL(poptorch, host_side_tensor_constant, host_side_tensor_constant,
        _impl->hostSideTensorConstant,
        HOST_SIDE_CONST_ARG(hostSideTensorConstant),
        BODY_ARG(hostSideTensorConstant))

// Padding operations. All three forward to AiOnnxOpset11.pad and differ only
// in the hard-coded mode string ("constant", "reflect" or "edge"). Each one
// passes its own debug context so that the resulting PopART op can be traced
// back to the padding flavour that created it.
OP_DECL(poptorch, constant_pad, constant_pad, AiOnnxOpset11.pad,
        NONE, BODY_ARG("constant") BODY_ARG(DEBUG_CONTEXT("Constantpad")))
OP_DECL(poptorch, reflection_pad, reflection_pad, AiOnnxOpset11.pad,
        NONE, BODY_ARG("reflect") BODY_ARG(DEBUG_CONTEXT("Reflectionpad")))
OP_DECL(poptorch, edge_pad, edge_pad, AiOnnxOpset11.pad, NONE, BODY_ARG("edge")
        BODY_ARG(DEBUG_CONTEXT("Edgepad")))

// Operations implemented by dedicated CompilerImpl member functions
// (_impl->...) instead of a builder opset call. None of them passes a
// DEBUG_CONTEXT body argument.

// Takes no attributes; forwarded to _impl->addNotInPlace.
OP_DECL(poptorch, add_not_in_place, add_not_in_place, _impl->addNotInPlace,
        NONE, NONE)

// User-provided custom operation identified by name, domain and version, with
// an explicit number of outputs and a vector of attributes.
OP_DECL(poptorch, custom_operation, custom_operation, _impl->customOperation,
        ARG(STRING, name) ARG(STRING, domain) ARG(INT, version)
            ARG(INT, num_outputs) POPART_ATTRIB_VEC_ARG(attributes),
        BODY_ARG(name) BODY_ARG(domain) BODY_ARG(version) BODY_ARG(num_outputs)
            BODY_ARG(attributes))

// Declared with OP_DECL_NO_RETURN rather than OP_DECL.
// NOTE(review): presumably because adding an output tensor produces no new
// tensor id — confirm against the macro definitions.
OP_DECL_NO_RETURN(poptorch, addOutputTensor, addOutputTensor,
                  _impl->addOutputTensor, NONE, NONE)

// Random tensor generation: the output shape and dtype are attributes, along
// with the distribution parameters (high/low or mean/scale).
OP_DECL(poptorch, random_uniform, random_uniform, _impl->randomUniform,
        ARG(INT_VEC, shape) ARG(FLOAT, high) ARG(FLOAT, low) ARG(STRING, dtype),
        BODY_ARG(shape) BODY_ARG(high) BODY_ARG(low) BODY_ARG(dtype))

OP_DECL(poptorch, random_normal, random_normal, _impl->randomNormal,
        ARG(INT_VEC, shape) ARG(FLOAT, mean) ARG(FLOAT, scale)
            ARG(STRING, dtype),
        BODY_ARG(shape) BODY_ARG(mean) BODY_ARG(scale) BODY_ARG(dtype))

// Tensor creation from a shape and a dtype string.
OP_DECL(poptorch, ones, ones, _impl->ones,
        ARG(INT_VEC, shape) ARG(STRING, dtype), BODY_ARG(shape) BODY_ARG(dtype))
OP_DECL(poptorch, zeros, zeros, _impl->zeros,
        ARG(INT_VEC, shape) ARG(STRING, dtype), BODY_ARG(shape) BODY_ARG(dtype))

// Takes no attributes; forwarded to _impl->recomputationCheckpoint.
OP_DECL(poptorch, recomputation_checkpoint, recomputation_checkpoint,
        _impl->recomputationCheckpoint, NONE, NONE)

// Unfold parameterised by three integer attributes: dimension, size and step.
OP_DECL(poptorch, unfold, unfold, _impl->unfold,
        ARG(INT, dimension) ARG(INT, size) ARG(INT, step),
        BODY_ARG(dimension) BODY_ARG(size) BODY_ARG(step))

// Takes no attributes; forwarded to _impl->prelu.
OP_DECL(poptorch, prelu, prelu, _impl->prelu, NONE, NONE)

// Operations which need extra types
//
// The helper macros below spell out default-constructed values (empty vectors
// and empty optionals) that are passed as fixed body arguments. They are local
// to this file and are undefined again once the declarations are done.

#define EMPTY_FLOAT_VEC std::vector<float>()
#define EMPTY_STRING_VEC std::vector<std::string>()
#define OPTIONAL_FLOAT nonstd::optional<float>()
#define OPTIONAL_INT nonstd::optional<int64_t>()

// GRU: only hidden_size comes from the caller; every other body argument is a
// fixed value ("forward" direction string, literal 2 and 1, empty vectors and
// an empty optional).
// NOTE(review): the meaning of each positional value depends on the
// AiOnnxOpset11.gru signature — confirm against the PopART builder API.
OP_DECL(poptorch, gru, gru, AiOnnxOpset11.gru, ARG(INT, hidden_size),
        BODY_ARG(2) BODY_ARG(EMPTY_FLOAT_VEC) BODY_ARG(EMPTY_FLOAT_VEC)
        BODY_ARG(EMPTY_STRING_VEC) BODY_ARG(OPTIONAL_FLOAT)
        BODY_ARG("forward") BODY_ARG(hidden_size) BODY_ARG(1)
        BODY_ARG(DEBUG_CONTEXT("Gru")))

// RNN: only the activations string vector comes from the caller; the remaining
// body arguments are fixed ("forward" direction string, literal 2, empty
// vectors and empty optionals).
// NOTE(review): positional meaning depends on the AiOnnxOpset11.rnn
// signature — confirm against the PopART builder API.
OP_DECL(poptorch, rnn, rnn, AiOnnxOpset11.rnn, ARG(STRING_VEC, activations),
        BODY_ARG(2) BODY_ARG(EMPTY_FLOAT_VEC) BODY_ARG(EMPTY_FLOAT_VEC)
        BODY_ARG(activations) BODY_ARG(OPTIONAL_FLOAT) BODY_ARG("forward")
        BODY_ARG(OPTIONAL_INT)
        BODY_ARG(DEBUG_CONTEXT("Rnn")))


#undef EMPTY_STRING_VEC
#undef OPTIONAL_INT
#undef OPTIONAL_FLOAT
#undef EMPTY_FLOAT_VEC


================================================
FILE: popart_compiler/include/popart_compiler/SupportedOperations.inc.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
/*
    OP_DECLS are in the following form:
    OP_DECL(namespace, funcName, function, onnx implementation, arguments, body argument)
     - namespace is the op's namespace
     - funcName is the op name
     - function is the actual op part of the <namespace>:<op> pair and will be
   used to name/call the given function.
     - Onnx implementation is the underlying onnx function which will be
   called.
     - Arguments are the arguments to the op which will be parsed by different
   macros depending on which file this is in.
     - Body arguments are just the names of the arguments so they can be used in
   the cpp file.
*/
#include "CompilerOperationMacros.inc.hpp"
#include "ManuallyAddedOperations.inc.hpp"


================================================
FILE: popart_compiler/include/popart_compiler/Utils.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef POPART_COMPILER_UTILS_HPP
#define POPART_COMPILER_UTILS_HPP

#include <memory>
#include <string>

namespace poptorch {
namespace popart_compiler {

// NOTE(review): presumably reports whether the environment variable selecting
// the IPU model is enabled — implementation not visible here, confirm.
bool ipuModelEnvironmentVariableIsEnabled();

// NOTE(review): presumably the same check for the "small" IPU model
// variant — confirm against the implementation.
bool ipuSmallModelEnvironmentVariableIsEnabled();

// Returns the IPU model version as a string.
std::string getIpuModelVersion();

// Returns the number of tiles per IPU for the given IPU model version string.
int getNumTilesPerIpu(const std::string &ipu_model_version);

// NOTE(review): presumably rounds num_ipus up to the next supported IPU
// count — the rounding rule is not visible here, confirm.
std::uint64_t roundUpNumIPUs(std::uint64_t num_ipus);

bool waitIfIpuIsUnavailable();

bool waitForAWhile();

/** Returns the IPU version of the device if the system contains a device with
 * num_ipus IPUs, -1 if there is such a device but its architecture is unknown,
 * and 0 if there is no device with num_ipus IPUs.
 *
 * Note: This function doesn't check if the devices are currently in use.
 */
std::int64_t ipuHardwareVersion(std::uint64_t num_ipus = 1);

// Converts a C++ string to a unique pointer to a char array; the purpose
// is to return a "string" without using the non ABI-compatible std::string.
std::unique_ptr<char[]> stringToUniquePtr(const std::string &str);

// Returns the dtype int corresponding to the onnx type string
int64_t dtypeIntFromOnnxStr(const char *onnx_type);

// Returns the Onnx datatype as a string corresponding to the dtype int used in
// Onnx and Popart ops which take an int64_t dtype argument, e.g.
// "randomnormal"
const char *onnxStrFromDtypeInt(int64_t dtype);

} // namespace popart_compiler
} // namespace poptorch

#endif // POPART_COMPILER_UTILS_HPP


================================================
FILE: popart_compiler/source/CodeletsCompilation.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include <sys/file.h>
#include <sys/wait.h>
#include <unistd.h>

#include <fstream>
#include <functional>
#include <regex>

#include "popart_compiler/CodeletsCompilation.hpp"
#include "popart_compiler/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace popart_compiler {

namespace {

// Inter-process exclusive lock on an existing file, held for the lifetime of
// the object.
//
// The file is opened read-only and locked with flock(LOCK_EX); the constructor
// blocks until the lock is obtained and raises an error if the file cannot be
// opened or locked. The destructor releases the lock and closes the file.
//
// The object owns a raw file descriptor, so it is neither copyable nor
// assignable: a copy would unlock and close the same descriptor twice.
class ExclusiveFileLock {
public:
  explicit ExclusiveFileLock(const std::string &path)
      : _fd(open(path.c_str(), O_RDONLY)) {
    ERROR_ON_MSG(_fd == -1, "Could not open file " << path);

    // A blocking flock() can be interrupted by a signal handler: retry in
    // that case instead of reporting a spurious failure.
    int lock_result;
    do {
      lock_result = flock(_fd, LOCK_EX);
    } while (lock_result == -1 && errno == EINTR);

    if (lock_result == -1) {
      close(_fd);
      ERROR("Could not obtain an exclusive lock on file " << path);
    }
  }

  ExclusiveFileLock(const ExclusiveFileLock &) = delete;
  ExclusiveFileLock &operator=(const ExclusiveFileLock &) = delete;

  ~ExclusiveFileLock() {
    flock(_fd, LOCK_UN);
    close(_fd);
  }

private:
  // Descriptor of the locked file; always valid once construction succeeded.
  int _fd;
};

// Runs 'popc --version' and extracts the Poplar commit hash from its output.
//
// The hash is taken to be the first run of 10 to 32 lowercase alphanumeric
// characters found in the output. Raises an error if popc cannot be started,
// if its output cannot be read or if no such run is found.
std::string poplarVersion() {
  FILE *stream = popen("popc --version", "r");
  ERROR_ON_MSG(stream == NULL,
               "Unable to read Poplar version. Is Poplar SDK enabled?");

  // Accumulate everything popc prints, one chunk at a time.
  std::string output;
  try {
    char chunk[1024];
    for (;;) {
      if (fgets(chunk, sizeof(chunk), stream) == NULL) {
        break;
      }
      output.append(chunk);
    }
  } catch (const std::exception &e) {
    // Do not leak the stream when appending to the string failed.
    pclose(stream);
    ERROR(
        "Unable to read the output of 'popc --version'. Reason: " << e.what());
  }

  ERROR_ON_MSG(pclose(stream) == -1,
               "Unable to read the output of 'popc --version'. Reason: "
                   << strerror(errno));

  const std::regex hash_pattern("([a-z0-9]{10,32})");
  std::smatch hash_match;
  if (!std::regex_search(output, hash_match, hash_pattern)) {
    ERROR("Unable to parse the output of 'popc --version'.");
  }
  return hash_match.str();
}

// Computes a hash of the contents of a file at the specified path.
//
// The whole file is loaded in memory and hashed with std::hash<std::string>.
// Raises an error if the file cannot be opened, if its size cannot be
// determined or if its contents cannot be read in full, so that a hash is
// never computed from a partially filled buffer.
std::size_t getFileContentHash(const std::string &path) {
  // Binary mode: the hash must reflect the exact bytes stored on disk.
  std::ifstream file(path, std::ios::in | std::ios::binary);
  ERROR_ON_MSG(!file.is_open(), "Could not open file " << path);

  std::string buffer;
  try {
    file.seekg(0, std::ios::end);
    // tellg() returns -1 on failure (and sets failbit, checked below).
    const std::streamoff size = file.tellg();
    if (size > 0) {
      buffer.resize(static_cast<std::size_t>(size));
      file.seekg(0);
      file.read(&buffer[0], size);
    }
  } catch (const std::exception &e) {
    ERROR("Could not read file " << path << ". Reason: " << e.what());
  }

  // Streams do not throw by default: a failed seek or a short read is only
  // visible through the stream state.
  ERROR_ON_MSG(file.fail(), "Could not read file "
                                << path
                                << ". Reason: reading its contents failed.");

  return std::hash<std::string>()(buffer);
}

// Returns the path of the compiled codelet matching a codelet source file.
//
// Final path is of form:
// <src_file_path_without_extension>-<src_hash>-<poplar_hash>.gp
//
// The '.inc.cpp' extension is only removed when the source path actually ends
// with it; any other path is used unchanged as the prefix, instead of blindly
// dropping its last 8 characters.
std::string compiledCodeletPath(const std::string &src_file_path) {
  const std::string src_extension = ".inc.cpp";

  const std::size_t src_hash = getFileContentHash(src_file_path);
  const std::string poplar_version = poplarVersion();

  std::string out_file_path = src_file_path;
  const bool has_src_extension =
      out_file_path.size() >= src_extension.size() &&
      out_file_path.compare(out_file_path.size() - src_extension.size(),
                            src_extension.size(), src_extension) == 0;
  if (has_src_extension) {
    out_file_path.erase(out_file_path.size() - src_extension.size());
  }

  out_file_path += "-";
  out_file_path += std::to_string(src_hash);
  out_file_path += "-";
  out_file_path += poplar_version;
  out_file_path += ".gp";
  return out_file_path;
}

void compileCodelet(const std::string &src_file_path,
                    const std::string &out_file_path,
                    const std::string &target) {
  int pipe_fd[2];
  ERROR_ON_MSG(pipe(pipe_fd) == -1,
               "Could not compile codelet "
                   << src_file_path
                   << ", pipe failed. Reason: " << strerror(errno));

  pid_t child_pid = fork();
  ERROR_ON_MSG(child_pid == -1,
               "Could not compile codelet "
                   << src_file_path
                   << ", fork failed. Reason: " << strerror(errno));

  if (child_pid == 0) {
    // No reason to ERROR_ON_MSG as we can't see stdout/stderr at this point.
    ERROR_ON(close(pipe_fd[0]) == -1);
    ERROR_ON(setpgid(0, 0) == -1);
    // Pipe stdout and stderr to the parent process.
    ERROR_ON(dup2(pipe_fd[1], STDOUT_FILENO) == -1);
    ERROR_ON(dup2(pipe_fd[1], STDERR_FILENO) == -1);
    ERROR_ON_MSG(close(pipe_fd[1]) == -1,
                 "Could not compile codelet "
                     << src_file_path
                     << ", closing child write pipe failed. Reason: "
                     << strerror(errno));

    char *const argv[] = {const_cast<char *>("popc"),
                          const_cast<char *>("-target"),
                          const_cast<char *>(target.c_str()),
                          const_cast<char *>("-O3"),
                          const_cast<char *>(src_file_path.c_str()),
                          const_cast<char *>("-o"),
                          const_cast<char *>(out_file_path.c_str()),
                          NULL};

    std::string path_env_var = "PATH=" + std::string(std::getenv("PATH"));
    char *const env[] = {const_cast<char *>(path_env_var.c_str()), NULL};

    execvpe("popc", argv, env);
    // 'exec' only returns on failure.
    _exit(EXIT_FAILURE);
  } else {
    // Close the write end.
    ERROR_ON_MSG(close(pipe_fd[1]) == -1,
                 "Could not compile codelet "
                     << src_file_path
                     << ", closing parent write pipe failed. Reason: "
                     << strerror(errno));
    int status;
    ERROR_ON_MSG(waitpid(child_pid, &status, 0) == -1,
                 "Could not compile codelet "
                     << src_file_path
                     << ", waiting for child process failed. Reason: "
                     << strerror(errno));

    // Return on success and report errors on failures.
    std::string exit_reason;
    if (WIFEXITED(status)) {
      if (WEXITSTATUS(status) == 0) {
        // Child exited successfully.
        ERROR_ON_MSG(close(pipe_fd[0]) == -1,
                     "Could not compile codelet "
                         << src_file_path
                         << ", closing parent read pipe failed. Reason: "
                         << strerror(errno));
        return;
      }
      // Child exited with non-zero code.
      exit_reason = "child failed with exit code ";
      exit_reason += std::to_string(WEXITSTATUS(status));
      exit_reason += ".";

      // Read child stdout and stderr and let the user know what happened.
      FILE *stream = fdopen(pipe_fd[0], "r");
      std::string output;
      try {
        char buffer[1024];
        while (fgets(buffer, sizeof(buffer), stream) != NULL) {
          output += buffer;
        }
        exit_reason += " 'popc' output was:\n";
        exit_reason += output;
      } catch (const std::exception &) {
        // Only report that 'popc' command failed and ignore this error.
      }
      fclose(stream);
    } else if (WIFSIGNALED(status)) {
      // Child killed by a signal.
      exit_reason = "child killed with a signal ";
      exit_reason += std::to_string(WTERMSIG(status));
      exit_reason += " (";
      exit_reason += strsignal(WTERMSIG(status));
      exit_reason += ").";
    } else {
      exit_reason = "child failure unknown.";
    }

    close(pipe_fd[0]);
    ERROR("Could not compile codelet " << src_file_path << ", " << exit_reason);
  }
}

// True filesystem python package path where codelet sources are stored.
// It gets initialized on first 'import poptorch' from python.
std::string custom_codelets_path;

} // namespace

// Records the directory holding the custom codelet sources. Only the first
// call has an effect: once a non-empty path is stored, later calls are
// ignored.
void setCustomCodeletsPath(const char *cache_path) {
  if (!custom_codelets_path.empty()) {
    return;
  }
  custom_codelets_path = cache_path;
}

// Returns the path of the compiled version of the custom codelet
// 'src_file_name' (looked up in the custom codelets directory), compiling it
// first if no compiled file exists yet.
//
// The source file is locked for the duration of the call so that a single
// process performs the compilation. When 'hw_only_codelet' is set, the codelet
// is compiled for the detected IPU hardware only and an error is raised if no
// hardware version can be inferred; otherwise it is compiled for the CPU plus
// either the detected hardware or the IPU model version.
std::unique_ptr<char[]> compileCustomCodeletIfNeeded(const char *src_file_name,
                                                     bool hw_only_codelet) {
  logging::LogContext ctx("CompileCustomCodeletIfNeeded");
  logging::debug("Inspecting whether custom codelet {} needs to be compiled",
                 src_file_name);

  // Should never happen.
  ERROR_ON(custom_codelets_path.empty());

  const std::string src_file_path =
      custom_codelets_path + "/" + src_file_name;

  // Lock the src file to make sure only a single process does the compilation.
  ExclusiveFileLock lock(src_file_path);

  const std::string out_file_path = compiledCodeletPath(src_file_path);

  // Nothing to do when the compiled codelet can already be opened.
  if (std::ifstream(out_file_path).is_open()) {
    logging::debug("Custom codelet {} already compiled", src_file_name);
    return stringToUniquePtr(out_file_path);
  }

  const std::int64_t hw_version = ipuHardwareVersion();
  // 0: no device found, -1: device with an unknown architecture.
  const bool no_usable_hw = hw_version == 0 || hw_version == -1;

  std::string target;
  if (hw_only_codelet) {
    ERROR_ON_MSG(
        hw_version == 0 || hw_version == -1,
        "Can't infer IPU hardware version, are there any IPUs in the system?");
    target = "ipu" + std::to_string(hw_version);
  } else {
    target = "cpu,";
    target += no_usable_hw ? getIpuModelVersion()
                           : "ipu" + std::to_string(hw_version);
  }

  logging::debug("Compiling custom codelet {} for target {}", src_file_name,
                 target);

  compileCodelet(src_file_path, out_file_path, target);
  return stringToUniquePtr(out_file_path);
}

} // namespace popart_compiler
} // namespace poptorch


================================================
FILE: popart_compiler/source/Compiler.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <atomic>
#include <chrono>
#include <cstring>
#include <fstream>
#include <iostream>
#include <stack>
#include <string>
#include <thread>

#include <spdlog/fmt/fmt.h>
#include <spdlog/fmt/ostr.h>

#include <popart/graphtransformer.hpp>
#include <popart/ndarraywrapper.hpp>
#include <popart/optimizer.hpp>
#include <popart/popx/devicex.hpp>
#include <popart/variablesettings.hpp>
#include <popef/Reader.hpp>
#include <popef/Writer.hpp>
#include <poplar/exceptions.hpp>
#include <poputil/exceptions.hpp>

#include "popart_compiler/Compiler.hpp"
#include "popart_compiler/CompilerImpl.hpp"
#include "popart_compiler/CustomOps.hpp"
#include "popart_compiler/MultiConvBuilder.hpp"
#include "popart_compiler/PopartEnums.hpp"
#include "popart_compiler/SessionOptionsImpl.hpp"
#include "popart_compiler/Utils.hpp"

#include "poptorch_err/ExceptionInfo.hpp"

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace popart_compiler {
namespace {

// Writes the model proto of 'builder' to 'export_proto_filename'.
//
// Nothing is exported when the filename is null or empty. The proto is written
// as human readable text unless the POPTORCH_EXPORT_PROTO_AS_BINARY
// environment variable is set to a non-zero integer, in which case it is
// written as binary. std::stoi throws if the variable is set to something
// that is not an integer.
//
// Raises an error if the file cannot be opened or written, instead of
// silently exporting nothing.
void saveModelProtoIfNeeded(popart::Builder *builder,
                            const char *export_proto_filename) {
  // A null pointer must not be used to build a std::string.
  if (export_proto_filename == nullptr || *export_proto_filename == '\0') {
    return;
  }
  const std::string filename = export_proto_filename;

  bool human_readable = true;
  if (const char *proto_as_bin =
          std::getenv("POPTORCH_EXPORT_PROTO_AS_BINARY")) {
    human_readable = std::stoi(proto_as_bin) == 0;
  }
  if (human_readable) {
    logging::info("Exporting model proto as text (Set "
                  "POPTORCH_EXPORT_PROTO_AS_BINARY=1 to export as binary)");
  } else {
    logging::info("Exporting model proto as binary (Set "
                  "POPTORCH_EXPORT_PROTO_AS_BINARY=0 to export as human "
                  "readable text)");
  }

  // Important: popart_compiler is compiled using C++ 14 and therefore
  // doesn't have access to the filesystem utilities so the caller is
  // responsible for making sure the directories exist and the
  // filename is a valid filename.
  //
  // A binary proto goes through a binary stream so that its bytes are never
  // translated on the way to the file.
  std::ofstream fs(filename, human_readable
                                 ? std::ios::out
                                 : std::ios::out | std::ios::binary);
  ERROR_ON_MSG(!fs.is_open(),
               "Could not open " << filename << " to export the model proto");

  fs << builder->getModelProto(human_readable);
  fs.close();
  ERROR_ON_MSG(fs.fail(),
               "Could not write the model proto to " << filename);
}

// Argument filter applied to every op argument before it is handed to the
// PopART builder. Strings cross the ABI boundary as const char *, so vectors
// of C strings are turned back into vectors of std::string here; every other
// type is perfectly forwarded untouched.
template <typename T> T convertType(T &&t) {
  // Generic case: pass the argument through unchanged.
  return std::forward<T>(t);
}

std::vector<std::string> convertType(std::vector<const char *> v) {
  // Build one std::string per C string, preserving the order.
  std::vector<std::string> converted;
  converted.reserve(v.size());
  for (const char *c_str : v) {
    converted.emplace_back(c_str);
  }
  return converted;
}

// Convert an overlap string to a PopART TileSet and Exchange Strategy
std::pair<popart::TileSet, popart::ExchangeStrategy>
exchangeStrToPopartEnum(const char *overlap) {
  std::pair<popart::TileSet, popart::ExchangeStrategy> tile_set_and_strat(
      popart::TileSet::Compute, popart::ExchangeStrategy::JustInTime);

  if (strcmp(overlap, "overlap_accumulation_loop") == 0) {
    tile_set_and_strat.first = popart::TileSet::IO;
    tile_set_and_strat.second = popart::ExchangeStrategy::OverlapInnerLoop;
  } else if (strcmp(overlap, "overlap_device_iteration_loop") == 0) {
    tile_set_and_strat.first = popart::TileSet::IO;
    tile_set_and_strat.second = popart::ExchangeStrategy::OverlapLoops;
  } else {
    ERROR_ON(strcmp(overlap, "no_overlap") != 0);
  }

  return tile_set_and_strat;
}

// Multiple outputs case. For now every output is added to the graph and they
// are all given the same execution strategy attributes, but only one of them
// is returned. This means only one output can be used by user IR (the others
// can still be used by the backend via transformations).
template <typename T> struct HandleOutput {
  TensorId operator()(T &in, bool loss, detail::CompilerImpl *_impl) {
    ERROR_ON_MSG(loss, "Unreachable internal error: no operation with multiple "
                       "returns is expected to be a loss.");

    // Register every output and collect them so the execution strategy
    // attributes can be set on all of them in one call.
    std::set<popart::TensorId> new_ids;
    for (const popart::TensorId &tensor : in) {
      _impl->ids.push_back(tensor);
      new_ids.insert(tensor);
    }

    _impl->setExecutionStrategyAttributes(new_ids);

    // The first tensor added above is the sole return of this IR op.
    const auto total_ids = _impl->ids.size();
    const auto num_added = in.size();
    return total_ids - num_added;
  }
};

// Single tensor output case: registers the PopART tensor and returns the
// PopTorch id (its index in _impl->ids).
template <> struct HandleOutput<popart::TensorId> {
  TensorId operator()(const popart::TensorId &in, bool loss,
                      detail::CompilerImpl *_impl) {
    popart::Builder *builder = _impl->active_builder;

    // Apply the available memory proportion if one was set for the active
    // IPU.
    auto &proportions = _impl->options.available_memory_proportion;
    const auto proportion = proportions.find(_impl->active_ipu);
    if (proportion != proportions.end()) {
      logging::info("Setting memory proportion on tensor {} to {}. On IPU {}",
                    in, proportion->second, proportion->first);
      builder->setAvailableMemoryProportion(in, proportion->second);
    }

    _impl->ids.push_back(in);

    // Only set the execution strategy attributes if the node doesn't already
    // carry a pipeline stage or an execution phase.
    const bool already_placed =
        builder->nodeHasAttribute(popart::sPipelineStageAttribute, {in}) ||
        builder->nodeHasAttribute(popart::sExecutionPhaseAttribute, {in});
    if (!already_placed) {
      _impl->setExecutionStrategyAttributes({in});
    }

    // Remember the tensor as the loss of the model if requested.
    if (loss) {
      _impl->loss = in;
    }

    // The tensor was appended last: its id is the last index.
    return _impl->ids.size() - 1;
  }
};

// Host side constant case: the op already returned a PopTorch TensorId, so
// nothing gets registered here.
template <> struct HandleOutput<TensorId> {
  // Returns `in` unchanged after checking that it refers to a host side
  // constant (an error is raised otherwise). `loss` is ignored.
  TensorId operator()(TensorId in, bool loss, detail::CompilerImpl *_impl) {
    UNUSED(loss);
    ERROR_ON(!_impl->isHostSideConstant(in));
    return in;
  }
};

// Whitelist of the operations treated as losses. PopART needs to know which
// operations are losses so they can be marked by the session.
//
// Returns true only if `operation` is exactly "popart::identityloss".
bool IsLoss(const std::string &operation) {
  static const char *const identity_loss = "popart::identityloss";
  return operation.compare(identity_loss) == 0;
}

} // namespace

// Copy the value and the "is constant" flag of the parameter named `source`
// in source_optim to the parameter named `dest` in dest_optim.
//
// Nothing happens if either parameter cannot be found. If several parameters
// share the same name the last one is used.
void copyParam(Optimizer &dest_optim, const Optimizer &source_optim,
               const char *source, const char *dest) {
  // Locate the parameter to read from.
  const float *src_value = nullptr;
  const bool *source_is_const = nullptr;
  for (const auto &candidate : source_optim.parameters) {
    if (strcmp(static_cast<const char *>(candidate.name), source) != 0) {
      continue;
    }
    src_value = &candidate.value;
    source_is_const = &candidate.is_const;
  }

  // Locate the parameter to write to.
  float *dst_value = nullptr;
  bool *dest_is_const = nullptr;
  for (auto &candidate : dest_optim.parameters) {
    if (strcmp(static_cast<const char *>(candidate.name), dest) != 0) {
      continue;
    }
    dst_value = &candidate.value;
    dest_is_const = &candidate.is_const;
  }

  // Both ends are needed to perform the copy.
  if (src_value == nullptr || dst_value == nullptr) {
    return;
  }

  ERROR_ON(!source_is_const);
  ERROR_ON(!dest_is_const);

  // Log before overwriting so the previous destination value is reported.
  logging::debug("Set {} ({}) to {} ({})", dest, *dst_value, source,
                 *src_value);

  *dst_value = *src_value;
  *dest_is_const = *source_is_const;
}

// Attribute constructors: each one stores a copy of the name and wraps the
// value in a popart::any.

// Integer attribute.
PopartAttribute::PopartAttribute(const char *name, const int64_t &value)
    : _name(stringToUniquePtr(name)),
      _any(std::make_unique<popart::any>(value)) {}

// List of integers attribute.
PopartAttribute::PopartAttribute(const char *name,
                                 const std::vector<int64_t> &values)
    : _name(stringToUniquePtr(name)),
      _any(std::make_unique<popart::any>(values)) {}

// Float attribute.
PopartAttribute::PopartAttribute(const char *name, const float &value)
    : _name(stringToUniquePtr(name)),
      _any(std::make_unique<popart::any>(value)) {}

// List of floats attribute.
PopartAttribute::PopartAttribute(const char *name,
                                 const std::vector<float> &values)
    : _name(stringToUniquePtr(name)),
      _any(std::make_unique<popart::any>(values)) {}

// String attribute: the characters are copied into a std::string.
PopartAttribute::PopartAttribute(const char *name,
                                 const std::unique_ptr<char[]> &str)
    : _name(stringToUniquePtr(name)),
      _any(std::make_unique<popart::any>(std::string(str.get()))) {}

// List of strings attribute: every C string is copied into a std::string and
// the resulting vector is stored in the popart::any.
PopartAttribute::PopartAttribute(
    const char *name, const std::vector<std::unique_ptr<char[]>> &strs)
    : _name(stringToUniquePtr(name)) {
  std::vector<std::string> converted;
  converted.reserve(strs.size());
  std::transform(strs.begin(), strs.end(), std::back_inserter(converted),
                 [](const std::unique_ptr<char[]> &c_str) {
                   return std::string(c_str.get());
                 });
  _any = std::make_unique<popart::any>(std::move(converted));
}

// Move operations and destructor are defaulted in this file rather than in
// the header.
// NOTE(review): presumably so that popart::any only needs to be a complete
// type in this translation unit - confirm against the header.
PopartAttribute::PopartAttribute(PopartAttribute &&) = default;
PopartAttribute &PopartAttribute::operator=(PopartAttribute &&) = default;
PopartAttribute::~PopartAttribute() = default;

// Returns a non-owning pointer to the stored value: the attribute keeps the
// ownership.
popart::any *PopartAttribute::getValue() { return _any.get(); }

// Wrap `data` in a popart::ConstVoidData described by the given type and
// shape.
//
// Raises an error for DOUBLE constants: they are expected to have been
// demoted to float before reaching this point.
PopartConstant::PopartConstant(const PopartType &popart_type, const void *data,
                               const std::vector<std::int64_t> &shape) {
  ERROR_ON_MSG(popart_type == PopartType::DOUBLE,
               "Adding a double constant is not supported. "
               "This should have been demoted to a float");

  const auto type_name = toPopartTypeStr(popart_type);
  const popart::TensorInfo tensor_info{type_name, shape};
  _data = std::make_unique<popart::ConstVoidData>(data, tensor_info);
}

// Defaulted out of line.
PopartConstant::~PopartConstant() = default;

// Create a host side constant owning a copy of the data_size bytes pointed
// to by `data`.
//
// popart_type: type of the elements of the constant.
// data:        buffer to copy from; may only be null if data_size is 0.
// data_size:   size of the buffer in bytes.
// shape:       shape of the constant.
//
// Raises an error if `data` is null while data_size is not 0.
HostSideConstant::HostSideConstant(const PopartType &popart_type, void *data,
                                   size_t data_size,
                                   std::vector<std::int64_t> shape)
    : _popart_type(popart_type), _shape(std::move(shape)) {
  ERROR_ON_MSG(data == nullptr && data_size > 0,
               "Internal error: null data pointer for a non-empty host side "
               "constant.");
  _data.resize(data_size);
  // Passing a null pointer to memcpy is undefined behaviour even for a size
  // of 0 and both pointers can be null for an empty constant: skip the copy.
  if (data_size > 0) {
    std::memcpy(_data.data(), data, data_size);
  }
}

// Copy the bytes of the constant to `ptr`, which must point to a buffer at
// least as large as the constant. Nothing is copied for an empty constant.
void HostSideConstant::copyDataTo(void *ptr) const {
  // Passing a null pointer to memcpy is undefined behaviour even for a size
  // of 0 and an empty buffer may have a null data pointer: skip the copy.
  if (_data.empty()) {
    return;
  }
  std::memcpy(ptr, _data.data(), _data.size());
}

// Add a graph input of the given type and shape.
//
// `overlap` selects the tile set and exchange strategy used to transfer the
// input (see exchangeStrToPopartEnum). Returns the PopTorch id of the new
// tensor.
TensorId Compiler::addInputTensor(const char *type,
                                  const std::vector<std::int64_t> &dims,
                                  const char *overlap) {
  // Describe the new tensor.
  const popart::TensorInfo tensor_info{type, dims};

  // Configure how the input gets transferred to the device.
  const auto exchange = exchangeStrToPopartEnum(overlap);
  const popart::TileSet tile_set = exchange.first;
  const popart::ExchangeStrategy strategy = exchange.second;
  if (strategy != popart::ExchangeStrategy::JustInTime) {
    // Any strategy other than just-in-time means overlapped IO is in use.
    _impl->using_overlapped_io = true;
  }

  popart::InputSettings input_settings;
  input_settings.setTileSet(tile_set);
  input_settings.setExchangeStrategy(strategy);

  // Register the tensor both as an input and in the id table.
  const auto new_input =
      _impl->active_builder->addInputTensor(tensor_info, input_settings);
  _impl->inputs.push_back(new_input);
  _impl->ids.push_back(new_input);

  // The PopTorch id is the index of the tensor in the id table.
  return _impl->ids.size() - 1;
}

// Register the PopART tensor called `name` in the id table and return the
// PopTorch id (index in the table) it was given.
TensorId Compiler::createTensorId(const char *name) {
  _impl->ids.push_back(popart::TensorId(name));
  const auto last_index = _impl->ids.size() - 1;
  return last_index;
}

// The macros below are only defined while SupportedOperations.inc.hpp is
// included: they expand each operation listed in that file into the
// definition of the matching Compiler method. They are all undefined again
// right after the include.

// Types of the arguments of the generated methods.
#define INT_VEC std::vector<std::int64_t>
#define FLOAT_VEC std::vector<float>
#define FLOAT float
#define INT std::int64_t
#define BOOL bool
#define DEBUG_CONTEXT(Name) _impl->getDebugContext(Name)
#define CHAR char
#define STRING const char *
#define STRING_VEC std::vector<const char *>
// No extra argument.
#define NONE
// Each *ARG macro expands to ", <type> <name>" so it can be appended after
// the `inputs` parameter of the generated method.
#define ARG(Type, Name) , Type Name
#define POPART_CONST_ARG(Name) , const PopartConstant &Name
#define HOST_SIDE_CONST_ARG(Name) , const HostSideConstant &Name
#define POPART_ATTRIB_VEC_ARG(Name)                                            \
  , std::shared_ptr<std::vector<PopartAttribute>> Name
// Expands to ", convertType(<name>)" so the argument is filtered through
// convertType() before being passed to the implementation.
#define BODY_ARG(Name) , convertType(Name)

// Create a function decl with the given call and arguments.
// The generated method translates the PopTorch input ids to PopART tensor
// ids, calls onnxImpl and passes its output to the matching HandleOutput
// specialisation. The operation is flagged as a loss if IsLoss() recognises
// "ns::funcName".
#define OP_DECL(ns, funcName, function, onnxImpl, Args, BodyArgs)              \
  TensorId Compiler::function(const std::vector<TensorId> &inputs Args) {      \
    auto AiOnnxOpset11 = _impl->active_builder->aiOnnxOpset11();               \
    auto AiGraphcoreOpset1 = _impl->active_builder->aiGraphcoreOpset1();       \
    const bool isLoss = IsLoss(#ns "::" #funcName);                            \
    std::vector<popart::TensorId> ins;                                         \
    std::transform(inputs.begin(), inputs.end(), std::back_inserter(ins),      \
                   [&](TensorId index) { return _impl->ids[index]; });         \
    auto output = onnxImpl(ins BodyArgs);                                      \
    return HandleOutput<decltype(output)>{}(output, isLoss, _impl.get());      \
  }

// Create a function decl with the given call and arguments.
// Same as OP_DECL but for operations without any output: nothing is
// registered and nothing is returned.
#define OP_DECL_NO_RETURN(ns, funcName, function, onnxImpl, Args, BodyArgs)    \
  void Compiler::function(const std::vector<TensorId> &inputs Args) {          \
    auto AiOnnxOpset11 = _impl->active_builder->aiOnnxOpset11();               \
    auto AiGraphcoreOpset1 = _impl->active_builder->aiGraphcoreOpset1();       \
    std::vector<popart::TensorId> ins;                                         \
    std::transform(inputs.begin(), inputs.end(), std::back_inserter(ins),      \
                   [&](TensorId index) { return _impl->ids[index]; });         \
    onnxImpl(ins BodyArgs);                                                    \
  }

// Generate the definitions of the methods.
#include "popart_compiler/SupportedOperations.inc.hpp"

// Don't leak the helper macros into the rest of the file.
#undef OP_DECL
#undef OP_DECL_NO_RETURN
#undef BODY_ARG
#undef POPART_ATTRIB_VEC_ARG
#undef POPART_CONST_ARG
#undef HOST_SIDE_CONST_ARG
#undef ARG
#undef NONE
#undef STRING_VEC
#undef CHAR
#undef STRING
#undef BOOL
#undef INT
#undef FLOAT
#undef FLOAT_VEC
#undef INT_VEC
#undef DEBUG_CONTEXT

// Add an input tensor called `name` initialised with `data` and register it
// as a parameter of the model. Returns the PopTorch id of the new tensor.
TensorId
Compiler::addInitializedInputTensor(const char *name, const char *type,
                                    const std::vector<std::int64_t> &dims,
                                    void *data) {
  // Type and shape of the new tensor.
  const popart::TensorInfo tensor_info{type, dims};

  // Initial value of the variable.
  const popart::ConstVoidData initial_value{data, tensor_info};

  const auto new_tensor =
      _impl->active_builder->addInitializedInputTensor(initial_value, name);
  _impl->ids.push_back(new_tensor);

  // Register the copy stored in the id table as a parameter.
  const popart::TensorId &registered = _impl->ids.back();
  _impl->weights.registerParameter(registered, tensor_info);

  return _impl->ids.size() - 1;
}

// Add an input tensor called `name` initialised with `data` and register it
// as a parameter of the model. This overload also attaches variable settings
// built from a communication group (type and size) and a variable retrieval
// mode, both given as the integer values of the matching PopART enums.
// Returns the PopTorch id of the new tensor.
TensorId Compiler::addInitializedInputTensor(
    const char *name, const char *type, const std::vector<std::int64_t> &dims,
    void *data, int comm_group_type, int shards, int variable_retrieval_mode) {
  // Type and shape of the new tensor.
  const popart::TensorInfo tensor_info{type, dims};

  // Initial value of the variable.
  const popart::ConstVoidData initial_value{data, tensor_info};

  const popart::VariableSettings variable_settings(
      popart::CommGroup(popart::CommGroupType(comm_group_type), shards),
      popart::VariableRetrievalMode(variable_retrieval_mode));

  const auto new_tensor = _impl->active_builder->addInitializedInputTensor(
      initial_value, variable_settings, name);
  _impl->ids.push_back(new_tensor);

  // Register the copy stored in the id table as a parameter.
  const popart::TensorId &registered = _impl->ids.back();
  _impl->weights.registerParameter(registered, tensor_info);

  return _impl->ids.size() - 1;
}

// Mark the tensor `output` as an output of the model and, unless it is a
// host side constant, create the anchor used to return it to the host.
//
// output_mode:          PopartOutputMode::N means "use the mode from the
//                       options" (and their return period for EveryN).
// output_return_period: only used when the mode is EveryN.
// overlap:              overlap mode (see exchangeStrToPopartEnum).
void Compiler::addOutputTensor(TensorId output, PopartOutputMode output_mode,
                               size_t output_return_period,
                               const char *overlap) {
  const popart::TensorId popart_id = _impl->ids[output];
  _impl->outputs.push_back(popart_id);

  // Host side constants don't need any anchor.
  if (isHostSideConstant(output)) {
    return;
  }

  // Fall back to the mode from the options if none was specified.
  if (output_mode == PopartOutputMode::N) {
    output_mode = _impl->options.output_mode;
    if (output_mode == PopartOutputMode::EveryN) {
      output_return_period = _impl->options.output_return_period;
    }
  }

  const auto exchange = exchangeStrToPopartEnum(overlap);
  const popart::TileSet tile_set = exchange.first;
  const popart::ExchangeStrategy strategy = exchange.second;
  if (strategy != popart::ExchangeStrategy::JustInTime) {
    _impl->using_overlapped_io = true;
  }

  // Check for any use of overlapped io
  // NB this relies on the fact that manual anchors never overlap and other
  // outputs all have the same output_mode. If these assumptions change,
  // the logic will have to make sure _impl->using_overlapped_io is correct
  // before any call to this function rather than changed to true on the first
  // instance.
  if (_impl->using_overlapped_io) {
    verifySettingsForOverlappedIO(output_mode);
  }

  const char *mode_name = outputModeToString(output_mode);

  // The return period only needs to be passed in when returning EveryN.
  const bool every_n = output_mode == PopartOutputMode::EveryN;
  const popart::AnchorReturnType return_type =
      every_n ? popart::AnchorReturnType(mode_name, output_return_period,
                                         tile_set, strategy)
              : popart::AnchorReturnType(mode_name, tile_set, strategy);
  _impl->anchors.insert({popart_id, return_type});
}

// Wrap the host buffer `ptr` (of shape `dims`) in a PopART array and register
// it as the incoming data for the input tensor `id`. The wrapper is kept
// alive by impl->memory_manager.
template <typename T>
static void setUpInputImpl(TensorId id, T *ptr,
                           const std::vector<std::int64_t> &dims,
                           detail::CompilerImpl *impl) {
  // Popart wrapper around the tensor pointer.
  auto wrapper = std::make_unique<popart::NDArrayWrapper<T>>(ptr, dims);
  impl->memory_manager.push_back(std::move(wrapper));

  // Map the PopART tensor to the wrapper now owned by the memory manager.
  impl->popart_incoming.insert(
      {impl->ids[id], *impl->memory_manager.back()});
}

// Typed overloads of setUpInputOp: each one asserts that the tensor `id` has
// the PopART type matching the type of the host pointer, then registers the
// buffer `ptr` (of shape `dims`) as the incoming data for that input.

// FLOAT input.
void Compiler::setUpInputOp(TensorId id, float *ptr,
                            const std::vector<std::int64_t> &dims) {
  assertTensorIs(PopartType::FLOAT, id);
  setUpInputImpl(id, ptr, dims, _impl.get());
}

// INT32 input.
void Compiler::setUpInputOp(TensorId id, std::int32_t *ptr,
                            const std::vector<std::int64_t> &dims) {
  assertTensorIs(PopartType::INT32, id);
  setUpInputImpl(id, ptr, dims, _impl.get());
}

// BOOL input.
void Compiler::setUpInputOp(TensorId id, bool *ptr,
                            const std::vector<std::int64_t> &dims) {
  assertTensorIs(PopartType::BOOL, id);
  setUpInputImpl(id, ptr, dims, _impl.get());
}

// INT8 input.
void Compiler::setUpInputOp(TensorId id, std::int8_t *ptr,
                            const std::vector<std::int64_t> &dims) {
  assertTensorIs(PopartType::INT8, id);
  setUpInputImpl(id, ptr, dims, _impl.get());
}

// UINT8 input.
void Compiler::setUpInputOp(TensorId id, std::uint8_t *ptr,
                            const std::vector<std::int64_t> &dims) {
  assertTensorIs(PopartType::UINT8, id);
  setUpInputImpl(id, ptr, dims, _impl.get());
}

// 16-bit input: the same host pointer type is used for both INT16 and FLOAT16
// data, `float16` tells which one the buffer actually contains.
void Compiler::setUpInputOp(TensorId id, std::int16_t *ptr,
                            const std::vector<std::int64_t> &dims,
                            bool float16) {
  // The tensor must have been declared with the matching type.
  assertTensorIs(float16 ? PopartType::FLOAT16 : PopartType::INT16, id);

  // The element type cannot be deduced from the pointer type: describe the
  // buffer explicitly.
  const popart::DataType data_type =
      float16 ? popart::DataType::FLOAT16 : popart::DataType::INT16;

  // Popart wrapper around the tensor pointer.
  auto wrapper = std::make_unique<popart::NDArrayWrapper<std::int16_t>>(
      ptr, popart::TensorInfo(data_type, dims));
  _impl->memory_manager.push_back(std::move(wrapper));

  // Map the PopART tensor to the wrapper now owned by the memory manager.
  _impl->popart_incoming.insert(
      {_impl->ids[id], *_impl->memory_manager.back()});
}

// Wrap the host buffer `ptr` (of shape `dims`) in a PopART array and hand it
// over to the compiler implementation as the memory for the output `id`.
template <typename T>
static void addOutput(TensorId id, T *ptr,
                      const std::vector<std::int64_t> &dims,
                      detail::CompilerImpl *impl) {
  using Wrapper = popart::NDArrayWrapper<T>;

  // Popart wrapper around the tensor pointer.
  std::unique_ptr<Wrapper> wrapper = std::make_unique<Wrapper>(ptr, dims);

  impl->addMemoryToOutput(id, ptr, std::move(wrapper));
}

// Typed overloads of setUpOutputOp: each one registers the host buffer `ptr`
// (of shape `dims`) as the memory for the output `id`. Unlike the input
// overloads, the type of the tensor is not checked here.

// 8-bit unsigned integer buffer.
void Compiler::setUpOutputOp(TensorId id, std::uint8_t *ptr,
                             const std::vector<std::int64_t> &dims) {
  addOutput(id, ptr, dims, _impl.get());
}

// 8-bit signed integer buffer.
void Compiler::setUpOutputOp(TensorId id, std::int8_t *ptr,
                             const std::vector<std::int64_t> &dims) {
  addOutput(id, ptr, dims, _impl.get());
}

// Float buffer.
void Compiler::setUpOutputOp(TensorId id, float *ptr,
                             const std::vector<std::int64_t> &dims) {
  addOutput(id, ptr, dims, _impl.get());
}

// 32-bit signed integer buffer.
void Compiler::setUpOutputOp(TensorId id, std::int32_t *ptr,
                             const std::vector<std::int64_t> &dims) {
  addOutput(id, ptr, dims, _impl.get());
}

// Boolean buffer.
void Compiler::setUpOutputOp(TensorId id, bool *ptr,
                             const std::vector<std::int64_t> &dims) {
  addOutput(id, ptr, dims, _impl.get());
}

// 16-bit buffer.
void Compiler::setUpOutputOp(TensorId id, std::int16_t *ptr,
                             const std::vector<std::int64_t> &dims) {
  addOutput(id, ptr, dims, _impl.get());
}

// Finalise the PopART session options and create the PopART session
// (inference or training, depending on _impl->is_training) for the graph
// built so far.
//
// optimizers:            optimizers used to detect gradient clipping when
//                        pipelining and to create the PopART optimizer when
//                        training.
// export_proto_filename: if not empty, file the model proto is saved to
//                        before the session is created.
//
// No session is created if the model doesn't use any IPU. Most options are
// set through setOptionIfNotSet(), i.e. only if they have not been set
// already. Raises an error if the execution mode is not supported or if a
// pipelined model doesn't have enough gradient accumulation (training) or
// device iterations (inference) for its number of pipeline stages.
void Compiler::initSession(const std::vector<Optimizer> &optimizers,
                           const char *export_proto_filename) {
  const logging::LogContext ctx_init_session{"Compiler::initSession"};

  logging::trace("Initializing session");

  // Some simple PyTorch models will not need an IPU at all. However, we do not
  // want users to experience error messages as these may be trivial models
  // which users try in their first use of PopTorch.
  if (_impl->used_ipus.empty()) {
    logging::info("No IPUs are used by this model. This may happen if the "
                  "model is trivial");
    return;
  }

  const auto device = _impl->createDevice();
  popart::SessionOptions &options = _impl->popart_options;

  // Default debug.retainDebugInformation to "false" unless the engine option
  // is already present.
  if (options.engineOptions.count("debug.retainDebugInformation") == 0) {
    options.engineOptions.emplace("debug.retainDebugInformation", "false");
    // Message has to be consistent with format used by setOptionIfNotSet()
    logging::debug(
        "engineOptions[debug.retainDebugInformation] set to value false");
  }
  // 'Auto' mode works if only one IPU is used per replica, and allows
  // overlapped IO to work. Excerpt from D51863 in PopART:
  // IO tiles can only be used when virtual graphs are enabled. Virtual graph
  // modes enable to assign tensors and operations to a subset of IPUs, and
  // within each IPU, to a subset of tiles (such as compute and IO tiles). The
  // supported modes are one of: {Manual, Auto, ExecutionPhases}.
  popart::VirtualGraphMode graph_mode = popart::VirtualGraphMode::Auto;
  // If Pipelining wasn't set: enable it if more than 1 IPU is used.
  switch (_impl->options.execution_mode) {
  case detail::ExecutionMode::Pipelined: {
    _impl->setOptionIfNotSet(options.enablePipelining,
                             _impl->used_ipus.size() > 1, "enablePipelining");
    // If we are pipelining we want to turn on recompute by default.
    if (_impl->used_ipus.size() > 1) {
      graph_mode = popart::VirtualGraphMode::Manual;
      _impl->setOptionIfNotSet(
          options.autoRecomputation, popart::RecomputationType::Pipeline,
          "autoRecomputation",
          popart::toString(popart::RecomputationType::Pipeline));
    }

    // TODO(T53152): AccumulateOuterFragmentSchedule::Serial is currently
    // incompatible with gradient clipping and pipelining.
    // A finite max_grad_norm on any optimizer is treated as gradient clipping
    // being in use.
    for (const auto &optimizer : optimizers) {
      if (optimizer.max_grad_norm != std::numeric_limits<float>::infinity()) {
        _impl->setOptionIfNotSet(
            options.accumulateOuterFragmentSettings.schedule,
            popart::AccumulateOuterFragmentSchedule::Scheduler,
            "accumulateOuterFragmentSettings.schedule",
            "AccumulateOuterFragmentSchedule::Scheduler");
        break;
      }
    }

    break;
  }
  case detail::ExecutionMode::Sharded: {
    _impl->setOptionIfNotSet(options.enablePipelining, false,
                             "enablePipelining");
    if (_impl->used_ipus.size() > 1 || _impl->using_overlapped_io) {
      graph_mode = popart::VirtualGraphMode::Manual;
    }
    break;
  }
  case detail::ExecutionMode::Phased: {
    _impl->setOptionIfNotSet(options.enablePipelining, false,
                             "enablePipelining");
    graph_mode = popart::VirtualGraphMode::ExecutionPhases;
    std::uint64_t num_phases = _impl->max_phase + 1;
    const std::uint64_t num_stages =
        _impl->options.serial_phases_execution ? 1 : 2;
    if (_impl->options.tensors_liveness != detail::Liveness::AlwaysLive) {
      // We want to send the tensors off chip: Tensors stay live through
      // phases N, N+1, N+2 so we need to have a gap of 3 before the bwd
      // pass, otherwise the bwd pass will start in the same phase as the
      // end of the fwd pass.
      num_phases += 3;
    } else if (_impl->options.separate_backward_phase) {
      // Make sure the backward pass will start with a new phase.
      num_phases += 1;
    }

    _impl->setOptionIfNotSet(options.executionPhaseSettings.phases, num_phases,
                             "executionPhaseSettings.phases");
    _impl->setOptionIfNotSet(options.executionPhaseSettings.stages, num_stages,
                             "executionPhaseSettings.stages");
    // Default the storage of the activations, weights, optimizer state and
    // accumulators to off chip.
    _impl->setOptionIfNotSet(
        options.activationTensorLocationSettings.location.storage,
        popart::TensorStorage::OffChip, "location_activation",
        "useOnChipStorage(False)");
    _impl->setOptionIfNotSet(
        options.weightTensorLocationSettings.location.storage,
        popart::TensorStorage::OffChip, "location_weight",
        "useOnChipStorage(False)");
    _impl->setOptionIfNotSet(
        options.optimizerStateTensorLocationSettings.location.storage,
        popart::TensorStorage::OffChip, "location_optimizer",
        "useOnChipStorage(False)");
    _impl->setOptionIfNotSet(
        options.accumulatorTensorLocationSettings.location.storage,
        popart::TensorStorage::OffChip, "location_accumulator",
        "useOnChipStorage(False)");
    break;
  }
  default:
    ERROR("ExecutionMode not supported");
  }
  // By default allow the user to save / restore the RNG state (It uses slightly
  // more memory).
  _impl->setOptionIfNotSet(options.enableLoadAndOffloadRNGState, true,
                           "enableLoadAndOffloadRNGState");

  _impl->setOptionIfNotSet(options.virtualGraphMode, graph_mode,
                           "virtualGraphMode", popart::toString(graph_mode));

  // Distributed execution: the global replication factor and this process's
  // replica offset are derived from the local replication factor.
  _impl->setOptionIfNotSet(options.enableDistributedReplicatedGraphs,
                           _impl->options.num_distributed_processes > 1,
                           "enableDistributedReplicatedGraphs");

  _impl->setOptionIfNotSet(options.globalReplicationFactor,
                           _impl->options.num_distributed_processes *
                               options.replicatedGraphCount,
                           "globalReplicationFactor");
  _impl->setOptionIfNotSet(options.globalReplicaOffset,
                           _impl->options.distributed_process_id *
                               options.replicatedGraphCount,
                           "globalReplicaOffset");

  _impl->setOptionIfNotSet(options.enableReplicatedGraphs,
                           options.replicatedGraphCount > 1,
                           "enableReplicatedGraphs");

  // Disable constant_weights by default: causes problems with Popart
  _impl->setOptionIfNotSet(options.constantWeights, false, "constantWeights");

  // When pipelining, check there is enough work to go through every stage of
  // the pipeline.
  if (_impl->options.execution_mode == detail::ExecutionMode::Pipelined) {
    const auto num_pipeline_stages = _impl->numPipelineStages();

    if (_impl->is_training) {
      // Only used to build the error message below.
      const auto num_forward_stages = (num_pipeline_stages + 1) / 2;
      const auto num_backward_stages = (num_pipeline_stages - 1) / 2;

      const std::string err_msg = fmt::format(
          "poptorch.Options().Training.gradientAccumulation must be greater "
          "than or equal to the number of pipeline stages ({}) when using "
          "poptorch.PipelinedExecution. Please note that a model with {} "
          "pipeline stages in PopTorch will have an additional {} stages when "
          "training.",
          num_pipeline_stages, num_forward_stages, num_backward_stages);

      ERROR_ON_MSG(_impl->popart_options.accumulationFactor <
                       static_cast<int64_t>(num_pipeline_stages),
                   err_msg);
    } else {
      const std::string err_msg =
          fmt::format("poptorch.Options().deviceIterations must be greater "
                      "than or equal to the number of pipeline stages ({}) "
                      "when using PopTorch.PipelinedExecution.",
                      num_pipeline_stages);

      ERROR_ON_MSG(_impl->options.steps < num_pipeline_stages, err_msg);
    }
  }

  _impl->setOptionIfNotSet(options.enableGradientAccumulation,
                           options.accumulationFactor > 1,
                           "enableGradientAccumulation");

  // Only explicitly set these options if overlapped I/O are used
  // otherwise we might be overwriting the values set implicitly
  // by some other PopART options (like for example enableExplicitIR()).
  if (_impl->using_overlapped_io) {
    // This is needed for both overlapped IO and explicit pipelining (not yet
    // supported).
    _impl->setOptionIfNotSet(options.useHostCopyOps, _impl->using_overlapped_io,
                             "useHostCopyOps");

    // This is needed but may cause regressions for existing models. When it is
    // more developed, this will become the default.
    _impl->setOptionIfNotSet(options.enableExplicitMainLoops,
                             _impl->using_overlapped_io,
                             "enableExplicitMainLoops");
  }

  // Create the anchors, these are used to copy to the host.
  const auto data_flow = popart::DataFlow(_impl->options.steps, _impl->anchors);

  // Save the initializers to an external file if requested.
  if (!_impl->options.external_initializers_file.empty()) {
    const logging::LogContext ctx{
        "popart::Builder::saveInitializersExternally"};
    logging::trace("Saving initializers to external file {}",
                   _impl->options.external_initializers_file);
    _impl->active_builder->saveInitializersExternally(
        _impl->weights.parameterIds(),
        _impl->options.external_initializers_file);
  }

  // Whether "model_name" was explicitly set: if not, the session is named
  // after its kind ("inference" or "training") below.
  const auto model_name_set = _impl->options_set.count("model_name") > 0;

  // Tensor location in PopART includes a shardingDomain option which sets
  // which replicas to shard tensors across when using replicated tensor
  // sharding. For now, only one option works for multiple processes, which is
  // to set the type to consecutive across the number of local replica (which
  // is equal to options.replicatedGraphCount on each process).
  //
  // The setting for a single process remains the default (All) which shards
  // tensors across all replica.
  //
  // In future, GCL and PopART will support additional options, which can be
  // exposed to the user.
  if (_impl->options.num_distributed_processes > 1) {
    const popart::CommGroup sharding_domain(popart::CommGroupType::Consecutive,
                                            options.replicatedGraphCount);
    options.activationTensorLocationSettings.location.shardingDomain =
        sharding_domain;
    options.weightTensorLocationSettings.location.shardingDomain =
        sharding_domain;
    options.optimizerStateTensorLocationSettings.location.shardingDomain =
        sharding_domain;
    options.accumulatorTensorLocationSettings.location.shardingDomain =
        sharding_domain;
  }

  saveModelProtoIfNeeded(_impl->active_builder, export_proto_filename);

  // Create the popart session object to actually run the graph.
  if (!_impl->is_training) {
    // Create an inference session.
    const logging::LogContext ctx{
        "popart::InferenceSession::createFromOnnxModel"};
    _impl->session = popart::InferenceSession::createFromOnnxModel(
        _impl->active_builder->getModelProto(), data_flow, device, {}, options,
        popart::PatternsLevel::Default,
        model_name_set ? _impl->options.model_name : "inference");
  } else {
    // Create the optimizer from user provided parameters.
    const std::unique_ptr<popart::Optimizer> optimizer =
        _impl->getPopartOptimizer(optimizers);

    // Create the training session.
    const logging::LogContext ctx{
        "popart::TrainingSession::createFromOnnxModel"};
    _impl->session = popart::TrainingSession::createFromOnnxModel(
        _impl->active_builder->getModelProto(), data_flow, _impl->loss,
        *optimizer, device, {}, options, _impl->options.patterns,
        model_name_set ? _impl->options.model_name : "training");
  }
}

// Save the executable of the session to the file `export_filename`.
//
// Raises an error if there is no session (for example because the model
// doesn't run anything on the IPU).
void Compiler::saveExecutableToFile(const char *export_filename) const {
  ERROR_ON_MSG(!_impl->session,
               "Nothing to export. This may be because the model does not run "
               "any op on the IPU.");

  // Two nested log contexts: one for this function and one for the PopART
  // call it wraps.
  const logging::LogContext ctx_function{"Compiler::saveExecutableToFile"};
  const logging::LogContext ctx{"popart::Session::saveExecutable"};
  _impl->session->saveExecutable(export_filename);
}

// Set the random seed and, if `rng_state` is not empty, the RNG state of the
// session.
//
// The seed is only forwarded to the session if its IR requires a random
// seed. Raises an error if the session has not been initialised.
void Compiler::setRngState(std::uint64_t seed,
                           const std::vector<std::uint32_t> &rng_state) {
  ERROR_ON_MSG(!_impl->session, "Session should be initialised first");
  logging::debug("Setting random seed to: {}", seed);

  const bool seed_needed = _impl->session->getIr().getRequiresRandomSeed();
  if (!seed_needed) {
    logging::debug("Session has no random behaviour: nothing to do.");
  } else {
    _impl->session->setRandomSeed(seed);
  }

  // An empty state means there is nothing to restore.
  if (rng_state.empty()) {
    return;
  }
  logging::debug("Setting RNG state");
  _impl->session->setRNGState(rng_state);
}

// Returns the RNG state read from the session.
//
// Raises an error if the session has not been initialised.
std::vector<std::uint32_t> Compiler::getRngState() const {
  ERROR_ON_MSG(!_impl->session, "Session should be initialised first");
  logging::debug("Reading RNG state");
  return _impl->session->getRNGState();
}

// Returns the random seed of the session, or 0 if the IR of the session
// doesn't require a random seed.
//
// Raises an error if the session has not been initialised.
std::uint64_t Compiler::getRandomSeed() const {
  ERROR_ON_MSG(!_impl->session, "Session should be initialised first");
  logging::debug("Reading random seed");

  const bool seed_needed = _impl->session->getIr().getRequiresRandomSeed();
  if (!seed_needed) {
    logging::debug("Session has no random behaviour: using 0 as seed.");
    return 0;
  }
  return _impl->session->getRandomSeed();
}

// Load a previously saved executable from the file `import_filename` into
// the session and prepare the device, without loading the engine.
//
// Raises an error if there is no session or if the file cannot be opened.
void Compiler::loadExecutableAndPrepareDevice(const char *import_filename) {
  ERROR_ON_MSG(!_impl->session, "Nothing to import. This may be because the "
                                "model does not run any op on an IPU.");

  const logging::LogContext ctx{"Compiler::loadExecutableAndPrepareDevice"};

  // The executable is read in binary mode through a shared stream handed
  // over to the session.
  const std::string path = import_filename;
  const auto stream =
      std::make_shared<std::ifstream>(path, std::ifstream::binary);
  ERROR_ON_MSG(!stream->is_open(), "Failed to open " << path << " for reading");
  _impl->session->loadExecutableFromStream(stream);

  // Don't automatically load the engine: we want to control when this happens
  // to make sure it happens at the same time in distributed environments.
  _impl->session->prepareDevice(/*load_engine=*/false);
  _impl->cachePopartTypes();
}

// Load the engine, connect the session's streams and register one host
// function per CPU callback stored in _impl->callbacks.
//
// Does nothing (apart from logging) when there is no session.
void Compiler::loadEngineAndConnectStreams() {
  if (!_impl->session) {
    logging::trace("Skipping loading engine");
    return;
  }

  logging::trace("Loading engine");
  _impl->session->loadEngineAndConnectStreams();

  // Size in bytes of a single element for each Poplar type supported by host
  // functions.
  static const std::map<std::reference_wrapper<const poplar::Type>,
                        std::uint8_t, std::less<poplar::Type>>
      host_sizes{// word types
                 {poplar::UNSIGNED_INT, 4},
                 {poplar::INT, 4},
                 {poplar::FLOAT, 4},
                 // half types
                 {poplar::UNSIGNED_SHORT, 2},
                 {poplar::SHORT, 2},
                 {poplar::HALF, 2},
                 // byte types
                 {poplar::BOOL, 1},
                 {poplar::CHAR, 1},
                 {poplar::SIGNED_CHAR, 1},
                 {poplar::UNSIGNED_CHAR, 1}};

  // For each individual CPU operation (multiple calls to one op = still one op)
  for (detail::CallbackInternalMetadata &cb_data : _impl->callbacks) {
    // Compute the number of bytes occupied by a tensor of the given shape and
    // PopTorch type. Raises an error for types missing from host_sizes.
    const auto to_size_bytes = [&](const auto &shape, const auto &type) {
      const poplar::Type ptype =
          poptorch::popart_compiler::poplarTypeFromPoptorch(type);

      const auto it = host_sizes.find(ptype);
      ERROR_ON_MSG(it == host_sizes.cend(), "Unsupported host op type");

      // std::accumulate uses the type of the initial value as its accumulator
      // type: the seed must be a std::size_t, otherwise the running product
      // is narrowed to int after every multiplication.
      const std::size_t number_of_elems =
          std::accumulate(shape.cbegin(), shape.cend(), std::size_t{1},
                          std::multiplies<std::size_t>());

      return number_of_elems * it->second;
    };

    // Store the amount of data to be transferred for each of the function's
    // input and output arguments.
    std::vector<std::size_t> input_sizes(cb_data.input_shapes.size());
    std::transform(cb_data.input_shapes.cbegin(), cb_data.input_shapes.cend(),
                   cb_data.input_types.begin(), input_sizes.begin(),
                   to_size_bytes);

    std::vector<std::size_t> output_sizes(cb_data.output_shapes.size());
    std::transform(cb_data.output_shapes.cbegin(), cb_data.output_shapes.cend(),
                   cb_data.output_types.begin(), output_sizes.begin(),
                   to_size_bytes);

    // The callback owns the size tables but only references cb_data: the
    // metadata entry must therefore stay alive for as long as the callback is
    // connected.
    // Not declared const so that the std::move below actually moves the
    // captured vectors instead of copying them.
    auto poplar_callback =
        [input_sizes = std::move(input_sizes),
         output_sizes = std::move(output_sizes),
         &cb_data](const void *const *inputs, size_t number_of_inputs,
                   void *const *outputs, size_t number_of_outputs) {
          ERROR_ON_MSG(number_of_inputs != input_sizes.size(),
                       "Number of inputs does not match");
          ERROR_ON_MSG(number_of_outputs != output_sizes.size(),
                       "Number of outputs does not match");
          ERROR_ON_MSG(inputs == nullptr,
                       "CPU function callback given null inputs");
          ERROR_ON_MSG(outputs == nullptr,
                       "CPU function callback given null outputs");
          ERROR_ON_MSG(number_of_inputs != cb_data.input_pointers.size(),
                       "Number of inputs does not match cb data (got "
                           << cb_data.input_pointers.size() << ")");
          ERROR_ON_MSG(number_of_outputs != cb_data.output_pointers.size(),
                       "Number of outputs does not match cb data (got "
                           << cb_data.output_pointers.size() << ")");
          for (std::size_t input = 0; input < number_of_inputs; ++input) {
            // Copy from IPU into the waiting pytorch tensor on host.
            std::memcpy(reinterpret_cast<char *>(cb_data.input_pointers[input]),
                        reinterpret_cast<const char *>(inputs[input]),
                        input_sizes[input]);
          }
          // Call the pytorch function on CPU.
          cb_data.the_callback();

          // Copy the results written by the callback into the buffers
          // provided for the outputs.
          for (std::size_t output = 0; output < number_of_outputs; ++output) {
            std::memcpy(
                reinterpret_cast<char *>(outputs[output]),
                reinterpret_cast<const char *>(cb_data.output_pointers[output]),
                output_sizes[output]);
          }
        };

    // Tell poplar about the callback.
    _impl->session->connectHostFunction(cb_data.handle,
                                        std::move(poplar_callback));
  }
}

void Compiler::appendPoptorchMetadataToFile(
    const char *serialized_poptorch_metadata, const size_t metadata_length,
    const char *export_filename) {
  popef::Reader reader;
  reader.parseFile(export_filename);
  ERROR_ON_MSG(reader.executables().size() != 1,
               "Popef file does not contain exactly one Executable blob.");
  const std::string &executable_name = reader.executables().at(0).name;

  popef::FileWriter writer(export_filename, popef::FileWriter::Mode::APPEND);
  auto poptorch_blob =
      writer.createOpaqueBlob(poptorch_opaque_name, executable_name);
  poptorch_blob->stream.write(serialized_poptorch_metadata, metadata_length);
  poptorch_blob->close();
  writer.close();
}

std::vector<char>
Compiler::importPoptorchMetadataFromFile(const char *import_filename) {
  popef::Reader reader;
  reader.parseFile(import_filename);

  std::vector<popef::OpaqueReader> opaques = reader.opaqueBlobs();
  auto poptorch_blob_it = std::find_if(
      opaques.begin(), opaques.end(), [](const popef::OpaqueReader &opaque) {
        return opaque.name == poptorch_opaque_name;
      });
  ERROR_ON_MSG(poptorch_blob_it == opaques.end(),
               "Popef file does not contain Poptorch metadata.");

  const size_t buffer_size = poptorch_blob_it->getAvailableReadSize();
  std::vector<char> metadata_buffer(buffer_size);
  poptorch_blob_it->data.read(metadata_buffer.data(), buffer_size);

  return metadata_buffer;
}

// Run the Poplar compilation by preparing the device (without loading the
// engine), then cache the PopART types.
//
// Without a session only the type caching is performed. If PopART reports a
// memory allocation error the profile path is logged and the exception is
// propagated to the caller.
void Compiler::compileAndPrepareDevice() {
  if (!_impl->session) {
    logging::trace("Skipping Poplar compilation");

    // Host side tensors are part of the cache, so it must be populated even
    // when there is no session.
    _impl->cachePopartTypes();

    return;
  }
  const logging::LogContext ctx_func{"Compiler::compileAndPrepareDevice"};

  // Poplar compilation.
  try {
    const logging::LogContext ctx{"popart::Session::prepareDevice: Poplar "
                                  "compilation"};
    // The engine is loaded separately so that loading can be synchronised in
    // distributed environments.
    constexpr bool load_engine = false;
    logging::trace("Begining Poplar compilation.");
    _impl->session->prepareDevice(load_engine);
    logging::trace("Finished Poplar compilation.");
  } catch (popart::memory_allocation_err &e) {
    logging::err("Out of memory, the graph profile is available here: {}",
                 e.getProfilePath());
    // Re-throw the exception currently being handled.
    throw;
  }

  _impl->cachePopartTypes();
}

// Build a short description of the active execution mode, IPU and
// stage/phase.
//
// Raises an error if the execution mode is none of Pipelined, Sharded or
// Phased.
std::unique_ptr<char[]> Compiler::getExecutionInfo() const {
  const auto mode = _impl->options.execution_mode;

  std::string info;
  if (mode == detail::ExecutionMode::Pipelined) {
    info = fmt::format(" mode(Pipelined), ipu({}), stage({})",
                       _impl->active_ipu, _impl->active_stage);
  } else if (mode == detail::ExecutionMode::Sharded) {
    info = fmt::format(" mode(Sharded), ipu({}), stage({})", _impl->active_ipu,
                       _impl->active_stage);
  } else if (mode == detail::ExecutionMode::Phased) {
    info = fmt::format(" mode(Phased), ipu({}), phase({})", _impl->active_ipu,
                       _impl->active_phase);
  } else {
    ERROR("Invalid ExecutionMode active");
  }

  // Hand the text over as a memory managed array to get around ABI.
  return stringToUniquePtr(info);
}

// Return the PopART IR, as produced by the implementation, in a memory
// managed character array.
std::unique_ptr<char[]> Compiler::getPopartIR() const {
  const std::string popart_ir = _impl->getPopartIR();

  // A raw character array is returned instead of a std::string to get around
  // ABI.
  std::unique_ptr<char[]> ir_copy = stringToUniquePtr(popart_ir);
  return ir_copy;
}

// Return a copy of every tensor name known to the implementation, each one
// stored in its own memory managed character array.
std::set<std::unique_ptr<char[]>> Compiler::getTensorNames() const {
  const auto popart_names = _impl->getTensorNames();

  std::set<std::unique_ptr<char[]>> names;
  for (const auto &popart_name : popart_names) {
    // Character arrays are used instead of std::string to get around ABI.
    names.insert(stringToUniquePtr(popart_name));
  }
  return names;
}

// Upload the weights from the host to IPU memory.
//
// When host_buffers is not empty the weight pointers are first refreshed from
// it and written to the session. Does nothing when there is no session.
void Compiler::copyWeightsToDevice(const std::vector<void *> &host_buffers) {
  if (!_impl->session) {
    logging::trace("Skipping writing weights from host to IPU memory.");
    return;
  }

  logging::info("Writing weights from host to IPU memory.");

  // An empty list means the host buffer pointers are already up to date.
  const bool refresh_host_pointers = !host_buffers.empty();
  if (refresh_host_pointers) {
    _impl->weights.updateData(host_buffers);
    _impl->session->writeWeights(_impl->weights);
  }

  _impl->session->weightsFromHost();
}

// Register the tensor as an updatable named buffer if it is listed in the
// PopART option updatableNamedBuffers.
//
// Raises an error if the tensor is not one of the known weights.
void Compiler::registerUpdatableNamedBuffer(const TensorId &id) {
  auto popart_id = _impl->ids.at(id);
  ERROR_ON_MSG(!_impl->weights.contains(popart_id),
               "Invalid updatable buffer " << popart_id);

  const auto &updatable = _impl->popart_options.updatableNamedBuffers;
  const bool is_updatable =
      std::find(updatable.begin(), updatable.end(), popart_id) !=
      updatable.end();
  if (!is_updatable) {
    return;
  }

  // Reuse the tensor info already recorded for the weight.
  const auto &weight = _impl->weights.weight(popart_id);
  _impl->updatable_named_buffers.registerParameter(popart_id, weight.info);
}

// Upload the updatable named buffers from the host to IPU memory.
//
// When host_buffers is not empty the buffer pointers are first refreshed from
// it and written to the session. Does nothing when there is no session.
void Compiler::copyNamedBuffersToDevice(
    const std::vector<void *> &host_buffers) {
  if (!_impl->session) {
    logging::trace("Skipping writing buffers from host to IPU memory.");
    return;
  }

  logging::info("Writing named buffers from host to IPU memory.");

  const bool refresh_host_pointers = !host_buffers.empty();
  if (refresh_host_pointers) {
    _impl->updatable_named_buffers.updateData(host_buffers);
    _impl->session->writeWeights(_impl->updatable_named_buffers);
  }

  _impl->session->buffersFromHost();
}

// Download the weights from IPU memory into the given host buffers.
// Does nothing when there is no session.
void Compiler::copyWeightsToHost(const std::vector<void *> &host_buffers) {
  if (!_impl->session) {
    logging::trace("Skipping writing weights from IPU to host.");
    return;
  }

  logging::info("Writing weights from IPU to host.");

  // PopTorch treats copyWeightsToHost / copyWeightsToDevice as
  // synchronisation routines between a single host buffer and a single device
  // buffer. PopART instead keeps separate "read source" and "write
  // destination" buffers, which have to be kept in sync manually with
  // writeWeights().
  auto &session = *_impl->session;
  auto &weights = _impl->weights;

  // 1. IPU -> PopART read source buffers.
  session.weightsToHost();
  // 2. Point the weights at the PopTorch destination buffers.
  weights.updateData(host_buffers);
  // 3. PopART read source buffers -> PopTorch buffers.
  session.readWeights(weights);
  // 4. Bring the PopART write destination buffers in line with the PopTorch
  //    buffers.
  session.writeWeights(weights);
}

void Compiler::updateOptimizers(const std::vector<Optimizer> &optimizers) {
  ERROR_ON(!_impl->session);
  ERROR_ON(optimizers.empty());
  ERROR_ON(!_impl->is_training);

  // Each of the groups of parameters are stored in a single PopART
  // optimizer that's why the vector of optimizers translates into
  // a single PopART optimizer.
  const std::unique_ptr<popart::Optimizer> optimizer =
      _impl->getPopartOptimizer(optimizers);

  // Update the popart graph/poplar executable with new optimizer.
  popart::TrainingSession &session =
      dynamic_cast<popart::TrainingSession &>(*_impl->session);
  session.updateOptimizerFromHost(optimizer.get());
}

void Compiler::run() {
  if (!_impl->session) {
    // Nothing to run on IPU
    ERROR_ON(!_impl->popart_incoming.empty());
    ERROR_ON(!_impl->popart_outgoing.empty());
    ERROR_ON(!_impl->outgoing_duplicates.empty());
    ERROR_ON(!_impl->memory_manager.empty());
    return;
  }

  if (!isAttachedToDevice()) {
    attachToDevice();
  }

  _impl->stepio.setInputGroupings(_impl->options.input_cgt,
                                  _impl->options.input_group_size,
                                  _impl->popart_options.replicatedGraphCount);

  // Execute the model on IPU.
  _impl->stepio.populate(_impl->popart_incoming, _impl->popart_outgoing);
  _impl->session->run(_impl->stepio);

  // In case several outputs point at the same tensor: duplicate the data
  for (const auto &out : _impl->outgoing_duplicates) {
    auto &src = _impl->popart_outgoing.at(out.first);
    for (auto *ptr : out.second) {
      std::memcpy(ptr, src.data(),
                  src.nelms() *
                      popart::getDataTypeInfoMap().at(src.dataType()).nbytes());
    }
  }
  // The buffers handle the communication between pytorch and popart, we set
  // them up each run.
  _impl->popart_incoming.clear();
  _impl->popart_outgoing.clear();
  _impl->outgoing_duplicates.clear();
  _impl->memory_manager.clear();

  // Log the number of cycles if instrumentation is enabled
  const popart::SessionOptions &options = _impl->popart_options;
  if (options.instrumentWithHardwareCycleCounter) {
    _cycle_count = _impl->session->getCycleCount();
    logging::debug("Total number of IPU cycles: {}", _cycle_count);
  }
}

// Return the PopART type recorded for the tensor `id`; the lookup is
// delegated to the implementation.
PopartType Compiler::getPopartType(TensorId id) const {
  return _impl->getPopartType(id);
}

// Return the PopART name of the tensor `id`.
//
// The pointer refers to the string stored in _impl->ids: no copy is made.
// The lookup is bounds checked.
const char *Compiler::tensorName(TensorId id) const {
  const auto &popart_name = _impl->ids.at(id);
  return popart_name.c_str();
}

// Return true if `id` is an index into the table of known tensors.
bool Compiler::tensorIdIsValid(TensorId id) const {
  const auto num_known_tensors = _impl->ids.size();
  return id < num_known_tensors;
}

// Shape returned by getSize() when the shape of a tensor is not known.
const std::vector<std::int64_t> Compiler::invalid_size{-1};

// Return the shape of the tensor `id`.
//
// Host side constants report their own shape. Otherwise the shape comes from
// the session when there is one, or from the active builder. invalid_size is
// returned when the builder has no value info for the tensor.
std::vector<std::int64_t> Compiler::getSize(TensorId id) const {
  if (isHostSideConstant(id)) {
    return _impl->getHostSideConstant(id).shape();
  }

  if (_impl->session) {
    return _impl->session->getInfo(_impl->ids[id]).shape();
  }

  // No session yet: ask the builder.
  const auto popart_id = _impl->ids.at(id);
  const bool shape_known = _impl->active_builder->hasValueInfo(popart_id);
  if (shape_known) {
    return _impl->active_builder->getTensorShape(popart_id);
  }
  return invalid_size;
}

// Return the data type of the tensor `id` as text.
//
// The type comes from the session when there is one, otherwise from the
// active builder. "unknown" is returned when the builder has no value info
// for the tensor.
std::unique_ptr<char[]> Compiler::getTensorDTypeString(TensorId id) const {
  std::string dtype = "unknown";

  if (_impl->session) {
    dtype = _impl->session->getInfo(_impl->ids[id]).data_type();
  } else {
    const auto popart_id = _impl->ids.at(id);
    const bool type_known = _impl->active_builder->hasValueInfo(popart_id);
    if (type_known) {
      dtype = _impl->active_builder->getTensorDtypeString(popart_id);
    }
  }

  return stringToUniquePtr(dtype);
}

// Record the torch node and source location (file and line) currently being
// processed. The column is accepted but not used.
void Compiler::setCurrentPythonCodeLocation(const char *torch_node,
                                            const char *filename,
                                            std::uint64_t line,
                                            std::uint64_t col) {
  UNUSED(col);

  // The first argument of the source location is left empty.
  _impl->code_location = popart::SourceLocation("", filename, line);
  _impl->torch_node = torch_node;
}

// Reset the active IPU to -1.
void Compiler::clearActiveIpu() { _impl->active_ipu = -1; }

// Select the IPU, and the stage or phase, used from now on.
//
// In Phased mode the active phase is derived from phase_id (scaled by 4 or 2
// for the two off-chip liveness modes, which both require serial phase
// execution). In Pipelined and Sharded modes the active stage is set to
// stage_id. The number of IPU switches is updated every time ipu_id differs
// from the last IPU used.
//
// Raises an error on an unsupported execution mode, a negative phase, an
// off-chip liveness mode without serial phases, or (for parallel phases) a
// phase / IPU parity mismatch.
void Compiler::setActiveIpu(std::uint64_t stage_id, std::int64_t phase_id,
                            std::int64_t ipu_id) {
  switch (_impl->options.execution_mode) {
  case detail::ExecutionMode::Pipelined:
  case detail::ExecutionMode::Sharded:
    _impl->active_stage = stage_id;
    break;
  case detail::ExecutionMode::Phased: {
    ERROR_ON_MSG(phase_id < 0, "Invalid phase for ExecutionMode::Phased");

    const bool serial_phases = _impl->options.serial_phases_execution;
    const auto liveness = _impl->options.tensors_liveness;
    const bool off_chip_each_phase =
        liveness == detail::Liveness::OffChipAfterEachPhase;
    const bool off_chip_after_fwd =
        liveness == detail::Liveness::OffChipAfterFwdNoOverlap;

    if (off_chip_each_phase || off_chip_after_fwd) {
      ERROR_ON_MSG(!serial_phases,
                   "This is only supported for serial phase execution");
      // Phase ids are spread out: by 4 when going off chip after each phase,
      // by 2 when going off chip after the forward pass.
      _impl->active_phase = phase_id * (off_chip_each_phase ? 4 : 2);
    } else {
      _impl->active_phase = phase_id;
    }

    _impl->max_phase = std::max(_impl->active_phase, _impl->max_phase);

    if (!serial_phases) {
      ERROR_ON_MSG(_impl->active_phase % 2 != ipu_id % 2,
                   "When phases are executed in parallel: even phases must run "
                   "on even IPUs and odd phases on odd IPUs");
    }
    break;
  }
  default:
    ERROR("Unsupported ExecutionMode");
  }

  // Count the IPU switches: this is needed to work out the number of pipeline
  // stages.
  const bool switched_ipu =
      static_cast<uint64_t>(ipu_id) != _impl->last_ipu_used;
  if (switched_ipu) {
    _impl->num_ipu_switches++;
  }

  _impl->active_ipu = ipu_id;

  // active_ipu is reset to -1 by clearActiveIpu() whereas last_ipu_used keeps
  // its value until another IPU is selected.
  _impl->last_ipu_used = ipu_id;
}

// Return whether the tensor `id` is a host side constant; the check is
// delegated to the implementation.
bool Compiler::isHostSideConstant(TensorId id) const {
  return _impl->isHostSideConstant(id);
}

// Return the `steps` value of the PopTorch options.
std::uint64_t Compiler::batchPerStep() const { return _impl->options.steps; }

// Return the product of the replication factor, the number of steps and the
// gradient accumulation factor.
std::uint64_t Compiler::popartBatchDim() const {
  const auto replicated_steps =
      _impl->popart_options.replicatedGraphCount * _impl->options.steps;
  return replicated_steps * _impl->popart_options.accumulationFactor;
}

// Return the batch dimension of the anchor for the tensor `id`, which depends
// on the anchor's return type:
//  - host side constant: 1,
//  - All: the full popartBatchDim(),
//  - EveryN: popartBatchDim() divided by the return period,
//  - anything else: one element per replica.
//
// Raises an error if the tensor has no anchor.
std::uint64_t Compiler::popartBatchDimForAnchor(TensorId id) const {
  if (isHostSideConstant(id)) {
    // A constant cannot be batched.
    return 1;
  }

  // Translate the PopTorch id into the PopART tensor id.
  const popart::TensorId &popart_id = _impl->ids[id];

  const auto anchor_it = _impl->anchors.find(popart_id);
  ERROR_ON_MSG(anchor_it == _impl->anchors.cend(),
               "Internal Error: Output op doesn't have an anchor.");
  const popart::AnchorReturnType &return_type = anchor_it->second;

  switch (return_type.id()) {
  case popart::AnchorReturnTypeId::All:
    // Everything is returned: this is a full batch.
    return popartBatchDim();
  case popart::AnchorReturnTypeId::EveryN:
    // Only one result every N is copied back.
    return popartBatchDim() / return_type.rp();
  default:
    // One element for each replica.
    return _impl->popart_options.replicatedGraphCount;
  }
}

void Compiler::setAvailableMemoryProportion(
    const std::vector<std::set<TensorId>> &inputs,
    float availableMemoryProportion) {
  for (const auto &ids : inputs) {
    std::set<popart::TensorId> popart_ids;
    std::transform(std::cbegin(ids), std::cend(ids),
                   std::inserter(popart_ids, std::begin(popart_ids)),
                   [this](const TensorId &id) { return _impl->ids[id]; });
    _impl->active_builder->setAvailableMemoryProportion(
        popart_ids, availableMemoryProportion);
  }
}

// Configure the serialization of the given matmul on the active builder.
// keep_precision is treated as a flag: any non-zero value enables it.
void Compiler::setMatMulSerialization(TensorId matmul, const char *mode,
                                      std::uint64_t factor,
                                      std::uint64_t keep_precision) {
  const bool keep_precision_flag = keep_precision != 0u;
  _impl->active_builder->setSerializeMatMul({_impl->ids[matmul]}, mode, factor,
                                            keep_precision_flag);
}

// Forward the tensors `inputs` and the optimizer group index `group` to the
// implementation.
void Compiler::optimizerGroup(const std::vector<TensorId> &inputs,
                              int64_t group) {
  _impl->optimizerGroup(inputs, group);
}

// Build the list of metadata (id, shape, dtype and, for non-state tensors,
// data pointer and size) for the optimizer state tensors followed by the
// optimizer tensors.
//
// As a side effect, optimizer state tensors which are not registered yet are
// added to _impl->optim_state_tensors.
//
// NOTE(review): _impl->session is dereferenced without a null check; callers
// presumably guarantee that the session exists - TODO confirm.
std::vector<TensorMetadata> Compiler::optimizerTensorMetadataList() const {
  std::vector<TensorMetadata> metadata_list;
  // Append the metadata of one tensor to metadata_list. state_tensor selects
  // between registration in optim_state_tensors (true) and direct access to
  // the tensor data (false).
  auto fn_add_tensor_data = [&](popart::Tensor *t, bool state_tensor) {
    TensorMetadata tm;
    tm.id = t->id.c_str();

    // Work on a copy of the tensor info: the shape is overwritten below.
    popart::TensorInfo ti(t->info);

    const auto global_replication_factor =
        _impl->session->getDevice().getGlobalReplicationFactor();
    // obtain real tensor shape that is taking into account replication and
    // replica grouping
    ti.set(ti.dataType(), t->getVariableSettings().shapeOnHost(
                              t->info.shape(), global_replication_factor));

    tm.shape = ti.shape();
    tm.dtype = ti.data_type().c_str();

    // Optimiser state tensors are variables in PopART, and must be read/written
    // via WeightsIO. Optimiser parameters such as learning rate and loss
    // scaling are either stream or constant tensors, and so can be read/written
    // directly via memcpy
    if (state_tensor) {
      if (!_impl->optim_state_tensors.contains(t->id)) {
        _impl->optim_state_tensors.registerParameter(t->id, ti);
      }
    } else {
      // Note: the size comes from the tensor's own info, not from the
      // host-shape adjusted copy `ti`.
      tm.data = t->tensorData()->data();
      tm.num_bytes = t->info.nbytes();
    }
    metadata_list.push_back(std::move(tm));
  };
  for (auto *t : _impl->session->getIr().optimizerStateTensors()) {
    fn_add_tensor_data(t, true);
  }
  // Note: session->getIr().optimizerTensors() is empty for cached executables,
  // so get the optimizer tensors from the executable instead.
  for (auto *t : _impl->session->getExecutable().getOptimizerTensors()) {
    fn_add_tensor_data(t, false);
  }
  return metadata_list;
}

void Compiler::fillHostOptimizerStateTensorData(
    const std::vector<void *> &host_buffers) {
  logging::info("Writing optimiser state tensors from IPU to host.");
  // In PopTorch we use copyWeightsToHost and copyWeightsToDevice as
  // synchronisation routines.
  // It means we expect to have one buffer on the host, one on the device and
  // to synchronise the two in one direction or the other.
  //
  // PopART works differently: it has one set of read source buffers and one
  // set of write destination buffers and we need to keep those in sync
  // manually by calling writeWeights()

  // Transfer from the IPU to PopART read source buffers.
  _impl->session->weightsToHost();
  // Update the Poptorch destination buffers
  _impl->optim_state_tensors.updateData(host_buffers);
  // Copy from the PopART read source buffers to the Poptorch buffers.
  _impl->session->readWeights(_impl->optim_state_tensors);
  // Keep the PopART write destination buffer in sync with the PopTorch buffer.
  _impl->session->writeWeights(_impl->optim_state_tensors);
}

// Upload the optimiser state tensors from the given host buffers to IPU
// memory.
//
// Raises an error if there is no session or if no device is attached.
void Compiler::writeDeviceOptimizerStateTensorData(
    const std::vector<void *> &host_buffers) {
  ERROR_ON_MSG(!_impl->session, "Session should be initialised first");
  ERROR_ON_MSG(!isAttachedToDevice(), "Must be attached to a device to "
                                      "write the optimizer state.");
  logging::info("Writing optimiser state tensors from host to IPU memory.");

  auto &state_tensors = _impl->optim_state_tensors;
  // Point the state tensors at the host buffers, hand them to PopART and
  // then push them to the device.
  state_tensors.updateData(host_buffers);
  _impl->session->writeWeights(state_tensors);
  _impl->session->weightsFromHost();
}

// Move constructor: copies the cycle count and takes over the implementation
// object of `compiler`.
Compiler::Compiler(Compiler &&compiler) : _cycle_count(compiler._cycle_count) {
  _impl = std::move(compiler._impl);
}

// Create a compiler for training or inference, taking a copy of the PopART
// options, the PopTorch options and the set of explicitly set options from
// `options`. The cycle count starts as "no cycles".
Compiler::Compiler(bool is_training, const SessionOptions &options)
    : _cycle_count(no_cycles) {
  _impl = std::make_unique<detail::CompilerImpl>();

  const auto &session_options = *options._impl;
  _impl->options_set = session_options.options_set;
  _impl->options = session_options.poptorch_options;
  _impl->popart_options = session_options.popart_options;
  _impl->is_training = is_training;
}

// The destructor is defaulted here, in the implementation file.
Compiler::~Compiler() = default;

// Append `type` to the list of output types.
void Compiler::addOutputType(OutputTypeShape type) {
  _impl->output_types.emplace_back(type);
}

// Return the output types added so far with addOutputType(). The reference
// points into the implementation object.
const std::vector<OutputTypeShape> &Compiler::outputTypes() const {
  return _impl->output_types;
}

void Compiler::startSubgraph() {
  popart::Builder *subgraph = &_impl->active_builder->createSubgraphBuilder();
  _impl->active_builder = subgraph;

  _impl->active_builder->addInputTensor(
      popart::TensorInfo{"INT64", popart::Shape{}});
  const popart::TensorId keep_going = _impl->active_builder->addInputTensor(
      popart::TensorInfo{"BOOL", popart::Shape{}});

  _impl->active_builder->addOutputTensor({keep_going});
}

// Forward an (attribute, key, value) triple to the implementation, converting
// the C strings to std::string first.
void Compiler::setAttribute(const char *attribute, const char *key,
                            const char *value) {
  const std::string attribute_name(attribute);
  const std::string attribute_key(key);
  const std::string attribute_value(value);
  _impl->setAttribute(attribute_name, attribute_key, attribute_value);
}

// Forward an (attribute, key) pair to the implementation's clearAttribute(),
// converting the C strings to std::string first.
void Compiler::clearAttribute(const char *attribute, const char *key) {
  const std::string attribute_name(attribute);
  const std::string attribute_key(key);
  _impl->clearAttribute(attribute_name, attribute_key);
}

// Close the loop body currently being built and add the corresponding loop op
// to the parent graph.
//
// The loop op receives a constant trip count, a constant `true` condition and
// then the given inputs. Raises an error when training.
TensorId Compiler::endForLoop(std::int32_t trip_count, std::int64_t num_outputs,
                              const std::vector<TensorId> &inputs) {
  ERROR_ON_MSG(_impl->is_training,
               "poptorch.for_loop() is only supported in inference.");

  // The active builder is the loop body: go back to its parent.
  popart::Builder *loop_body = _impl->active_builder;
  _impl->active_builder = loop_body->getParent();
  auto ai_onnx = _impl->active_builder->aiOnnxOpset11();

  // Scalar constant holding the number of iterations.
  const popart::ConstVoidData trip_count_data(&trip_count,
                                              {"INT32", popart::Shape{}});
  const popart::TensorId trip_count_tensor = ai_onnx.constant(trip_count_data);

  // Scalar constant holding the (always true) loop condition.
  const bool always_true = true;
  const popart::ConstVoidData condition_data(&always_true,
                                             {"BOOL", popart::Shape{}});
  const popart::TensorId condition_tensor = ai_onnx.constant(condition_data);

  std::vector<popart::TensorId> loop_inputs;
  loop_inputs.reserve(inputs.size() + 2);
  loop_inputs.push_back(trip_count_tensor);
  loop_inputs.push_back(condition_tensor);
  std::transform(inputs.cbegin(), inputs.cend(),
                 std::back_inserter(loop_inputs),
                 [this](TensorId id) { return _impl->ids[id]; });

  std::vector<popart::TensorId> loop_outputs =
      ai_onnx.loop(loop_inputs, num_outputs, *loop_body);

  return HandleOutput<std::vector<popart::TensorId>>{}(loop_outputs, false,
                                                       _impl.get());
}

void Compiler::startIfBlock() {
  popart::Builder *subgraph = &_impl->active_builder->createSubgraphBuilder();
  _impl->active_builder = subgraph;
  _impl->if_true_stack.push(_impl->active_builder);
}

void Compiler::startElseBlock() {
  // Else must by definition be added after an if block.
  _impl->active_builder = _impl->active_builder->getParent();

  popart::Builder *subgraph = &_impl->active_builder->createSubgraphBuilder();
  _impl->active_builder = subgraph;
  _impl->if_false_stack.push(_impl->active_builder);
}

// Close the current if / else construct and add the corresponding logical_if
// op to the parent graph.
//
// The most recent branches are taken from the true and false stacks. Raises
// an error when training.
TensorId Compiler::endIfBlock(const TensorId &condition,
                              std::size_t num_outputs) {
  ERROR_ON_MSG(_impl->is_training,
               "poptorch.cond() is only supported in inference.");

  // Leave the branch currently being built.
  _impl->active_builder = _impl->active_builder->getParent();

  // Retrieve, and remove from the stacks, the two branches of this block.
  popart::Builder *false_branch = _impl->if_false_stack.top();
  _impl->if_false_stack.pop();

  popart::Builder *true_branch = _impl->if_true_stack.top();
  _impl->if_true_stack.pop();

  const popart::TensorId popart_condition = _impl->ids.at(condition);

  auto ai_onnx = _impl->active_builder->aiOnnxOpset11();
  // The else branch is passed before the then branch.
  std::vector<popart::TensorId> if_outputs = ai_onnx.logical_if(
      {popart_condition}, num_outputs, *false_branch, *true_branch);

  return HandleOutput<std::vector<popart::TensorId>>{}(if_outputs, false,
                                                       _impl.get());
}

// Push the name scope `name` on the active builder.
void Compiler::pushNameScope(const char *name) {
  _impl->active_builder->pushNameScope(std::string(name));
}

// Pop a name scope from the active builder.
void Compiler::popNameScope() { _impl->active_builder->popNameScope(); }

// Add an untyped input tensor to the active builder and return the PopTorch
// id under which it was recorded.
TensorId Compiler::addUntypedInputTensor() {
  const popart::TensorId popart_id =
      _impl->active_builder->addUntypedInputTensor();

  // The PopTorch id is the position of the tensor in the id table.
  _impl->ids.push_back(popart_id);
  const auto last_index = _impl->ids.size() - 1;
  return last_index;
}

// Check that the type recorded for the tensor `id` is `dataType`.
//
// Tensors whose recorded type is UNDEFINED are accepted. Raises an error on
// any other mismatch.
void Compiler::assertTensorIs(PopartType dataType, TensorId id) const {
  const PopartType recorded_type = _impl->ids_types.at(id);

  // An undefined type is the rare case of an input tensor which is never
  // used and therefore is not in the IR: the branch is marked as unlikely.
  const bool never_used = recorded_type == PopartType::UNDEFINED;
  if (__builtin_expect(static_cast<std::int64_t>(never_used), 0) != 0) {
    return;
  }

  ERROR_ON_MSG(recorded_type != dataType,
               "One or more input data types have changed since the first model"
               " run. You will need to call \"destroy\" on the model before "
               "running with different input data types.");
}

void Compiler::addMultiConvPart(const std::vector<TensorId> &inputs,
                                const std::vector<int64_t> &dilations,
                                const std::vector<int64_t> &kernel_shape,
                                const std::vector<int64_t> &pads,
                                const std::vector<int64_t> &strides) {
  std::vector<popart::TensorId> args;
  std::transform(inputs.cbegin(), inputs.cend(), std::back_inserter(args),
                 [&](TensorId index) { return _impl->ids[index]; });
  _impl->addMultiConvPart(args, dilations, kernel_shape, pads, strides);
}

// Set the available memory proportions of the multi-convolution being built.
// The values are converted from double to float.
// Raises an error if there is no multi-convolution builder.
void Compiler::setMultiConvAvailableMemoryProportions(
    const std::vector<double> &v) {
  ERROR_ON_MSG(
      _impl->multi_conv_builder == nullptr,
      "Unexpected poptorch.MultiConv option: available_memory_proportions");
  _impl->multi_conv_builder->setAvailableMemoryProportions(
      popart::vXtoY<double, float>(v));
}

// Set the partials types of the multi-convolution being built.
// Raises an error if there is no multi-convolution builder.
void Compiler::setMultiConvPartialsTypes(
    const std::vector<int64_t> &partials_types) {
  ERROR_ON_MSG(_impl->multi_conv_builder == nullptr,
               "Unexpected poptorch.MultiConv option: partials_types");
  _impl->multi_conv_builder->setPartialsTypes(partials_types);
}

// Set the convolution dithering values of the multi-convolution being built.
// Raises an error if there is no multi-convolution builder.
void Compiler::setMultiConvEnableConvDithering(
    const std::vector<int64_t> &conv_ditherings) {
  ERROR_ON_MSG(_impl->multi_conv_builder == nullptr,
               "Unexpected poptorch.MultiConv option: enable_conv_dithering");
  _impl->multi_conv_builder->setEnableConvDithering(conv_ditherings);
}

// Set the plan type of the multi-convolution being built.
// Raises an error if there is no multi-convolution builder.
void Compiler::setMultiConvPlanType(int64_t plan_type) {
  ERROR_ON_MSG(_impl->multi_conv_builder == nullptr,
               "Unexpected poptorch.MultiConv option: plan_type");
  _impl->multi_conv_builder->setPlanType(plan_type);
}

// Set the number of tiles reserved per convolution for the multi-convolution
// being built. The value is narrowed to int.
// Raises an error if there is no multi-convolution builder.
void Compiler::setMultiConvPerConvReservedTiles(int64_t v) {
  ERROR_ON_MSG(_impl->multi_conv_builder == nullptr,
               "Unexpected poptorch.MultiConv option: per_conv_reserved_tiles");
  _impl->multi_conv_builder->setPerConvReservedTiles(static_cast<int>(v));
}

// Set the cycle back off of the multi-convolution being built. The value is
// converted from double to float.
// Raises an error if there is no multi-convolution builder.
void Compiler::setMultiConvCycleBackOff(double c) {
  ERROR_ON_MSG(_impl->multi_conv_builder == nullptr,
               "Unexpected poptorch.MultiConv option: cycle_back_off");
  _impl->multi_conv_builder->setCycleBackOff(static_cast<float>(c));
}

std::vector<TensorId> Compiler::endMultiConv() {
  auto outputs = _impl->endMultiConv();
  const TensorId first =
      HandleOutput<decltype(outputs)>{}(outputs, false, _impl.get());
  std::vector<TensorId> out_ids(outputs.size());
  std::iota(out_ids.begin(), out_ids.end(), first);
  return out_ids;
}

// Add a host op, which calls back into a CPU function, to the graph.
//
// A metadata entry describing the callback (function, host buffers, types and
// shapes) is stored in _impl->callbacks and its address is passed to the
// custom op through an integer attribute. Returns the PopTorch id produced
// when the op's outputs are registered.
TensorId
Compiler::addCPUCallback(const std::vector<TensorId> &inputs,
                         const CallbackMetadata &callback,
                         std::vector<PopartType> input_types,
                         std::vector<std::vector<std::size_t>> input_shapes,
                         std::vector<PopartType> output_types,
                         std::vector<std::vector<std::size_t>> output_shapes) {
  const logging::LogContext ctx{"Compiler::addCPUCallback"};
  logging::trace("Starting CPU callback adding");

  // Translate the PopTorch ids of the inputs into PopART tensor ids.
  std::vector<popart::TensorId> popart_inputs;
  popart_inputs.reserve(inputs.size());
  for (const TensorId input : inputs) {
    popart_inputs.push_back(_impl->ids[input]);
  }

  // Create the metadata entry shared by all the components involved in
  // running the host op.
  _impl->callbacks.emplace_front();
  detail::CallbackInternalMetadata &metadata = _impl->callbacks.front();

  // Python function to call and the python buffers it works on.
  metadata.the_callback = callback.the_callback;
  metadata.input_pointers = callback.input_pointers;
  metadata.output_pointers = callback.output_pointers;

  // Counts how many input streams have been initialised by the poplar buffer
  // callback, so that the python callback can be called once all the inputs
  // are there.
  metadata.number_of_input_streams_inited = 0;

  // Every op gets its own number, used to build a unique handle for poplar.
  detail::CallbackInternalMetadata::number_of_added_ops++;
  metadata.handle =
      "poptorch.host_op_" +
      std::to_string(detail::CallbackInternalMetadata::number_of_added_ops);

  metadata.input_types = std::move(input_types);
  metadata.input_shapes = std::move(input_shapes);
  metadata.output_types = std::move(output_types);
  metadata.output_shapes = std::move(output_shapes);

  // The popart attribute map doesn't support generic types: the address of
  // the metadata is passed as a 64 bit integer instead.
  const std::int64_t metadata_address = static_cast<std::int64_t>(
      reinterpret_cast<std::intptr_t>(&metadata));
  logging::trace("Add CPU callback has added pointer {}", metadata_address);

  std::map<std::string, popart::any> attributes;
  attributes.insert(
      {poptorch_custom_ops::host_op_metadata_attr, metadata_address});

  const auto num_outputs = metadata.output_types.size();
  std::vector<popart::TensorId> popart_outputs =
      _impl->active_builder->customOp(poptorch_custom_ops::host_op, 1,
                                      popart_inputs, num_outputs, attributes);

  // Register the popart outputs as poptorch tensors.
  return HandleOutput<decltype(popart_outputs)>{}(popart_outputs, false,
                                                  _impl.get());
}

// Number of host ops added so far with addCPUCallback(): used to give each
// host op a unique handle.
std::uint32_t detail::CallbackInternalMetadata::number_of_added_ops = 0;

// Forward the detach request to the compiler implementation.
void Compiler::detachFromDevice() { _impl->detachFromDevice(); }

// Forward the attach request to the compiler implementation.
void Compiler::attachToDevice() { _impl->attachToDevice(); }

// Report whether the compiler implementation considers itself attached to
// its device.
bool Compiler::isAttachedToDevice() const {
  return _impl->isAttachedToDevice();
}

// Collect, for every registered input and output tensor (in registration
// order), the timestamps recorded by the StepIO when the tensor was
// requested and when its transfer completed.
Timestamps Compiler::getTimestamps() const {
  const auto &input_ids = _impl->inputs;
  const auto &output_ids = _impl->outputs;

  Timestamps stamps;
  stamps.input.reserve(input_ids.size());
  stamps.input_complete.reserve(input_ids.size());
  stamps.output.reserve(output_ids.size());
  stamps.output_complete.reserve(output_ids.size());

  for (const auto &tensor_id : input_ids) {
    stamps.input.push_back(_impl->stepio.getInputTimestamps(tensor_id));
    stamps.input_complete.push_back(
        _impl->stepio.getInputCompleteTimestamps(tensor_id));
  }

  for (const auto &tensor_id : output_ids) {
    stamps.output.push_back(_impl->stepio.getOutputTimestamps(tensor_id));
    stamps.output_complete.push_back(
        _impl->stepio.getOutputCompleteTimestamps(tensor_id));
  }

  return stamps;
}

// Return the cached cycle count. If no count has been stored yet, raise an
// error explaining why: either cycle-count instrumentation is switched off,
// or the model has not been run.
uint64_t Compiler::getCycleCount() const {
  if (_cycle_count == no_cycles) {
    ERROR_ON_MSG(!_impl->popart_options.instrumentWithHardwareCycleCounter,
                 "Cycle count logging is disabled.");

    ERROR("Please run the model at least once before obtaining cycle count.");
  }

  return _cycle_count;
}

// Number of input tensors currently registered in the implementation.
size_t Compiler::getNumInputs() const { return _impl->inputs.size(); }

// Number of output tensors currently registered in the implementation.
size_t Compiler::getNumOutputs() const { return _impl->outputs.size(); }

// Validate that the current configuration can be used with overlapped IO.
// Raises an error if the execution mode is pipelined, if no IO tiles have
// been allocated, or if the output mode is neither Sum nor All.
void Compiler::verifySettingsForOverlappedIO(PopartOutputMode output_mode) {
  const bool is_pipelined =
      _impl->options.execution_mode == detail::ExecutionMode::Pipelined;
  ERROR_ON_MSG(
      is_pipelined,
      "Overlapped IO is not supported with poptorch.PipelinedExecution. "
      "If you are using only one IPU, please switch to "
      "poptorch.ShardedExecution.");

  const bool has_io_tiles = _impl->popart_options.numIOTiles != 0;
  ERROR_ON_MSG(!has_io_tiles,
               "No IO tiles allocated. You must allocate at least 32 IO tiles "
               "using poptorch.Options().TensorLocations.numIOTiles.");

  const bool mode_is_supported = output_mode == PopartOutputMode::Sum ||
                                 output_mode == PopartOutputMode::All;
  ERROR_ON_MSG(
      !mode_is_supported,
      "Unsupported output mode for overlapped IO. Please switch output "
      "mode to poptorch.OutputMode.All or poptorch.OutputMode.Sum.");
}

// Apply the given PopTorch log level to every PopART logging module whose
// enum value is below popart::logging::Module::none.
void setPopartLogLevel(logging::Level level) {
  const auto popart_level = static_cast<popart::logging::Level>(level);
  const auto module_limit =
      static_cast<uint64_t>(popart::logging::Module::none);

  uint64_t module_idx = 0;
  while (module_idx < module_limit) {
    popart::logging::setLogLevel(
        static_cast<popart::logging::Module>(module_idx), popart_level);
    ++module_idx;
  }
}

// Raise an error of the requested kind (PopTorch, PopART, Poplibs or one of
// several Poplar exception types).
//
// Two nested logging contexts ("topLevel" and "bottomLevel") are alive at
// the point where the exception is raised. An unrecognised type falls out of
// the switch and raises a PopTorch error with only the top-level context
// still alive.
// NOTE(review): presumably only used by tests to exercise the error
// reporting paths -- confirm against callers.
void throwTestError(TestErrorType type) {
  const logging::LogContext ctx_top{"throwTestError::topLevel"};
  {
    // Inner scope so that the bottom-level context is destroyed before the
    // fall-through error below.
    const logging::LogContext ctx{"throwTestError::bottomLevel"};
    switch (type) {
    case TestErrorType::Poptorch: {
      ERROR("This is a PopTorch error");
    }
    case TestErrorType::Popart: {
      throw popart::error("This is a Popart error");
    }
    case TestErrorType::PopartInternal: {
      throw popart::internal_error("This is a Popart error");
    }
    case TestErrorType::Poplibs: {
      throw poputil::poplibs_error("This is a Poplibs error");
    }
    case TestErrorType::PoplarUnrecoverable: {
      throw poplar::unrecoverable_runtime_error("This is not recoverable");
    }
    case TestErrorType::PoplarUnknown: {
      throw poplar::unknown_runtime_error("Don't know what happened");
    }
    case TestErrorType::PoplarRecoverableFullReset: {
      throw poplar::recoverable_runtime_error(
          poplar::RecoveryAction::FULL_RESET, "Reboot needed");
    }
    case TestErrorType::PoplarLinkError: {
      // The second argument is the linker output attached to the error.
      throw poplar::link_error("Link error",
                               "Library -lfoo not found\ncheck path");
    }
    default: {
      break;
    }
    }
  }
  ERROR("Unknown TestErrorType");
}

namespace {
// Concrete ExceptionInfo used to carry the details of a PopART / Poplar /
// Poplibs exception (message, type name, stack, source location, recovery
// action and category) across the PopTorch boundary.
//
// All fields are public and filled in by rethrowPopartOrPoplarException().
class PopExceptionInfo : public ExceptionInfo {
public:
  ~PopExceptionInfo() override = default;
  // Error message.
  const char *what() const noexcept override;
  // Name identifying the kind of exception (e.g. "popart_exception").
  const char *type() const override;
  // Number of stack lines captured by extractStack().
  int64_t stackDepth() const override;
  // Stack line at the given level; throws std::out_of_range if level is not
  // in [0, stackDepth()).
  const char *stack(int64_t level) const override;
  // File and line where the exception was intercepted.
  const char *filename() const override;
  uint64_t line() const override;
  // Textual recovery action (empty unless one was provided).
  const char *recoveryAction() const override;
  ErrorCategory category() const override;
  // Populate mstack from the stack report attached to a PopART error.
  void extractStack(const popart::error &e);

  std::string mwhat;
  std::string mtype;
  std::vector<std::string> mstack;
  std::string mfilename;
  // Default-initialised so that a freshly constructed object never exposes
  // indeterminate values through line() / category().
  uint64_t mline = 0;
  std::string mrecovery_action;
  ErrorCategory mcategory = ErrorCategory::Other;
};

const char *PopExceptionInfo::what() const noexcept { return mwhat.c_str(); }

const char *PopExceptionInfo::type() const { return mtype.c_str(); }

int64_t PopExceptionInfo::stackDepth() const {
  return static_cast<int64_t>(mstack.size());
}

const char *PopExceptionInfo::stack(int64_t level) const {
  return mstack.at(level).c_str();
}

const char *PopExceptionInfo::filename() const { return mfilename.c_str(); }

uint64_t PopExceptionInfo::line() const { return mline; }

const char *PopExceptionInfo::recoveryAction() const {
  return mrecovery_action.c_str();
}

ErrorCategory PopExceptionInfo::category() const { return mcategory; }

void PopExceptionInfo::extractStack(const popart::error &e) {
  std::istringstream iss(e.stackreport());
  std::string l;
  // PopART adds a numbered prefix to each stack line: remove it:
  // [0] top_level_fn()
  // [1] main()
  //
  // Becomes:
  //
  // top_level_fn()
  // main()
  while (std::getline(iss, l)) {
    // Keep everything after the first space; lines without a space are kept
    // unchanged.
    const size_t first_space = l.find_first_of(' ');
    const size_t start =
        (first_space == std::string::npos) ? 0 : first_space + 1;
    mstack.push_back(l.substr(start));
  }
}
} // namespace

// Rethrow the given exception and, if it is a PopART, Poplar or Poplibs
// exception, translate it into a PopExceptionInfo which is then thrown.
//
// filename / line identify where the exception was intercepted; the filename
// is shortened with logging::shortPoptorchFilename().
//
// Any other exception type is deliberately left alone: the function simply
// returns and the caller remains responsible for eptr.
void rethrowPopartOrPoplarException(const std::exception_ptr &eptr,
                                    const char *filename, uint64_t line) {
  PopExceptionInfo pei;
  pei.mfilename = logging::shortPoptorchFilename(filename);
  pei.mline = line;
  pei.mcategory = ErrorCategory::Other;
  try {
    std::rethrow_exception(eptr);
  } catch (const popart::internal_error &ex) {
    // Handled before popart::error so that it gets its own type name.
    pei.mwhat = ex.what();
    pei.mtype = "popart_internal_exception";
    pei.extractStack(ex);
  } catch (const popart::error &ex) {
    pei.mwhat = ex.what();
    pei.mtype = "popart_exception";
    pei.extractStack(ex);
  } catch (const poplar::link_error &ex) {
    // Note: for some reason this error doesn't set its type in Poplar
    pei.mwhat = ex.what();
    pei.mwhat += ". Output: " + ex.output;
    pei.mtype = "poplar_link_error";
  } catch (const poplar::recoverable_runtime_error &ex) {
    pei.mwhat = ex.what();
    pei.mtype = "poplar_";
    pei.mtype += ex.type;
    pei.mcategory = ErrorCategory::RuntimeRecoverable;
    pei.mrecovery_action = poplar::toString(ex.getRecoveryAction());
  } catch (const poplar::unrecoverable_runtime_error &ex) {
    pei.mwhat = ex.what();
    pei.mtype = "poplar_";
    pei.mtype += ex.type;
    pei.mcategory = ErrorCategory::RuntimeUnrecoverable;
  } catch (const poplar::poplar_error &ex) {
    // Generic Poplar error: keep the default category.
    pei.mwhat = ex.what();
    pei.mtype = "poplar_";
    pei.mtype += ex.type;
  } catch (const poputil::poplibs_error &ex) {
    pei.mwhat = ex.what();
    pei.mtype = "poplibs_exception";
  } catch (...) {
    // Not one of ours: nothing to translate.
    return;
  }
  throw pei;
}

} // namespace popart_compiler
} // namespace poptorch


================================================
FILE: popart_compiler/source/CompilerImpl.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <chrono>
#include <memory>
#include <numeric>
#include <vector>

#include <spdlog/fmt/fmt.h>
#include <spdlog/fmt/ostr.h>

#include <popart/adam.hpp>
#include <popart/adaptive.hpp>
#include <popart/builder.hpp>
#include <popart/error.hpp>
#include <popart/half.hpp>
#include <popart/ir.hpp>
#include <popart/op/convbase.hpp>
#include <popart/op/identity.hpp>
#include <popart/op/matmul.hpp>
#include <popart/op/nll.hpp>
#include <popart/optimizer.hpp>
#include <popart/popx/devicex.hpp>
#include <popart/popx/devicexmanager.hpp>
#include <popart/session.hpp>
#include <popart/sgd.hpp>
#include <popart/tensorinfo.hpp>
#include <popart/tensors.hpp>
#include <poprithms/ndarray/unfold.hpp>
#include <poptorch_logging/Error.hpp>

#include "popart_compiler/Compiler.hpp"
#include "popart_compiler/CompilerImpl.hpp"
#include "popart_compiler/CompilerOptions.hpp"
#include "popart_compiler/MultiConvBuilder.hpp"
#include "popart_compiler/PopartEnums.hpp"
#include "popart_compiler/Utils.hpp"

namespace poptorch {
namespace popart_compiler {
namespace {

// Render a list of strings as "[a, b, c]" ("[]" when the list is empty).
std::string toString(const std::vector<std::string> &vec) {
  std::stringstream out;
  out << "[";
  for (auto it = vec.begin(); it != vec.end(); ++it) {
    // Separator goes before every element except the first.
    if (it != vec.begin()) {
      out << ", ";
    }
    out << *it;
  }
  out << "]";
  return out.str();
}

// Name of the optimizer family a given OptimizerType belongs to. Variants of
// the same family (e.g. with / without bias correction, centered or not)
// share a single name. Raises an error for any other value.
std::string toString(OptimizerType type) {
  switch (type) {
  // SGD variants keep distinct names.
  case OptimizerType::SGD1:
    return "SGD1";
  case OptimizerType::SGD2:
    return "SGD2";

  case OptimizerType::ADAM:
    return "ADAM";

  case OptimizerType::ADAMW_NO_BIAS:
  case OptimizerType::ADAMW:
    return "ADAMW";

  case OptimizerType::LAMB_NO_BIAS:
  case OptimizerType::LAMB:
    return "LAMB";

  case OptimizerType::RMSPROP:
  case OptimizerType::RMSPROP_CENTERED:
    return "RMSPROP";

  default:
    ERROR("Unreachable: Unsupported optimizer.");
  }
}

// If is_default: return the list of keys accepted by the
// `const std::map<std::string, std::pair<float, bool>> &params` parameter
// of the Popart constructor: it is usually the list of OptimizerValue
// accepted by the explicit constructor.
//
// Else: return the list of keys accepted by insertSpecific. (It's usually
// defined in the optimizer's cpp file in a function called
// getSpecificNames().)
//
// In every family the default list starts with the "default"-prefixed
// counterparts of the per-group names, in the same order, followed by any
// extra default-only names (e.g. "lossScaling"). The OptimizerParameters
// constructor relies on this positional correspondence.
//
// Raises an error for an unsupported optimizer type.
// TODO(T33686): these names should be provided by PopART.
std::vector<std::string> getAttributeNames(OptimizerType type,
                                           bool is_default) {
  switch (type) {
  case OptimizerType::SGD1:
  case OptimizerType::SGD2: {
    if (is_default) {
      return {
          "defaultLearningRate", "defaultWeightDecay",     "defaultMomentum",
          "defaultDampening",    "defaultVelocityScaling", "nesterov",
          "lossScaling"};
    }
    return {"learningRate", "weightDecay",     "momentum",
            "dampening",    "velocityScaling", "nesterov"};
  }
  case OptimizerType::LAMB:
  case OptimizerType::LAMB_NO_BIAS: {
    if (is_default) {
      return {"defaultLearningRate", "defaultWeightDecay",
              "defaultBeta1",        "defaultBeta2",
              "defaultEps",          "defaultMaxWeightNorm",
              "lossScaling"};
    }
    return {"learningRate", "weightDecay", "beta1",
            "beta2",        "eps",         "maxWeightNorm"};
  }
  case OptimizerType::ADAM:
  case OptimizerType::ADAMW:
  case OptimizerType::ADAMW_NO_BIAS: {
    if (is_default) {
      return {"defaultLearningRate", "defaultWeightDecay", "defaultBeta1",
              "defaultBeta2",        "defaultEps",         "lossScaling"};
    }
    return {"learningRate", "weightDecay", "beta1", "beta2", "eps"};
  }
  case OptimizerType::RMSPROP_CENTERED:
  case OptimizerType::RMSPROP: {
    if (is_default) {
      return {"defaultLearningRate", "defaultWeightDecay", "defaultAlpha",
              "defaultMomentum",     "defaultEps",         "lossScaling"};
    }
    return {"learningRate", "weightDecay", "alpha", "momentum", "eps"};
  }
  default:
    ERROR("Unreachable: Unsupported optimizer.");
  }
}

// Position of the first element of vec equal to v, or -1 if there is none.
int indexOf(const std::vector<std::string> &vec, const std::string &v) {
  for (std::size_t pos = 0; pos < vec.size(); ++pos) {
    if (vec[pos] == v) {
      return static_cast<int>(pos);
    }
  }
  return -1;
}

// Return, in their original order, the entries of `expected` that do not
// appear in `provided`.
std::vector<std::string> vectorDiff(const std::vector<std::string> &provided,
                                    const std::vector<std::string> &expected) {
  std::vector<std::string> absent;
  for (auto it = expected.begin(); it != expected.end(); ++it) {
    const bool was_provided = indexOf(provided, *it) >= 0;
    if (was_provided) {
      continue;
    }
    absent.push_back(*it);
  }
  return absent;
}

// Convert a Poptorch Optimizer into a map of parameters + types that
// can be understood by the Popart Optimizer / insertSpecific.
struct OptimizerParameters {
public:
  // Build the parameters from `opt`. When is_default is true the attribute
  // names are remapped to the names used for PopART's default values;
  // otherwise the per-group names are used. Raises an error on unexpected,
  // duplicated or missing attributes.
  OptimizerParameters(const Optimizer &opt, bool is_default);
  // Human-readable, single-line summary of the optimizer settings.
  std::string debug() const;
  // Optimizer kind, copied from the source Optimizer.
  OptimizerType type;
  // Whether the accumulator types below were provided; debug() only prints
  // them when this is set.
  bool accum_types_provided;
  // FLOAT16 or FLOAT, selected from the corresponding "*_is_half" flag of
  // the source Optimizer.
  popart::DataType accum_type;
  popart::DataType first_order_momentum_accum_type;
  popart::DataType second_order_momentum_accum_type;
  // Copied verbatim from the source Optimizer.
  bool use_tf_variant;
  float max_grad_norm;
  // PopART attribute name -> (value, is_const).
  std::map<std::string, std::pair<float, bool>> params;
};

// Build a one-line description of the optimizer: its family name, every
// parameter as "name=value" (tagged " (const)" when constant), the
// accumulator types when they were provided, and the remaining flags.
std::string OptimizerParameters::debug() const {
  std::stringstream out;
  out << toString(type);

  for (auto it = params.begin(); it != params.end(); ++it) {
    const auto &value_and_constness = it->second;
    out << ", " << it->first << "=" << value_and_constness.first;
    const bool is_const = value_and_constness.second;
    if (is_const) {
      out << " (const)";
    }
  }

  if (accum_types_provided) {
    out << ", accumType=" << accum_type;
    out << ", firstOrderMomentumAccumType="
        << first_order_momentum_accum_type;
    out << ", secondOrderMomentumAccumType="
        << second_order_momentum_accum_type;
  }

  out << ", useTfVariant=" << use_tf_variant;
  out << ", maxGradNorm=" << max_grad_norm;
  return out.str();
}

// Translate a PopTorch Optimizer into PopART-ready parameters.
//
// opt:        source optimizer description (type, flags and a list of named
//             parameters with a value and a constness flag).
// is_default: true when `opt` holds the default values (PopART "default*"
//             names), false when it describes a parameter group.
//
// Raises an error if a parameter name is not recognised, if the same name is
// provided twice, or if the number of provided parameters differs from the
// number expected for this optimizer type.
OptimizerParameters::OptimizerParameters(const Optimizer &opt, bool is_default)
    : type(opt.type), accum_types_provided(opt.accum_types_provided),
      accum_type(opt.accum_type_is_half ? popart::DataType::FLOAT16
                                        : popart::DataType::FLOAT),
      first_order_momentum_accum_type(
          opt.first_order_momentum_accum_type_is_half
              ? popart::DataType::FLOAT16
              : popart::DataType::FLOAT),
      second_order_momentum_accum_type(
          opt.second_order_momentum_accum_type_is_half
              ? popart::DataType::FLOAT16
              : popart::DataType::FLOAT),
      use_tf_variant(opt.use_tf_variant), max_grad_norm(opt.max_grad_norm) {
  // In PopART the attributes which can be specified per group are prefixed
  // with "default" when they apply to all the tensors, for example
  // learningRate -> defaultLearningRate.
  // To keep things simple the PopTorch frontend always uses the group name,
  // so in the default case we need to remap the PopTorch names to the PopART
  // ones. The remaining default-only attributes (e.g. lossScaling) keep
  // their PopART name.
  std::vector<std::string> poptorch_names = getAttributeNames(opt.type, false);
  std::vector<std::string> popart_names =
      getAttributeNames(opt.type, is_default);
  if (is_default) {
    // Append the default-only names so that poptorch_names[i] is the
    // frontend name of popart_names[i] for every index.
    poptorch_names.reserve(popart_names.size());
    for (std::uint64_t i = poptorch_names.size(); i < popart_names.size();
         ++i) {
      poptorch_names.push_back(popart_names[i]);
    }
  }
  std::vector<std::string> provided_names;
  provided_names.reserve(poptorch_names.size());
  for (const auto &p : opt.parameters) {
    const std::string name = reinterpret_cast<const char *>(p.name);
    provided_names.push_back(name);
    // Look the frontend name up to find the matching PopART name.
    const auto idx = indexOf(poptorch_names, name);
    ERROR_ON_MSG(idx < 0,
                 "Unexpected "
                     << (is_default ? "" : "group ") << "attribute " << name
                     << " for optimizer " << toString(type)
                     << ", allowed values: " << toString(poptorch_names));
    // emplace() fails if the same attribute was already provided.
    ERROR_ON(
        !params.emplace(popart_names[idx], std::make_pair(p.value, p.is_const))
             .second);
  }
  // Every name was valid and unique, so a size mismatch means some
  // attributes are missing.
  ERROR_ON_MSG(opt.parameters.size() != poptorch_names.size(),
               "Missing attributes: "
                   << toString(type) << " optimizers require values for "
                   << toString(vectorDiff(provided_names, poptorch_names)));
}

void assertSingleInstanceMaxNumIPUs(std::size_t num_ipus) {
  ERROR_ON_MSG(num_ipus > 64, "Too many IPUs requested ("
                                  << num_ipus
                                  << "). Experiments that need more than 64 "
                                     "IPUs require distributed execution.");
}

} // namespace

namespace detail {

// Called when the data for input `id` is requested: records a timestamp and
// returns a view on the next `num_elems` elements of the registered host
// buffer. The prefetch and broadcast flags are ignored.
popart::ConstVoidData StepIO::in(popart::TensorId id, int64_t num_elems,
                                 bool /*prefetch*/, bool /*isBroadcast*/) {
  timestamp(&_in_times, id);
  constexpr bool is_input = true;
  return get<popart::ConstVoidData>(id, &_inputs_info, num_elems, is_input);
}

void StepIO::inComplete(popart::TensorId id, int64_t num_elems,
                        bool /*isBroadcast*/) {
  (void)num_elems;
  timestamp(&_in_complete_times, id);
}

// Called when a destination for output `id` is requested: records a
// timestamp and returns a writable view on the next `num_elems` elements of
// the registered host buffer.
popart::MutableVoidData StepIO::out(popart::TensorId id, int64_t num_elems) {
  timestamp(&_out_times, id);
  constexpr bool is_input = false;
  return get<popart::MutableVoidData>(id, &_outputs_info, num_elems,
                                      is_input);
}

// Called once output `id` has been written: only records a timestamp.
void StepIO::outComplete(popart::TensorId id) {
  timestamp(&_out_complete_times, id);
}

// Cache the TensorInfo (data type + shape) describing `array` under `id`.
// Does nothing if an entry already exists for that id.
void StepIO::computeStepDataInfo(const popart::TensorId &id,
                                 popart::IArray *array) {
  const bool already_known = _step_data_info.find(id) != _step_data_info.end();
  if (already_known) {
    return;
  }

  // Query the array through the accessor: type first, then every dimension.
  const auto dtype = AccessorType::getArrayDataType(*array);
  const auto rank = AccessorType::getArrayRank(*array);

  std::vector<int64_t> dims;
  for (size_t axis = 0; axis < rank; ++axis) {
    dims.push_back(AccessorType::getArrayDim(*array, axis));
  }

  _step_data_info.insert({id, popart::TensorInfo(dtype, dims)});
}

// Register the host buffers to use for the next run.
//
// The previous input / output registrations are discarded, the timestamps
// recorded for each given tensor are cleared (creating the entry if needed)
// and the tensor's TensorInfo is cached if it was not already known.
void StepIO::populate(const TensorArrayMap &inputs,
                      const TensorArrayMap &outputs) {
  _inputs_info.clear();
  for (const auto &input : inputs) {
    // The three zeros initialise the remaining fields of the array info.
    // NOTE(review): presumably offset, end_offset and replica_idx as used by
    // get() -- confirm against the struct declaration.
    _inputs_info.insert({input.first, {input.second, 0, 0, 0}});
    _in_times[input.first].clear();
    _in_complete_times[input.first].clear();
    computeStepDataInfo(input.first, &input.second);
  }

  _outputs_info.clear();
  for (const auto &output : outputs) {
    _outputs_info.insert({output.first, {output.second, 0, 0, 0}});
    _out_times[output.first].clear();
    _out_complete_times[output.first].clear();
    computeStepDataInfo(output.first, &output.second);
  }
}

// Return a view (of type T, e.g. popart::ConstVoidData or
// popart::MutableVoidData) on the next `num_elems` elements of the host
// buffer registered for tensor `id`.
//
// id:        tensor whose buffer is requested.
// map:       registered buffers to look the tensor up in.
// num_elems: number of elements requested by this call.
// is_input:  true for inputs, which additionally get a per-replica start
//            offset (see below).
//
// Raises an error if the tensor has no registered buffer or no cached
// TensorInfo.
template <typename T>
T StepIO::get(const popart::TensorId &id, TensorArrayInfo *map,
              int64_t num_elems, bool is_input) {
  auto it = map->find(id);
  ERROR_ON_MSG(it == map->end(), "Internal Compiler Error in StepIO");
  auto &array_info = it->second;

  auto it2 = _step_data_info.find(id);
  ERROR_ON_MSG(it2 == _step_data_info.end(),
               "Internal Compiler Error in StepIO");

  T step_data;
  step_data.info = it2->second;

  uint8_t *ptr =
      static_cast<uint8_t *>(AccessorType::getDataPointer(array_info.array));

  // Size in bytes of the chunk handed out by this call.
  const int64_t num_bytes =
      static_cast<int64_t>(step_data.info.getDataTypeInfo()->nbytes()) *
      num_elems;
  // For inputs, whenever the current offset has reached the end offset (this
  // is also the case right after populate(), where both are 0), work out
  // which slice of the buffer to read next based on the replica index and
  // the input grouping, then move on to the next replica.
  if (is_input && array_info.offset == array_info.end_offset) {
    int64_t tidx;
    const int64_t input_group_count = _replica_count / _input_group_size;

    // tidx is the index of the slice: the buffer is treated as
    // input_group_count equally sized slices.
    if (_input_cgt == popart::CommGroupType::Consecutive) {
      tidx = array_info.replica_idx / _input_group_size;
    } else {
      ERROR_ON_MSG(_input_cgt != popart::CommGroupType::Orthogonal,
                   "Unexpected input CommGroupType " << _input_cgt);
      tidx = array_info.replica_idx % input_group_count;
    }
    array_info.offset = tidx * (step_data.info.nbytes() / input_group_count);
    // The modulo makes the end of the last slice wrap around to 0, matching
    // the wrap-around applied to the offset below.
    array_info.end_offset =
        ((tidx + 1) * (step_data.info.nbytes() / input_group_count)) %
        step_data.info.nbytes();
    array_info.replica_idx = (array_info.replica_idx + 1) % _replica_count;
  }

  // Hand out the chunk at the current offset, then advance the offset
  // (wrapping around at the end of the buffer) for the next call.
  ptr += array_info.offset;
  array_info.offset = (array_info.offset + num_bytes) % step_data.info.nbytes();

  step_data.data = ptr;
  return step_data;
}

// Append the current system-clock time to the timestamps of tensor `id`.
// The value is expressed in seconds since the clock's epoch, with
// millisecond resolution. Throws if `id` has no entry in `time`.
void StepIO::timestamp(TensorTimestamps *time, const popart::TensorId &id) {
  const auto since_epoch =
      std::chrono::system_clock::now().time_since_epoch();
  const auto millis =
      std::chrono::duration_cast<std::chrono::milliseconds>(since_epoch)
          .count();
  const auto seconds = static_cast<double>(millis) / 1000;
  time->at(id).push_back(seconds);
}

// Store the replica grouping configuration used by get() to decide which
// part of an input buffer each replica reads.
void StepIO::setInputGroupings(popart::CommGroupType type,
                               int64_t input_group_size,
                               int64_t replica_count) {
  _replica_count = replica_count;
  _input_group_size = input_group_size;
  _input_cgt = type;
}

// Ids of the registered parameters, in registration order.
const std::vector<popart::TensorId> &WeightsIO::parameterIds() const {
  return _weights_order;
}

// Whether a parameter with this id has been registered.
bool WeightsIO::contains(popart::TensorId id) const {
  const auto entry = _weights.find(id);
  const bool is_registered = entry != _weights.end();
  return is_registered;
}

// Data descriptor registered for parameter `id`. Uses at(), so an unknown id
// raises an exception instead of creating an entry.
popart::MutableVoidData WeightsIO::weight(popart::TensorId id) const {
  return _weights.at(id);
}

// Register a new parameter and remember its position in the registration
// order. Raises an error if the id is already registered. Only the tensor
// info is stored here; the data pointer is set later by updateData().
void WeightsIO::registerParameter(const popart::TensorId &id,
                                  const popart::TensorInfo &info) {
  const bool duplicate = contains(id);
  ERROR_ON(duplicate);

  auto &entry = _weights[id];
  entry.info = info;
  _weights_order.push_back(id);
}

// Point each registered parameter at its new host buffer. Buffers are
// matched to parameters by registration order; a null entry leaves the
// corresponding parameter's pointer unchanged. Raises an error if the number
// of buffers differs from the number of registered parameters.
void WeightsIO::updateData(const std::vector<void *> &host_buffers) {
  ERROR_ON(host_buffers.size() != _weights_order.size());

  std::uint64_t slot = 0;
  for (void *buffer : host_buffers) {
    if (buffer != nullptr) {
      _weights[_weights_order[slot]].data = buffer;
    }
    ++slot;
  }
}

// Ordering on constants: by data type first, then by shape
// (lexicographically), then by the raw bytes of the data. Raises an error if
// either operand stores its own data.
bool ConstVoidDataLessThan::operator()(const popart::ConstVoidData &lhs,
                                       const popart::ConstVoidData &rhs) const {
  // Optional data should not be set
  ERROR_ON(lhs.storesData());
  ERROR_ON(rhs.storesData());

  // 1) Data type.
  const auto lhs_type = lhs.info.dataType();
  const auto rhs_type = rhs.info.dataType();
  if (lhs_type != rhs_type) {
    return lhs_type < rhs_type;
  }

  // 2) Shape: vector comparison, i.e. std::lexicographical_compare.
  const auto &lhs_dims = lhs.info.shape();
  const auto &rhs_dims = rhs.info.shape();
  if (lhs_dims != rhs_dims) {
    return lhs_dims < rhs_dims;
  }

  // 3) Same type and shape: the sizes must match, compare the contents.
  const auto num_bytes = lhs.info.nbytes();
  ERROR_ON(num_bytes != rhs.info.nbytes());
  const int byte_order = std::memcmp(lhs.data, rhs.data, num_bytes);
  return byte_order < 0;
}

// Release the device on destruction if one was created and is still
// attached.
CompilerImpl::~CompilerImpl() {
  if (!_device) {
    return;
  }
  if (isAttachedToDevice()) {
    detachFromDevice();
  }
}

// Tag the given tensors with the placement attributes of the active block:
// pipeline stage (pipelined / sharded execution) or execution phase (phased
// execution), plus the virtual graph (IPU). The active IPU is also recorded
// as used. Raises an error if there is no active block, if the execution
// mode is not one of the above, or if no phase is active in phased mode.
void CompilerImpl::setExecutionStrategyAttributes(
    const std::set<popart::TensorId> &tensors) {
  ERROR_ON_MSG(active_ipu < 0,
               "No active Block, all the ops must belong to a Block");

  const auto mode = options.execution_mode;
  if (mode == ExecutionMode::Pipelined || mode == ExecutionMode::Sharded) {
    active_builder->pipelineStage(tensors, active_stage);
  } else if (mode == ExecutionMode::Phased) {
    ERROR_ON(active_phase < 0);
    active_builder->executionPhase(tensors, active_phase);
  } else {
    ERROR("Invalid ExecutionMode active");
  }

  used_ipus.insert(active_ipu);
  active_builder->virtualGraph(tensors, active_ipu);
}

// Build an explanatory suffix for "failed to acquire device" errors.
//
// Returns a message if the system has no IPU at all, or (when the user did
// not select a specific device id) if no device with the required number of
// IPUs exists; otherwise returns an empty string. Raises an error if the
// number of IPUs has not been computed yet.
std::string CompilerImpl::checkSystemConfig() const {
  ERROR_ON_MSG(num_ipus == 0, "Must call createDevice() first");
  auto &device_manager = popart::DeviceManager::createDeviceManager();

  const bool no_ipu_at_all = device_manager.enumerateDevices().empty();
  if (no_ipu_at_all) {
    return "\nNo IPU detected in the system. \nFor more information use "
           "the Graphcore command-line tool `gc-monitor`.";
  }

  // Only look for a matching configuration when the device was not picked
  // explicitly by id.
  const bool user_picked_device = options_set.count("ipu_id") != 0u;
  if (!user_picked_device &&
      device_manager.enumerateDevices(options.sync_pattern, num_ipus)
          .empty()) {
    return fmt::format("\nNo device found on the system with {} IPUs: the "
                       "configuration needs changing",
                       num_ipus);
  }

  return "";
}

// Decide whether to run on the IPU model.
//
// A value set explicitly by the application always wins and is only logged.
// Otherwise the model is enabled exactly when one of the IPU model
// environment variables is enabled.
void CompilerImpl::updateUseModelConfig() {
  const bool set_by_user = options_set.count("use_model") != 0u;
  if (set_by_user) {
    logging::info("From the user configuration: Ipu model: {}",
                  options.ipu_model ? "Enabled" : "Disabled");
    return;
  }

  // Fallback: the POPTORCH_IPU_MODEL family of environment variables.
  options.ipu_model = ipuModelEnvironmentVariableIsEnabled() ||
                      ipuSmallModelEnvironmentVariableIsEnabled();
}

// Number of pipeline stages of the model. Only valid in pipelined execution
// mode (raises an error otherwise).
//
// There is one forward stage per IPU switch plus one (in PopTorch two
// consecutive blocks on the same IPU are merged). When training, every
// forward stage except the last one also has a backward stage, the last IPU
// running its backward pass as part of the same stage. This is an upper
// bound: tensor.detach() could cut off stages, but such edge cases are
// ignored.
std::uint64_t CompilerImpl::numPipelineStages() {
  ERROR_ON(options.execution_mode != ExecutionMode::Pipelined);

  const std::uint64_t forward_stages = num_ipu_switches + 1;
  return is_training ? forward_stages * 2 - 1 : forward_stages;
}

// Associate a host buffer with output `id`.
//
// Host-side constants are copied straight into `ptr`. Otherwise ownership of
// `memory` is taken and the array is registered as the destination for the
// corresponding PopART tensor; if that tensor already has a destination,
// `ptr` is recorded as a duplicate instead.
void CompilerImpl::addMemoryToOutput(TensorId id, void *ptr,
                                     std::unique_ptr<popart::IArray> &&memory) {
  if (isHostSideConstant(id)) {
    getHostSideConstant(id).copyDataTo(ptr);
    return;
  }

  // Keep the array alive for as long as the compiler.
  memory_manager.push_back(std::move(memory));

  popart::TensorId const popart_id = ids[id];
  const bool newly_registered =
      popart_outgoing.insert({popart_id, *memory_manager.back().get()}).second;
  if (newly_registered) {
    return;
  }

  // Insertion in the map failed because there is already a pointer associated
  // with that id.
  outgoing_duplicates[popart_id].push_back(ptr);
}

// Mark the first tensor of `tensors` as an output of the active builder.
// Uses at(), so an empty vector raises an exception.
void CompilerImpl::addOutputTensor(
    const std::vector<popart::TensorId> &tensors) {
  active_builder->addOutputTensor(tensors.at(0));
}

// Reshape the first tensor of `tensors` to `shape`. The target shape is
// passed to the reshape op as a 1D INT64 constant tensor.
popart::TensorId
CompilerImpl::reshape(const std::vector<popart::TensorId> &tensors,
                      const std::vector<int64_t> &shape) {
  auto opset = active_builder->aiOnnxOpset11();

  // The constant holding the shape is 1D, with one element per dimension.
  const auto num_dims = static_cast<int64_t>(shape.size());
  const popart::Shape dims_of_shape = {num_dims};
  const popart::TensorInfo shape_info("INT64", dims_of_shape);
  auto shape_tensor = opset.constant({shape.data(), shape_info});

  return opset.reshape({tensors.at(0), shape_tensor},
                       getDebugContext("Reshape"));
}

// Add a user-provided custom op to the active builder.
//
// args:        input tensors of the op.
// op / domain: name and domain identifying the custom op.
// version:     version forwarded to the builder's customOp().
// num_outputs: number of outputs the op produces.
// attributes:  attributes to attach to the op (must not be null).
//
// Returns the ids of the output tensors.
std::vector<popart::TensorId> CompilerImpl::customOperation(
    const std::vector<popart::TensorId> &args, const std::string &op,
    const std::string &domain, std::int64_t version, std::int64_t num_outputs,
    const std::shared_ptr<std::vector<PopartAttribute>> &attributes) {
  logging::info("Adding custom op with {} inputs ",
                static_cast<std::int32_t>(args.size()));

  // Convert to the format required for Popart. We cannot use popart::any
  // as a known type externally in poptorch to avoid needing popart headers.
  std::map<std::string, popart::any> attributes_map;
  for (auto &attribute : *attributes) {
    attributes_map[attribute.name()] = *(attribute.getValue());
  }

  if (!attributes->empty()) {
    std::stringstream ss;
    ss << "Attributes: ";

    for (auto &attribute : *attributes) {
      ss << attribute.name();

      if (&attribute != &attributes->back()) {
        ss << ", ";
      }
    }
    // The attribute names are user-provided: pass them as an argument rather
    // than as the format string so that any braces they contain are printed
    // verbatim instead of being interpreted as format fields.
    logging::trace("{}", ss.str());
  }

  const std::int32_t num_inputs = static_cast<std::int32_t>(args.size());
  popart::OperatorIdentifier const id = {domain, op, 1, num_inputs};

  return active_builder->customOp(id, version, args, num_outputs,
                                  attributes_map, getDebugContext(op));
}

// Mark a single tensor as a recomputation checkpoint and return the id of
// the checkpointed tensor. Exactly one input tensor is accepted.
popart::TensorId CompilerImpl::recomputationCheckpoint(
    const std::vector<popart::TensorId> &tensors) {
  // Popart is simply a for loop over vector inputs and it is better for the
  // PyTorch Graph to avoid Tuple/List packs and unpacks
  const bool single_input = tensors.size() == 1;
  ERROR_ON(!single_input);

  const auto checkpointed = active_builder->checkpointOutput(tensors);
  return checkpointed[0];
}

// Add a constant tensor to the active builder, reusing the tensor previously
// created for an identical constant on the same builder if there is one.
// The input tensors are unused.
popart::TensorId
CompilerImpl::tensorConstant(const std::vector<popart::TensorId> &tensors,
                             const PopartConstant &constant) {
  UNUSED(tensors);

  // One cache per builder: this is effectively one cache per subgraph as
  // constants only exist on one graph.
  auto &cache(_constants_cache[active_builder]);

  // Cache hit: an identical constant has already been added.
  if (cache.count(constant.getPopartData()) != 0u) {
    return cache[constant.getPopartData()];
  }

  // Cache miss: take a private copy of the data, owned by the compiler, and
  // build the constant on top of it.
  const size_t num_bytes = constant.getPopartData().info.nbytes();
  _constant_cloned_data.emplace_back(new char[num_bytes]);
  auto *cloned_bytes =
      reinterpret_cast<void *>(&_constant_cloned_data.back()[0]);
  std::memcpy(cloned_bytes, constant.getPopartData().data, num_bytes);
  popart::ConstVoidData const cloned_constant(cloned_bytes,
                                              constant.getPopartData().info);

  auto opset = active_builder->aiOnnxOpset11();
  popart::TensorId new_tensor =
      opset.constant(cloned_constant, false /*is_value_sparse*/,
                     getDebugContext("Constant"));

  cache[cloned_constant] = new_tensor;
  return new_tensor;
}

// Register a constant which only lives on the host. A placeholder entry is
// appended to `ids` so that the constant gets its own PopTorch tensor id,
// which is returned. The input tensors are unused.
TensorId CompilerImpl::hostSideTensorConstant(
    const std::vector<popart::TensorId> &tensors, HostSideConstant constant) {
  UNUSED(tensors);

  // The new id is the index the placeholder is about to occupy.
  const auto new_id = ids.size();
  _host_side_constants.emplace(std::make_pair(new_id, std::move(constant)));

  // Add a dummy into ids
  ids.emplace_back("__poptorch__host_side_constant");

  return new_id;
}

// Create (and store in _device) the device to compile for / run on.
//
// must_attach: when true any existing device is dropped and the connection
//              type is forced to Always (raises an error if the configured
//              connection type is Never). When false, raises an error if a
//              device was already created.
//
// Depending on the options this creates an IPU model device, an offline
// device (connection type Never) or acquires hardware, either the first
// available device with enough IPUs or the device id selected by the user.
// The number of IPUs required (num_ipus) is computed here from the IPUs used
// by the model and the replication factor.
//
// Returns the created device. Raises an error if the configuration is
// invalid or if no suitable device could be acquired.
std::shared_ptr<popart::DeviceInfo>
CompilerImpl::createDevice(bool must_attach) {
  auto connection_type = options.connection_type;
  if (must_attach) {
    ERROR_ON_MSG(
        connection_type == popart::DeviceConnectionType::Never,
        "[Internal] must_attach incompatible with connection type Never");
    connection_type = popart::DeviceConnectionType::Always;
    _device = nullptr;
  } else {
    ERROR_ON_MSG(_device, "device already created");
  }
  updateUseModelConfig();
  ERROR_ON(used_ipus.empty());

  // Sometimes phased execution doesn't use all of the IPUs in a range, so check
  // the Ids too.
  const auto max_ipu_id = *std::max_element(used_ipus.begin(), used_ipus.end());
  num_ipus = std::max(used_ipus.size(), max_ipu_id + 1) *
             popart_options.replicatedGraphCount;
  ERROR_ON_MSG(num_ipus == 0, "Your compiled model is empty (All the "
                              "operations have been optimised out)");
  assertSingleInstanceMaxNumIPUs(num_ipus);
  if (options.ipu_model) {
    // IPU model path.
    if (popart_options.enableEngineCaching) {
      logging::warn("enableExecutableCaching doesn't work with the IPU model");
    }

    errorOnCycleLogging();

    std::map<std::string, std::string> model_options;
    model_options["numIPUs"] = std::to_string(num_ipus);
    std::string const env_ipu_model_version = getIpuModelVersion();
    model_options["ipuVersion"] = env_ipu_model_version;
    const int num_tiles_per_ipu = getNumTilesPerIpu(env_ipu_model_version);
    model_options["tilesPerIPU"] = std::to_string(num_tiles_per_ipu);

    ERROR_ON_MSG(connection_type == popart::DeviceConnectionType::Never,
                 "ConnectionType.Never / poptorch.Options.useOfflineIpuTarget "
                 "not supported for the IPU model");
    _device = popart::DeviceManager::createDeviceManager().createIpuModelDevice(
        model_options);
    // Acquired HW devices will be attached if the used connection type is
    // Always but createIpuModelDevice() doesn't take a connection type
    // so we manually attach to the device if the connection type is needed.
    if (connection_type == popart::DeviceConnectionType::Always) {
      ERROR_ON_MSG(!_device->attach(), "Internal error: attach can't fail for "
                                       "model devices");
    }
    logging::debug("Instantiated device, running on IPU model with {} tiles.",
                   num_tiles_per_ipu);
  } else {
    if (connection_type == popart::DeviceConnectionType::Never) {
      // Offline compilation path: create an offline device regardless of what's
      // present on the system.
      ERROR_ON_MSG(options_set.count("ipu_id"),
                   "Offline compilation targeting a specific id not supported");
      errorOnCycleLogging();

      // Use the version requested in the options unless they ask for the
      // version of the IPUs present on the system.
      const auto get_ipu_version = [&]() -> std::int64_t {
        if (options.ipu_version == CompilerOptions::use_system_ipu_version) {
          return ipuHardwareVersion();
        }
        return options.ipu_version;
      };

      std::map<std::string, std::string> device_options;
      device_options["numIPUs"] = std::to_string(num_ipus);
      device_options["ipuVersion"] = "ipu" + std::to_string(get_ipu_version());
      device_options["syncPattern"] =
          popart::syncPatternToString(options.sync_pattern);
      _device =
          popart::DeviceManager::createDeviceManager().createOfflineIPUDevice(
              device_options);
      ERROR_ON_MSG(!_device, "Failed to create offline IPU device");
    } else {
      // Round up number of ipus to a power of 2.
      const auto rounded_num_ipus = roundUpNumIPUs(num_ipus);

      if (rounded_num_ipus != num_ipus) {
        std::string const common_msg(
            ", because PopTorch must reserve a power of 2 or"
            " maximum of 64 IPUs per process");
        if (options.auto_round_num_ipus) {
          logging::warn("Reserving {} IPUs when the model specifices the use "
                        "of only {}{}. {} will be reserved but not used.",
                        rounded_num_ipus, num_ipus, common_msg,
                        rounded_num_ipus - num_ipus);
          num_ipus = rounded_num_ipus;
        } else {
          ERROR("The model specifies the use of "
                << num_ipus
                << " IPUs, "
                   "however PopTorch must reserve a minimum of "
                << rounded_num_ipus << " in order to allow the model to run"
                << common_msg
                << ". Please reconfigure your model to use a "
                   "different number of IPUs or set "
                   "poptorch.Options().autoRoundNumIPUs(True).");
        }
      }
      // Check again: num_ipus may have been rounded up above.
      assertSingleInstanceMaxNumIPUs(num_ipus);
      // Keep trying to acquire a device for as long as none was obtained and
      // waitForAWhile() returns true. If the acquisition fails and
      // waitIfUnavailable() returns false, an error is raised immediately.
      do {
        // Regular IPU hardware target
        if (options_set.count("ipu_id") == 0u) {
          // No specific device requested: take any available device with the
          // right number of IPUs.
          _device = popart::DeviceManager::createDeviceManager()
                        .tryAcquireAvailableDevice(
                            num_ipus, 0, options.sync_pattern, connection_type);
          ERROR_ON_MSG(!_device && !waitIfUnavailable(),
                       "Failed to acquire " << num_ipus << " IPU(s)"
                                            << this->checkSystemConfig());
          if (_device) {
            logging::debug("Acquired {} IPU(s): running on device Id {}.",
                           num_ipus, _device->getId());
          }
        } else {
          // The user selected a device id: acquire it, then check it has
          // enough IPUs (error) and warn if it has more than needed.
          _device =
              popart::DeviceManager::createDeviceManager().tryAcquireDeviceById(
                  options.ipu_id, options.sync_pattern, connection_type);
          ERROR_ON_MSG(!_device && !waitIfUnavailable(),
                       "Failed to acquire device Id " << options.ipu_id
                                                      << checkSystemConfig());
          ERROR_ON_MSG(_device && static_cast<std::uint64_t>(
                                      _device->getNumIpus()) < num_ipus,
                       "Expected at least replication factor * used IPUs = "
                           << used_ipus.size() << " * "
                           << popart_options.replicatedGraphCount << " = "
                           << num_ipus << " device Ids but the user provided "
                           << _device->getNumIpus());
          if (_device &&
              static_cast<std::uint64_t>(_device->getNumIpus()) != num_ipus) {
            logging::warn(
                "Expected replication factor * used IPUs = {} * {} "
                "= {} device Ids but the device selected has {} IPUs which "
                "means some of them will not be used.",
                used_ipus.size(), popart_options.replicatedGraphCount, num_ipus,
                _device->getNumIpus());
          }
          if (_device) {
            logging::debug("Acquired IPU device with id {}, running on device.",
                           options.ipu_id);
          }
        }
      } while (!_device && waitForAWhile());
    }
  }
  if (_device->isAttached()) {
    logging::trace("Attached to device {}", _device->getId());
  }
  return _device;
}

// Detach from the device currently held by this compiler.
//
// Does nothing when no IPUs are used. Raises an error if there is no device
// or if the device is not currently attached.
void CompilerImpl::detachFromDevice() {
  if (used_ipus.empty()) {
    return;
  }

  // Validate the device first: the trace message below dereferences it.
  ERROR_ON_MSG(!_device, "Cannot find a valid device");
  logging::trace("Begin detaching device {}", _device->getId());
  ERROR_ON_MSG(!_device->isAttached(), "The device has already been detached");
  _device->detach();
  logging::debug("Detached from device {}", _device->getId());
}

// Report whether the compiler's device is attached.
//
// Always true when no IPUs are used; otherwise raises an error if there is
// no device to query.
bool CompilerImpl::isAttachedToDevice() const {
  if (!used_ipus.empty()) {
    ERROR_ON_MSG(!_device, "Cannot find a valid device");
    return _device->isAttached();
  }

  // We are always attached to at least 0 IPUs.
  return true;
}

template <typename OptimizerType>
void CompilerImpl::updateGroups(OptimizerType *optimizer,
                                const std::vector<Optimizer> &optimizers) {
  // For each optimizer group.
  for (std::size_t idx = 1; idx < optimizers.size(); ++idx) {
    // Index 0 is 'defaults'
    const std::size_t group = idx - 1;
    const OptimizerParameters group_opt{optimizers[idx], false};
    logging::debug(
        "Updating group {} optimizer with {} for (tensors affected {})", group,
        group_opt.debug(), toString(grad_update_groups[group]));
    // For each tensor in the group.
    for (const popart::TensorId &id : grad_update_groups[group]) {
      // Update the optimizer
      optimizer->insertSpecific(id, group_opt.params);
    }
  }
}

std::unique_ptr<popart::Optimizer>
CompilerImpl::getPopartOptimizer(std::vector<Optimizer> optimizers) {
  if (optimizers.empty()) {
    return nullptr;
  }

  // If using the separate tensor variant, glue velocity scaling to loss
  // scaling. When T39344 is completed, there will be no benefit to setting
  // velocity scaling different to loss scaling for the separate tensor case.

  // The first optimizer contains the default values.
  auto &default_value_optimizer(optimizers[0]);

  if (default_value_optimizer.type == OptimizerType::SGD2) {
    copyParam(default_value_optimizer, default_value_optimizer, "lossScaling",
              "velocityScaling");
  }

  // The first optimizer contains the default values.
  const OptimizerParameters opt{optimizers[0], true};

  // Print to debug the new optimizer.
  logging::debug("Updating graph optimizer with {}", opt.debug());

  ERROR_ON_MSG(std::isnan(opt.max_grad_norm),
               "Maximum norm of gradients cannot be NaN");

  std::vector<popart::ClipNormSettings> clipnorms;
  if (opt.max_grad_norm != std::numeric_limits<float>::infinity()) {
    clipnorms.push_back(
        popart::ClipNormSettings::clipAllWeights(opt.max_grad_norm));
  }

  switch (opt.type) {
  case OptimizerType::SGD1: {
    ERROR_ON(!opt.accum_types_provided);
    auto optimizer = std::unique_ptr<popart::SGD>(new popart::SGD(
        opt.params, clipnorms, popart::SGDAccumulatorAndMomentum::Combined,
        popart::DataType::UNDEFINED, popart::DataType::UNDEFINED,
        getDebugContext("SGD")));
    updateGroups(optimizer.get(), optimizers);
    return optimizer;
  }
  case OptimizerType::SGD2: {
    ERROR_ON(!opt.accum_types_provided);

    // Copy loss scaling to velocity scaling for all groups
    for (std::size_t idx = 1; idx < optimizers.size(); ++idx) {
      copyParam(optimizers[idx], default_value_optimizer, "lossScaling",
                "velocityScaling");
    }

    auto optimizer = std::unique_ptr<popart::SGD>(new popart::SGD(
        opt.params, clipnorms, popart::SGDAccumulatorAndMomentum::Separate,
        opt.accum_type, opt.first_order_momentum_accum_type,
        getDebugContext("SGD")));
    updateGroups(optimizer.get(), optimizers);
    return optimizer;
  }
  case OptimizerType::ADAM:
  case OptimizerType::ADAMW:
  case OptimizerType::ADAMW_NO_BIAS:
  case OptimizerType::LAMB:
  case OptimizerType::LAMB_NO_BIAS: {
    auto adam_mode = popart::AdamMode::Adam;
    auto decay_mode = popart::WeightDecayMode::Decay;
    if (opt.type == OptimizerType::ADAM) {
      decay_mode = popart::WeightDecayMode::L2Regularization;
    } else if (opt.type == OptimizerType::ADAMW) {
      adam_mode = popart::AdamMode::Adam;
    } else if (opt.type == OptimizerType::LAMB) {
      adam_mode = popart::AdamMode::Lamb;
    } else if (opt.type == OptimizerType::LAMB_NO_BIAS) {
      adam_mode = popart::AdamMode::LambNoBias;
    }

    // NB WeightDecayMode set to default WeightDecayMode::Decay meaning true
    // weight decay rather than L2
    ERROR_ON(!opt.accum_types_provided);
    auto optimizer = std::make_unique<popart::Adam>(
        opt.params, adam_mode, decay_mode, opt.accum_type,
        opt.first_order_momentum_accum_type,
        opt.second_order_momentum_accum_type, clipnorms, false,
        getDebugContext("Adam"));
    updateGroups(optimizer.get(), optimizers);
    return optimizer;
  }
  case OptimizerType::RMSPROP:
  case OptimizerType::RMSPROP_CENTERED: {
    ERROR_ON(!opt.accum_types_provided);
    popart::AdaptiveMode const mode =
        opt.type == OptimizerType::RMSPROP
            ? popart::AdaptiveMode::RMSProp
            : popart::AdaptiveMode::CenteredRMSProp;
    auto optimizer = std::make_unique<popart::Adaptive>(
        opt.params, mode, popart::WeightDecayMode::L2Regularization,
        opt.accum_type, opt.first_order_momentum_accum_type,
        opt.second_order_momentum_accum_type, popart::DataType::FLOAT,
        opt.use_tf_variant, getDebugContext("Adaptive"));
    updateGroups(optimizer.get(), optimizers);
    return optimizer;
  }
  default:
    ERROR("Unreachable: Unsupported optimizer.");
  }
}

// Add the given tensors and set the in-place preference of both
// "AddLhsInplace" and "AddRhsInplace" to -1 on the result.
// NOTE(review): presumably a negative preference stops PopART from inplacing
// the add, as the function name suggests - confirm against the PopART docs.
popart::TensorId
CompilerImpl::addNotInPlace(const std::vector<popart::TensorId> &in) {
  auto opset = active_builder->aiOnnxOpset11();
  popart::TensorId sum = opset.add(in, getDebugContext("AddNotInPlace"));

  active_builder->setInplacePreferences(
      sum, {{"AddLhsInplace", -1}, {"AddRhsInplace", -1}});
  return sum;
}

// Emit an ONNX randomnormal op producing a tensor of the given shape and
// data type from the `mean` and `scale` parameters. `tensors` is not used.
popart::TensorId
CompilerImpl::randomNormal(const std::vector<popart::TensorId> &tensors,
                           const std::vector<int64_t> &shape, float mean,
                           float scale, const std::string &dtype) {
  UNUSED(tensors);
  auto opset = active_builder->aiOnnxOpset11();

  // The op expects the data type as its ONNX integer code.
  const auto onnx_dtype =
      popart::getONNXDataTypeAsInt(popart::dataTypeFromString(dtype));

  // The remaining optional argument is deliberately left unset.
  return opset.randomnormal(shape, onnx_dtype, mean, scale,
                            nonstd::optional<float>(),
                            getDebugContext("Randomnormal"));
}

// Emit an ONNX randomuniform op producing a tensor of the given shape and
// data type from the `high` and `low` bounds. `tensors` is not used.
popart::TensorId
CompilerImpl::randomUniform(const std::vector<popart::TensorId> &tensors,
                            const std::vector<int64_t> &shape, float high,
                            float low, const std::string &dtype) {
  UNUSED(tensors);
  auto opset = active_builder->aiOnnxOpset11();

  // The op expects the data type as its ONNX integer code.
  const auto onnx_dtype =
      popart::getONNXDataTypeAsInt(popart::dataTypeFromString(dtype));

  // The remaining optional argument is deliberately left unset.
  return opset.randomuniform(shape, onnx_dtype, high, low,
                             nonstd::optional<float>(),
                             getDebugContext("Randomuniform"));
}

// Create a constant tensor of the given shape and type filled with ones.
popart::TensorId
CompilerImpl::ones(const std::vector<popart::TensorId> &tensors,
                   const std::vector<int64_t> &shape,
                   const std::string &dtype) {
  constexpr bool fill_with_zeros = false;
  return zerosOrOnes(tensors, shape, dtype, fill_with_zeros);
}

// Create a constant tensor of the given shape and type filled with zeros.
popart::TensorId
CompilerImpl::zeros(const std::vector<popart::TensorId> &tensors,
                    const std::vector<int64_t> &shape,
                    const std::string &dtype) {
  constexpr bool fill_with_zeros = true;
  return zerosOrOnes(tensors, shape, dtype, fill_with_zeros);
}

// Create a constant tensor of the given shape filled with zeros (when
// `zeros` is true) or ones (when it is false).
//
// Supported values for `dtype` are "INT32", "FLOAT", "FLOAT16" and "BOOL";
// any other value raises an error. `tensors` is forwarded unchanged to
// tensorConstant().
popart::TensorId
CompilerImpl::zerosOrOnes(const std::vector<popart::TensorId> &tensors,
                          const std::vector<int64_t> &shape,
                          const std::string &dtype, bool zeros) {
  // Seed the product with a std::size_t: std::accumulate accumulates in the
  // type of its initial value, so an int seed would truncate the element
  // count of large tensors.
  const std::size_t total_size =
      std::accumulate(shape.begin(), shape.end(), std::size_t{1},
                      std::multiplies<std::size_t>());

  if (dtype == "INT32") {
    std::vector<int32_t> const_buff(total_size, zeros ? 0 : 1);
    const PopartConstant popart_const(PopartType::INT32, const_buff.data(),
                                      shape);
    return tensorConstant(tensors, popart_const);
  }
  if (dtype == "FLOAT") {
    std::vector<float> const_buff(total_size, zeros ? 0 : 1);
    const PopartConstant popart_const(PopartType::FLOAT, const_buff.data(),
                                      shape);
    return tensorConstant(tensors, popart_const);
  }
  if (dtype == "FLOAT16") {
    // Half values are stored as their raw 16-bit representation.
    std::vector<uint16_t> const_buff(total_size,
                                     popart::floatToHalf(zeros ? 0 : 1));
    const PopartConstant popart_const(PopartType::FLOAT16, const_buff.data(),
                                      shape);
    return tensorConstant(tensors, popart_const);
  }
  if (dtype == "BOOL") {
    // std::vector<bool> is bit-packed and has no contiguous bool storage, so
    // wrap the bool in a struct to obtain a real array of bools.
    struct Bool {
      bool b;
    };
    std::vector<Bool> const_buff(total_size, {!zeros});
    const PopartConstant popart_const(PopartType::BOOL, &(const_buff[0].b),
                                      shape);
    return tensorConstant(tensors, popart_const);
  }
  ERROR("Unsupported type " << dtype);
}

// Unfold the single input tensor along `dimension` using windows of `size`
// elements taken every `step` elements.
//
// The work is delegated to poprithms' Unfolder, which drives the static
// helper functions below to emit the required ONNX operations. Raises an
// error if any of `dimension`, `size` or `step` is negative, or if `tensors`
// does not contain exactly one tensor.
popart::TensorId
CompilerImpl::unfold(const std::vector<popart::TensorId> &tensors,
                     int64_t dimension, int64_t size, int64_t step) {
  // Implements the TUnfoldHelper interface in Poprithms using ONNX operations.
  struct PoptorchUnfoldHelper {
    // The "tensor" type manipulated by the Unfolder: a PopART tensor id plus
    // the objects needed to emit new operations.
    struct InternalState {
      CompilerImpl *parent;
      popart::Builder *builder;
      popart::TensorId tensor;

      // Create a scalar (empty shape) INT64 constant holding `val`.
      popart::TensorId scalarConstI64(int64_t val) const {
        const PopartConstant val_const(PopartType::INT64, &val, {});
        return parent->tensorConstant({}, val_const);
      }

      // Create a 1D INT64 constant containing the values of `shape`.
      popart::TensorId shapeAsTensor(const std::vector<uint64_t> &shape) const {
        std::vector<int64_t> new_shape(shape.begin(), shape.end());
        const PopartConstant shape_const(
            PopartType::INT64, new_shape.data(),
            {static_cast<int64_t>(new_shape.size())});
        return parent->tensorConstant({}, shape_const);
      }

      // Return a copy of this state referring to `new_id`, after applying
      // the execution strategy attributes to that tensor.
      InternalState transform(popart::TensorId &&new_id) const {
        InternalState new_state(*this);
        new_state.tensor = std::move(new_id);
        parent->setExecutionStrategyAttributes({new_state.tensor});
        return new_state;
      }
    };

    // Slice the tensor between `start` and `end` along dimension `dim`.
    static InternalState slice(const InternalState &state, uint64_t dim,
                               uint64_t start, uint64_t end) {
      auto dims = state.scalarConstI64(dim);
      auto starts = state.scalarConstI64(start);
      auto ends = state.scalarConstI64(end);

      return state.transform(state.builder->aiOnnxOpset11().slice(
          {state.tensor, starts, ends, dims}));
    }

    // Expand dimension `dim` (which must be of size 1) to size `N`.
    static InternalState broadcast(const InternalState &state, uint64_t N,
                                   uint64_t dim) {
      auto new_shape = shape(state);
      ERROR_ON(new_shape[dim] != 1);
      new_shape[dim] *= N;
      auto shape_tensor = state.shapeAsTensor(new_shape);

      return state.transform(
          state.builder->aiOnnxOpset11().expand({state.tensor, shape_tensor}));
    }

    // Reshape the tensor to `shape`.
    static InternalState reshape(const InternalState &state,
                                 const std::vector<uint64_t> &shape) {
      auto shape_tensor = state.shapeAsTensor(shape);

      return state.transform(
          state.builder->aiOnnxOpset11().reshape({state.tensor, shape_tensor}));
    }

    // Concatenate the tensors of all `states` along `axis`. The builder and
    // parent of the first state are used for the result.
    static InternalState concat(const std::vector<InternalState> &states,
                                uint64_t axis) {
      ERROR_ON(states.empty());

      std::vector<popart::TensorId> tensor_ids;
      tensor_ids.reserve(states.size());
      for (const auto &tensor : states) {
        tensor_ids.push_back(tensor.tensor);
      }

      const auto &first = states.front();
      return first.transform(first.builder->aiOnnxOpset11().concat(
          tensor_ids, static_cast<int64_t>(axis)));
    }

    // Transpose the tensor using `permutation`.
    static InternalState dimShuffle(const InternalState &state,
                                    const std::vector<uint64_t> &permutation) {
      const std::vector<int64_t> permutation_ints(permutation.begin(),
                                                  permutation.end());
      // The permutation is passed through the builder's "perm" attribute,
      // which is set only for the duration of the transpose.
      state.builder->setAttribute("perm", permutation_ints);
      auto new_tensor = state.transform(
          state.builder->aiOnnxOpset11().transpose({state.tensor}));
      state.builder->clearAttribute("perm");
      return new_tensor;
    }

    // Size of the tensor along `axis`.
    static uint64_t dim(const InternalState &state, uint64_t axis) {
      return state.builder->getTensorShape(state.tensor)[axis];
    }

    // Number of dimensions of the tensor.
    static uint64_t rank_u64(const InternalState &state) { // NOLINT
      return static_cast<uint64_t>(
          state.builder->getTensorShape(state.tensor).size());
    }

    // Shape of the tensor as unsigned values.
    static std::vector<uint64_t> shape(const InternalState &state) {
      const auto &&my_shape = state.builder->getTensorShape(state.tensor);
      return std::vector<uint64_t>(my_shape.begin(), my_shape.end());
    }
  };

  // The Unfolder works on unsigned values, so reject negative arguments.
  ERROR_ON(dimension < 0);
  ERROR_ON(size < 0);
  ERROR_ON(step < 0);
  ERROR_ON(tensors.size() != 1);
  const auto &first = tensors.front();

  using T = PoptorchUnfoldHelper::InternalState;
  using H = PoptorchUnfoldHelper;

  // Only the id of the resulting tensor is of interest to the caller.
  return poprithms::ndarray::Unfolder<T, H>::unfold(
             {this, active_builder, first}, dimension, size, step)
      .tensor;
}

// Emit a PRelu op for tensors = {input, weight}.
//
// When the input has more than one dimension more than the weight, the
// weight entry of `tensors` is replaced in place by an unsqueezed version
// before the op is created.
popart::TensorId CompilerImpl::prelu(std::vector<popart::TensorId> &tensors) {
  popart::TensorId &weight = tensors[1];

  const std::size_t input_rank =
      active_builder->getTensorShape(tensors[0]).size();
  const std::size_t weight_rank = active_builder->getTensorShape(weight).size();

  if (input_rank > weight_rank + 1) {
    // PyTorch's implementation adds some extra singleton dimensions to the
    // weight to ensure it is 'unidirectionally broadcastable' with the input.
    // The new axes are appended after the weight's existing dimensions.
    const std::size_t num_new_axes = input_rank - weight_rank - 1;
    std::vector<std::int64_t> unsqueeze_axes;
    unsqueeze_axes.reserve(num_new_axes);
    for (std::size_t offset = 0; offset < num_new_axes; ++offset) {
      unsqueeze_axes.push_back(static_cast<std::int64_t>(weight_rank + offset));
    }

    weight = active_builder->aiOnnxOpset11().unsqueeze(
        {weight}, unsqueeze_axes, getDebugContext("Unsqueeze"));
    setExecutionStrategyAttributes({weight});
  }

  return active_builder->aiOnnxOpset11().prelu(tensors,
                                               getDebugContext("Prelu"));
}

// Return the host side constant registered for `id`. The lookup uses at(),
// so an unknown id is reported by the container rather than silently
// creating an entry.
const HostSideConstant &CompilerImpl::getHostSideConstant(TensorId id) const {
  const HostSideConstant &constant = _host_side_constants.at(id);
  return constant;
}

// Tell whether a host side constant has been registered for `id`.
bool CompilerImpl::isHostSideConstant(TensorId id) const {
  return _host_side_constants.find(id) != _host_side_constants.end();
}

// Queue one convolution in the pending multi-convolution. The builder is
// created on demand by the first convolution added.
void CompilerImpl::addMultiConvPart(
    const std::vector<popart::TensorId> &tensors,
    const std::vector<int64_t> &dilations,
    const std::vector<int64_t> &kernel_shape, const std::vector<int64_t> &pads,
    const std::vector<int64_t> &strides) {
  if (!multi_conv_builder) {
    multi_conv_builder = std::make_unique<MultiConvBuilder>();
  }

  multi_conv_builder->addConv(tensors, dilations, kernel_shape, pads, strides);
}

// Set `key` to `value` for `attribute` and push the updated set of key /
// value pairs to the active builder.
void CompilerImpl::setAttribute(const std::string &attribute,
                                const std::string &key,
                                const std::string &value) {
  auto &key_values = _attribute_key_value_map[attribute];
  key_values[key] = value;

  // The builder receives the pairs flattened as
  // [key0, value0, key1, value1, ...].
  std::vector<std::string> flattened;
  for (auto &entry : key_values) {
    flattened.push_back(entry.first);
    flattened.push_back(entry.second);
  }
  active_builder->setAttribute(attribute, flattened);
}

// Remove `key` from `attribute` and update the active builder accordingly:
// the attribute is cleared, then set again if it still has keys left.
// Raises an error if `key` is not currently set for `attribute`.
void CompilerImpl::clearAttribute(const std::string &attribute,
                                  const std::string &key) {
  auto &key_values = _attribute_key_value_map[attribute];
  ERROR_ON_MSG(key_values.erase(key) == 0, "Unknown key '"
                                               << key << "' for attribute '"
                                               << attribute << "'.");
  active_builder->clearAttribute(attribute);

  if (!key_values.empty()) {
    // Some keys remain: publish them again, flattened as
    // [key0, value0, key1, value1, ...].
    std::vector<std::string> flattened;
    for (auto &entry : key_values) {
      flattened.push_back(entry.first);
      flattened.push_back(entry.second);
    }
    active_builder->setAttribute(attribute, flattened);
    return;
  }

  // That was the last key: forget about the attribute altogether.
  ERROR_ON_MSG(_attribute_key_value_map.erase(attribute) == 0,
               "Unknown attribute '" << attribute << "'.");
}

// Create a debug context for the op called `name`, annotated with the
// current torch node, the op type and the op name prefixed by the current
// name scope.
popart::DebugContext CompilerImpl::getDebugContext(const std::string &name) {
  std::string const scoped_name = op_builder->getNameScope() + name;

  popart::DebugContext const context(name, code_location);
  popart::DebugInfo debug_info(context, "poptorch");
  debug_info.setValue("torch_schema", torch_node);
  debug_info.setValue("op_type", name);
  debug_info.setValue("op_name", scoped_name);
  return {debug_info};
}

// Build the pending multi-convolution, discard its builder and return the
// ids of the tensors it produced. Raises an error if no multi-convolution
// is in progress.
std::vector<popart::TensorId> CompilerImpl::endMultiConv() {
  ERROR_ON_MSG(!multi_conv_builder, "Unexpected end_multi_conv.");

  auto outputs = multi_conv_builder->build(active_builder);
  // Drop the builder so that the next multi-convolution starts afresh.
  multi_conv_builder.reset();
  return outputs;
}

bool CompilerImpl::waitIfUnavailable() const {
  // Force disable the wait if the system doesn't contain an IPU that
  // matches the requested config.
  static const bool should_wait =
      waitIfIpuIsUnavailable() && checkSystemConfig().empty();
  return should_wait;
}

// Attach to a device and reload the engine on it.
//
// Does nothing when no IPUs are used. Raises an error if we are already
// attached, or if no attached device could be obtained.
void CompilerImpl::attachToDevice() {
  // We are always attached to at least 0 IPUs.
  if (!used_ipus.empty()) {
    ERROR_ON_MSG(_device->isAttached(), "Already attached to a device");

    // TODO(T21799): PopART onDemand connection will only try to connect to
    // the first device matching the requested config which means if several
    // tests only need 1 IPU, they will all wait on IPU 0.
    // As a workaround we request a new device from PopART and swap the device
    // in the live session.
    auto &session_device = session->getDevice();
    session_device.setDeviceInfo(createDevice(/*must_attach*/ true));

    ERROR_ON_MSG(!_device, "Cannot find a valid device");
    ERROR_ON_MSG(!_device->isAttached(), "Still not attached to a device");
    session->getDevice().loadEngineAndConnectStreams();
  }
}

// Return the PopART IR of the session serialized as JSON, or a message
// explaining why it is unavailable (no IPUs used, or the executable was
// deserialized).
std::string CompilerImpl::getPopartIR() const {
  if (used_ipus.empty()) {
    return "unavailable (No IPUs used)";
  }

  const bool is_cached_executable = session->getExecutable().isDeserialized();
  if (is_cached_executable) {
    return "unavailable (Cached executable)";
  }

  return session->serializeIr(popart::IrSerializationFormat::JSON);
}

// Return the ids of all the tensors known to the session.
std::set<popart::TensorId> CompilerImpl::getTensorNames() const {
  std::set<popart::TensorId> tensor_ids = session->getAllTensorIds();
  return tensor_ids;
}

// Return the PopART type of the tensor `id`.
//
// Host side constants report their own type. Other tensors are looked up in
// the session if there is one, in the active builder otherwise;
// PopartType::UNDEFINED is returned when no information is available.
PopartType CompilerImpl::getPopartType(TensorId id) const {
  if (isHostSideConstant(id)) {
    return getHostSideConstant(id).popartType();
  }

  const auto popart_id = ids[id];
  popart::DataType dtype;
  if (session) {
    if (!session->hasInfo(popart_id)) {
      return PopartType::UNDEFINED;
    }
    dtype = session->getInfo(popart_id).dataType();
  } else {
    // No session yet: query the builder instead.
    if (!active_builder->hasValueInfo(popart_id)) {
      return PopartType::UNDEFINED;
    }
    dtype = active_builder->getTensorDataType(popart_id);
  }

  // Translate the PopART data type to the PopartType of the same name.
#define DEFINE_CASE(value)                                                     \
  case popart::DataType::value: {                                              \
    return PopartType::value;                                                  \
  }

  switch (dtype) { FOR_ALL_POPART_TYPES(DEFINE_CASE) }
#undef DEFINE_CASE

  ERROR("Unsupported popart type in return: " << dtype);
}

void CompilerImpl::cachePopartTypes() {
  for (size_t idx = 1; idx < ids.size(); idx++) {
    ids_types.push_back(getPopartType(idx));
  }
}

// Raise an error if the hardware cycle counter instrumentation is enabled.
void CompilerImpl::errorOnCycleLogging() const {
  const bool cycle_logging_enabled =
      popart_options.instrumentWithHardwareCycleCounter;
  ERROR_ON_MSG(cycle_logging_enabled,
               "Cycle count logging is only supported on actual IPU hardware.");
}

} // namespace detail
} // namespace popart_compiler
} // namespace poptorch


================================================
FILE: popart_compiler/source/SessionOptions.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <chrono>
#include <fstream>
#include <iostream>
#include <list>
#include <map>
#include <memory>
#include <stack>
#include <string>
#include <thread>
#include <unordered_map>
#include <utility>
#include <vector>

#include <popart/adam.hpp>
#include <popart/adaptive.hpp>
#include <popart/builder.hpp>
#include <popart/error.hpp>
#include <popart/graphtransformer.hpp>
#include <popart/half.hpp>
#include <popart/ir.hpp>
#include <popart/ndarraywrapper.hpp>
#include <popart/op/convbase.hpp>
#include <popart/op/identity.hpp>
#include <popart/op/matmul.hpp>
#include <popart/op/nll.hpp>
#include <popart/optimizer.hpp>
#include <popart/popx/devicex.hpp>
#include <popart/popx/devicexmanager.hpp>
#include <popart/session.hpp>
#include <popart/tensorinfo.hpp>
#include <popart/tensors.hpp>

#include "popart_compiler/Compiler.hpp"
#include "popart_compiler/CompilerOptions.hpp"
#include "popart_compiler/PopartEnums.hpp"
#include "popart_compiler/SessionOptionsImpl.hpp"
#include "popart_compiler/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace {

// To avoid code duplication we use the same std::pair<string,string> inserter
// to add values to map, vector and set containers, but in practice only maps
// actually take a pair of values (the others take a single element). So, for
// containers taking only a single value, this magic string should be passed as
// the second element of the pair.
const std::string value_not_set = "__poptorch_value_not_set__";

// Functor wrapping the setter of an option: it forwards the value to the
// wrapped function and prints to the debug channel the value of the option
// set by poptorch.Options.
template <typename Value> class Setter {
public:
  // setter: function actually storing the value.
  // option_name: name of the option, used in the debug message.
  Setter(std::function<void(Value)> setter, std::string option_name)
      : _fn(std::move(setter)), _name(std::move(option_name)) {}

  void operator()(Value value);

private:
  std::function<void(Value)> _fn;
  const std::string _name;
};

// Specialisation for container options: store the value, then log it either
// as an element added to the container (second element of the pair is
// value_not_set) or as a key / value assignment.
template <>
void Setter<std::pair<std::string, std::string>>::operator()(
    std::pair<std::string, std::string> value) { // NOLINT
  _fn(value);

  const bool is_key_value = value.second != value_not_set;
  if (is_key_value) {
    poptorch::logging::debug("poptorch.Options set {}[{}] to {}", _name,
                             value.first, value.second);
    return;
  }
  poptorch::logging::debug("poptorch.Options added {} to {}", value.first,
                           _name);
}

// Generic case: store the value using the wrapped function, then log it.
template <typename Value> void Setter<Value>::operator()(Value value) {
  _fn(value);
  poptorch::logging::debug("poptorch.Options set {} to value {}", _name,
                           value);
}

template <typename Value, typename Lambda>
void registerSetter(std::map<std::string, std::function<void(Value)>> &options,
                    const std::string &name, Lambda setter) {
  std::function<void(Value)> fn = setter;
  options[name] = Setter<Value>(fn, name);
}

} // namespace

namespace poptorch {
namespace popart_compiler {
namespace detail {

SessionOptionsImpl::SessionOptionsImpl() {
  // The keys must match the name and type of the attributes of SessionOptions
  // in python/__init__.py

  registerSetter(bool_options, "auto_round_num_ipus", [&](bool value) {
    poptorch_options.auto_round_num_ipus = value;
  });

  registerSetter(bool_options, "use_model",
                 [&](bool value) { poptorch_options.ipu_model = value; });

  registerSetter(bool_options, "serial_phases_execution", [&](bool value) {
    poptorch_options.serial_phases_execution = value;
  });
  registerSetter(bool_options, "separate_backward_phase", [&](bool value) {
    poptorch_options.separate_backward_phase = value;
  });
  registerSetter(bool_options, "broadcast_buffers", [&](bool value) {
    poptorch_options.broadcast_buffers = value;
  });
  registerSetter(bool_options, "enableExplicitIR",
                 [&](bool enable) { popart_options.enableExplicitIR(enable); });
  registerSetter(uint64_options, "device_iterations",
                 [&](std::uint64_t value) { poptorch_options.steps = value; });
  registerSetter(uint64_options, "num_distributed_processes",
                 [&](std::uint64_t value) {
                   poptorch_options.num_distributed_processes = value;
                 });
  registerSetter(uint64_options, "distributed_process_id",
                 [&](std::uint64_t value) {
                   poptorch_options.distributed_process_id = value;
                 });
  registerSetter(uint64_options, "ipu_version", [&](std::uint64_t value) {
    poptorch_options.ipu_version = value;
  });
  registerSetter(uint64_options, "ipu_id",
                 [&](std::uint64_t value) { poptorch_options.ipu_id = value; });
  registerSetter(
      uint64_options, "gradient_accumulation",
      [&](std::uint64_t value) { popart_options.accumulationFactor = value; });
  registerSetter(uint64_options, "output_return_period",
                 [&](std::uint64_t value) {
                   poptorch_options.output_return_period = value;
                 });
  registerSetter(uint64_options, "replication_factor",
                 [&](std::uint64_t value) {
                   popart_options.replicatedGraphCount = value;
                 });
  registerSetter(uint64_options, "input_group_size", [&](std::uint64_t value) {
    poptorch_options.input_group_size = static_cast<std::int64_t>(value);
  });
  registerSetter(uint64_options, "input_cgt", [&](std::uint64_t value) {
    poptorch_options.input_cgt = static_cast<popart::CommGroupType>(value);
  });
  registerSetter(uint64_options, "execution_mode", [&](std::uint64_t value) {
    ERROR_ON_MSG(value >= static_cast<std::uint64_t>(ExecutionMode::N),
                 "Value for ExecutionMode out of range");
    poptorch_options.execution_mode = static_cast<ExecutionMode>(value);
  });
  registerSetter(uint64_options, "tensors_liveness", [&](std::uint64_t value) {
    ERROR_ON_MSG(value >= static_cast<std::uint64_t>(Liveness::N),
                 "Value for Liveness out of range");
    poptorch_options.tensors_liveness = static_cast<Liveness>(value);
  });
  registerSetter(uint64_options, "output_mode", [&](std::uint64_t value) {
    ERROR_ON_MSG(value >= static_cast<std::uint64_t>(PopartOutputMode::N),
                 "Value for PopartOutputMode out of range");
    poptorch_options.output_mode = static_cast<PopartOutputMode>(value);
  });

  registerSetter(uint64_options, "connection_type", [&](std::uint64_t value) {
    ERROR_ON_MSG(
        value > static_cast<std::uint64_t>(popart::DeviceConnectionType::Never),
        "Value for DeviceConnectionType out of range");
    poptorch_options.connection_type =
        static_cast<popart::DeviceConnectionType>(value);
  });

  registerSetter(
      uint64_options, "accumulateOuterFragmentSettings.schedule",
      [&](std::uint64_t value) {
        ERROR_ON_MSG(
            value > static_cast<std::uint64_t>(
                        popart::AccumulateOuterFragmentSchedule::
                            OverlapMemoryOptimized),
            "Value for popart::AccumulateOuterFragmentSchedule out of range");
        popart_options.accumulateOuterFragmentSettings.schedule =
            static_cast<popart::AccumulateOuterFragmentSchedule>(value);
      });
  registerSetter(uint64_options, "max_repeat_logs",
                 [&](std::uint64_t value) { logging::setRepeatLimit(value); });
  registerSetter(container_options,
                 "accumulateOuterFragmentSettings.excludedVirtualGraphs",
                 [&](const std::pair<std::string, std::string> &p) {
                   std::int64_t value = std::stoi(p.first);
                   popart_options.accumulateOuterFragmentSettings
                       .excludedVirtualGraphs.push_back(value);
                 });

  registerSetter(uint64_options, "accumulation_and_replication_reduction_type",
                 [&](std::uint64_t value) {
                   ERROR_ON_MSG(value > static_cast<std::uint64_t>(
                                            popart::ReductionType::NoReduction),
                                "Value for popart::ReductionType out of range");
                   popart_options.accumulationAndReplicationReductionType =
                       static_cast<popart::ReductionType>(value);
                 });

  registerSetter(uint64_options, "sync_pattern", [&](std::uint64_t value) {
    ERROR_ON_MSG(value > static_cast<std::uint64_t>(
                             popart::SyncPattern::ReplicaAndLadder),
                 "Value for SyncPattern out of range");
    poptorch_options.sync_pattern = static_cast<popart::SyncPattern>(value);
  });

  registerSetter(uint64_options, "random_seed", [&](std::uint64_t value) {
    poptorch_options.random_seed = value;
  });
  registerSetter(string_options, "log_dir", [&](const std::string &value) {
    popart_options.logDir = value;
  });

  registerSetter(string_options, "saveInitializersToFile",
                 [&](const std::string &value) {
                   poptorch_options.external_initializers_file = value;
                 });

  string_options["logDir"] = [&](const std::string &log_dir) {
    UNUSED(log_dir);
    logging::warn(
        "Ignoring call to poptorch.Options._Popart.set(\"logDir\",...): use "
        "poptorch.Options.logDir() instead");
  };

  registerSetter(string_options, "model_name", [&](const std::string &value) {
    poptorch_options.model_name = value;
  });

  registerSetter(container_options, "dotChecks",
                 [&](const std::pair<std::string, std::string> &p) {
                   popart_options.dotChecks.insert(p.first);
                 });

  registerSetter(container_options, "hardwareInstrumentations",
                 [&](const std::pair<std::string, std::string> &p) {
                   std::uint64_t value = std::stoul(p.first);
                   ERROR_ON_MSG(value >= static_cast<std::uint64_t>(
                                             popart::Instrumentation::N),
                                "Value for Instrumentation out of range");
                   // clang-format off
                   popart_options.hardwareInstrumentations.insert(
                       static_cast<popart::Instrumentation>(value));
                   // clang-format on
                 });

  registerSetter(container_options, "customCodelets",
                 [&](const std::pair<std::string, std::string> &p) {
                   popart_options.customCodelets.push_back(p.first);
                 });

  registerSetter(container_options, "engineOptions",
                 [&](const std::pair<std::string, std::string> &p) {
                   popart_options.engineOptions.emplace(p);
                 });

  registerSetter(container_options, "reportOptions",
                 [&](const std::pair<std::string, std::string> &p) {
                   popart_options.reportOptions.emplace(p);
                 });

  registerSetter(container_options, "convolutionOptions",
                 [&](const std::pair<std::string, std::string> &p) {
                   popart_options.convolutionOptions.emplace(p);
                 });

  registerSetter(container_options, "matmulOptions",
                 [&](const std::pair<std::string, std::string> &p) {
                   popart_options.matmulOptions.emplace(p);
                 });

  registerSetter(container_options, "lstmOptions",
                 [&](const std::pair<std::string, std::string> &p) {
                   popart_options.lstmOptions.emplace(p);
                 });

  registerSetter(container_options, "gclOptions",
                 [&](const std::pair<std::string, std::string> &p) {
                   popart_options.gclOptions.emplace(p);
                 });

  registerSetter(container_options, "updatableNamedBuffers",
                 [&](const std::pair<std::string, std::string> &p) {
                   popart_options.updatableNamedBuffers.push_back(p.first);
                 });

#define ADD_POPART_ENUM_OPTION(name, EnumType)                                 \
  registerSetter(uint64_options, #name, [&](std::uint64_t value) {             \
    ERROR_ON_MSG(value >= static_cast<std::uint64_t>(popart::EnumType::N),     \
                 "Value for " << #EnumType << " out of range");                \
    popart_options.name = static_cast<popart::EnumType>(value);                \
  })

#define ADD_POPART_BOOL_OPTION(name)                                           \
  registerSetter(bool_options, #name,                                          \
                 [&](bool value) { popart_options.name = value; })

#define ADD_POPART_UINT64_OPTION(name)                                         \
  registerSetter(uint64_options, #name,                                        \
                 [&](std::uint64_t value) { popart_options.name = value; })

#define ADD_POPART_DOUBLE_OPTION(name)                                         \
  registerSetter(double_options, #name,                                        \
                 [&](double value) { popart_options.name = value; })

#define ADD_POPART_STRING_OPTION(name)                                         \
  registerSetter(string_options, #name, [&](const std::string &value) {        \
    popart_options.name = value;                                               \
  })

  ADD_POPART_ENUM_OPTION(autodiffSettings.stitchStrategy,
                         AutodiffStitchStrategy);

  ADD_POPART_ENUM_OPTION(batchSerializationSettings.transformContext,
                         BatchSerializationTransformContext);
  ADD_POPART_ENUM_OPTION(batchSerializationSettings.method,
                         BatchSerializationMethod);
  ADD_POPART_ENUM_OPTION(batchSerializationSettings.batchSchedule,
                         BatchSerializationBatchSchedule);
  ADD_POPART_ENUM_OPTION(autoRecomputation, RecomputationType);
  ADD_POPART_ENUM_OPTION(mergeVarUpdate, MergeVarUpdateType);
  ADD_POPART_ENUM_OPTION(virtualGraphMode, VirtualGraphMode);
  ADD_POPART_ENUM_OPTION(syntheticDataMode, SyntheticDataMode);
  ADD_POPART_ENUM_OPTION(subgraphCopyingStrategy, SubgraphCopyingStrategy);
  ADD_POPART_ENUM_OPTION(accumulationAndReplicationReductionType,
                         ReductionType);
  ADD_POPART_ENUM_OPTION(meanAccumulationAndReplicationReductionStrategy,
                         MeanReductionStrategy);
  ADD_POPART_ENUM_OPTION(
      automaticLossScalingSettings.gradientTensorTrackingMethod,
      GradientTensorTrackingMethod);

  ADD_POPART_STRING_OPTION(logDir);
  ADD_POPART_STRING_OPTION(cachePath);
  ADD_POPART_STRING_OPTION(partialsTypeMatMuls);
  ADD_POPART_STRING_OPTION(customCodeletCompileFlags);
  ADD_POPART_STRING_OPTION(serializedPoprithmsShiftGraphsDir);
  ADD_POPART_STRING_OPTION(kahnTieBreaker);

  ADD_POPART_UINT64_OPTION(executionPhaseSettings.phases);
  ADD_POPART_UINT64_OPTION(executionPhaseSettings.stages);
  ADD_POPART_UINT64_OPTION(batchSerializationSettings.factor);
  ADD_POPART_UINT64_OPTION(firstDotOp);
  ADD_POPART_UINT64_OPTION(finalDotOp);
  ADD_POPART_UINT64_OPTION(numIOTiles);
  ADD_POPART_UINT64_OPTION(mergeVarUpdateMemThreshold);
  ADD_POPART_UINT64_OPTION(looseThresholdAtPeak);
  ADD_POPART_UINT64_OPTION(accumulationFactor);
  ADD_POPART_UINT64_OPTION(swapLimitScheduler);
  ADD_POPART_UINT64_OPTION(globalReplicationFactor);
  ADD_POPART_UINT64_OPTION(globalReplicaOffset);
  ADD_POPART_UINT64_OPTION(defaultBufferingDepth);
  ADD_POPART_UINT64_OPTION(defaultPrefetchBufferingDepth);
  ADD_POPART_UINT64_OPTION(compilationProgressTotal);
  ADD_POPART_UINT64_OPTION(transitiveClosureOptimizationThreshold);
  ADD_POPART_UINT64_OPTION(automaticLossScalingSettings.updatePeriod);

  ADD_POPART_BOOL_OPTION(enableInplaceAmbiguityChecking);
  ADD_POPART_BOOL_OPTION(enableLoadAndOffloadRNGState);
  ADD_POPART_BOOL_OPTION(batchSerializationSettings.concatOnVirtualGraphChange);
  ADD_POPART_BOOL_OPTION(
      batchSerializationSettings.concatOnExecutionPhaseChange);
  ADD_POPART_BOOL_OPTION(
      batchSerializationSettings.concatOnPipelineStageChange);
  ADD_POPART_BOOL_OPTION(strictOpVersions);
  ADD_POPART_BOOL_OPTION(opxAliasChecking);
  ADD_POPART_BOOL_OPTION(opxModifyChecking);
  ADD_POPART_BOOL_OPTION(dotOpNames);
  ADD_POPART_BOOL_OPTION(exportPoplarComputationGraph);
  ADD_POPART_BOOL_OPTION(exportPoplarVertexGraph);
  ADD_POPART_BOOL_OPTION(separateCallOpPdfs);
  ADD_POPART_BOOL_OPTION(enableOutlining);
  ADD_POPART_BOOL_OPTION(enableOutliningCopyCostPruning);
  ADD_POPART_BOOL_OPTION(rearrangeAnchorsOnHost);
  ADD_POPART_BOOL_OPTION(rearrangeStreamsOnHost);
  ADD_POPART_BOOL_OPTION(enablePrefetchDatastreams);
  ADD_POPART_BOOL_OPTION(enableNonStableSoftmax);
  ADD_POPART_BOOL_OPTION(enableReplicatedGraphs);
  ADD_POPART_BOOL_OPTION(enableGradientAccumulation);
  ADD_POPART_BOOL_OPTION(instrumentWithHardwareCycleCounter);
  ADD_POPART_BOOL_OPTION(enablePipelining);
  ADD_POPART_BOOL_OPTION(disableGradAccumulationTensorStreams);
  ADD_POPART_BOOL_OPTION(disableOptimizerStateTensorStreams);
  ADD_POPART_BOOL_OPTION(compileEngine);
  ADD_POPART_BOOL_OPTION(constantWeights);
  ADD_POPART_BOOL_OPTION(enableEngineCaching);
  ADD_POPART_BOOL_OPTION(enableMergeExchange);
  ADD_POPART_BOOL_OPTION(enableFloatingPointChecks);
  ADD_POPART_BOOL_OPTION(enableStochasticRounding);
  ADD_POPART_BOOL_OPTION(ensureFp32LossScaleTensor);
  ADD_POPART_BOOL_OPTION(explicitRecomputation);
  ADD_POPART_BOOL_OPTION(enableExplicitMainLoops);
  ADD_POPART_BOOL_OPTION(useHostCopyOps);
  ADD_POPART_BOOL_OPTION(aliasZeroCopy);
  ADD_POPART_BOOL_OPTION(delayVarUpdates);
  ADD_POPART_BOOL_OPTION(enableFullyConnectedPass);
  ADD_POPART_BOOL_OPTION(enableSerializedMatmuls);
  ADD_POPART_BOOL_OPTION(enableStableNorm);
  ADD_POPART_BOOL_OPTION(decomposeGradSum);
  ADD_POPART_BOOL_OPTION(enableDistributedReplicatedGraphs);
  ADD_POPART_BOOL_OPTION(groupHostSync);
  ADD_POPART_BOOL_OPTION(automaticLossScalingSettings.enabled);
  ADD_POPART_BOOL_OPTION(instrumentWithHardwareCycleCounter);
  ADD_POPART_BOOL_OPTION(enableSupportedDataTypeCasting);
  ADD_POPART_BOOL_OPTION(groupNormStridedChannelGrouping);
  ADD_POPART_BOOL_OPTION(scheduleNonWeightUpdateGradientConsumersEarly);
  ADD_POPART_BOOL_OPTION(
      replicatedCollectivesSettings.prepareScheduleForMergingCollectives);
  ADD_POPART_BOOL_OPTION(
      replicatedCollectivesSettings.mergeAllReduceCollectives);

  ADD_POPART_DOUBLE_OPTION(outlineSequenceBreakCost);
  ADD_POPART_DOUBLE_OPTION(outlineThreshold);
  ADD_POPART_DOUBLE_OPTION(timeLimitScheduler);
  ADD_POPART_DOUBLE_OPTION(
      automaticLossScalingSettings.thresholdUpperCountProportion);
  ADD_POPART_DOUBLE_OPTION(automaticLossScalingSettings.binEdgeLocation);

#undef ADD_POPART_STRING_OPTION
#undef ADD_POPART_UINT64_OPTION
#undef ADD_POPART_BOOL_OPTION
#undef ADD_POPART_DOUBLE_OPTION
#undef ADD_POPART_ENUM_OPTION
}

} // namespace detail

// Default constructor: allocates the detail::SessionOptionsImpl object that
// every other SessionOptions method forwards to through _impl.
SessionOptions::SessionOptions()
    : _impl(std::make_unique<detail::SessionOptionsImpl>()) {}

// Move constructor: takes over the implementation object held by src.
// NOTE(review): src._impl is moved from, so src presumably must not be used
// to set or query options afterwards - confirm against the _impl declaration.
SessionOptions::SessionOptions(SessionOptions &&src)
    : _impl(std::move(src._impl)) {}

// Sets the string-valued option named `option` to `value` using the setters
// registered in string_options.
// The template argument is given explicitly so the `const char *` value is
// converted to std::string. The trailing "string" is a description of the
// expected type passed to set() (presumably used in error messages - confirm
// in SessionOptionsImpl::set).
void SessionOptions::addStringOption(const char *option, const char *value) {
  _impl->set<std::string>(option, value, _impl->string_options, "string");
}

// Sets the unsigned integer option named `option` to `value` using the
// setters registered in uint64_options (this table also holds the enum
// options, which are registered as uint64 setters).
void SessionOptions::addUint64Option(const char *option, std::uint64_t value) {
  _impl->set(option, value, _impl->uint64_options, "uint64");
}

// Sets the boolean option named `option` to `value` using the setters
// registered in bool_options.
void SessionOptions::addBoolOption(const char *option, bool value) {
  _impl->set(option, value, _impl->bool_options, "bool");
}

// Sets the floating point option named `option` to `value` using the setters
// registered in double_options.
void SessionOptions::addDoubleOption(const char *option, double value) {
  _impl->set(option, value, _impl->double_options, "floating point");
}

// Adds a single string `value` to the set / vector option named `option`.
// Container setters all take a (key, value) pair, so the string is passed as
// the first element and the second element is filled with the value_not_set
// placeholder.
void SessionOptions::insertStringOption(const char *option, const char *value) {
  _impl->set(option, std::pair<std::string, std::string>(value, value_not_set),
             _impl->container_options, "set / vector");
}

// Adds the (key, value) entry to the map option named `option` using the
// setters registered in container_options.
void SessionOptions::insertStringPairOption(const char *option, const char *key,
                                            const char *value) {
  _impl->set(option, std::pair<std::string, std::string>(key, value),
             _impl->container_options, "map");
}

// Returns the current value of the PopTorch-specific broadcast_buffers option.
bool SessionOptions::broadcastBuffers() const {
  return _impl->poptorch_options.broadcast_buffers;
}

// Returns true when the PopTorch input_group_size is strictly smaller than
// PopART's replicatedGraphCount.
// NOTE(review): presumably this means each input is shared by a group of
// replicas smaller than the full replica set - confirm against the
// definition of input_group_size.
bool SessionOptions::hasInputReplication() const {
  return _impl->poptorch_options.input_group_size <
         _impl->popart_options.replicatedGraphCount;
}

// Forwards the memory proportion `memory` for IPU `ipu` to the implementation
// object; all handling is done in SessionOptionsImpl::setMemoryProportion.
void SessionOptions::setMemoryProportion(std::uint32_t ipu, float memory) {
  _impl->setMemoryProportion(ipu, memory);
}

void SessionOptions::setPatternsLevel(std::uint64_t level) {
  _impl->options_set.insert("patterns");
  ERROR_ON(level > static_cast<std::uint64_t>(popart::PatternsLevel::All));
  _impl->poptorch_options.patterns =
      popart::Patterns(static_cast<popart::PatternsLevel>(level));
}

// Enables (enabled == true) or disables (enabled == false) the pattern named
// `pattern` in the PopTorch patterns object.
void SessionOptions::addPattern(const char *pattern, bool enabled) {
  _impl->poptorch_options.patterns.enablePattern(pattern, enabled);
}

void SessionOptions::setTensorLocation(const char *tensor, const char *option,
                                       std::uint64_t value) {
  logging::debug("Setting {} to {} for location {}", option, value, tensor);
  std::string location_tensor{tensor};
  std::string opt{option};
  popart::TensorLocationSettings *settings;
  _impl->options_set.insert(location_tensor);
  if (location_tensor == "location_activation") {
    settings = &_impl->popart_options.activationTensorLocationSettings;
  } else if (location_tensor == "location_weight") {
    settings = &_impl->popart_options.weightTensorLocationSettings;
  } else if (location_tensor == "location_optimizer") {
    settings = &_impl->popart_options.optimizerStateTensorLocationSettings;
  } else if (location_tensor == "location_accumulator") {
    settings = &_impl->popart_options.accumulatorTensorLocationSettings;
  } else {
    ERROR("Unknown tensor location " << location_tensor);
  }

  if (opt == "minElementsForOffChip") {
    settings->minElementsForOffChip = value;
  } else if (opt == "minElementsForReplicatedTensorSharding") {
    settings->minElementsForReplicatedTensorSharding = value;
  } else if (opt == "onChip") {
    settings->location.storage = value > 0 ? popart::TensorStorage::OnChip
                                           : popart::TensorStorage::OffChip;
  } else if (opt == "useReplicatedTensorSharding") {
    settings->location.replicatedTensorSharding =
        value > 0 ? popart::ReplicatedTensorSharding::On
                  : popart::ReplicatedTensorSharding::Off;
  } else if (opt == "useIOTilesToLoad") {
    settings->location.loadTileSet =
        value > 0 ? popart::TileSet::IO : popart::TileSet::Compute;
  } else if (opt == "useIOTilesToStore") {
    settings->location.storageTileSet =
        value > 0 ? popart::TileSet::IO : popart::TileSet::Compute;
  } else {
    ERROR("Unknown option '" << opt << "' for tensor location "
                             << location_tensor);
  }
}

// Installs `logger` as PopART's compilationProgressLogger callback. The
// std::function is copied into the PopART options.
// NOTE(review): the two int arguments are presumably (progress, total) -
// confirm against the PopART SessionOptions documentation.
void SessionOptions::setCompilationProgressLogger(
    const std::function<void(int, int)> &logger) {
  _impl->popart_options.compilationProgressLogger = logger;
}

// Defaulted here, in the source file, rather than in the header.
// NOTE(review): presumably required because _impl points to
// detail::SessionOptionsImpl, which is only a complete type in this file -
// confirm against the header.
SessionOptions::~SessionOptions() = default;

} // namespace popart_compiler
} // namespace poptorch


================================================
FILE: popart_compiler/source/Utils.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <chrono>
#include <thread>

#include <popart/popx/devicex.hpp>
#include <popart/popx/devicexmanager.hpp>
#include <popart/tensorinfo.hpp>

#include "popart_compiler/CompilerImpl.hpp"
#include "popart_compiler/PopartEnums.hpp"
#include "popart_compiler/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

// These symbols exist in popart but are not declared publicly
// so they are re-declared here to be able to call them.
namespace ONNX_NAMESPACE {
// Opaque declaration: only the integer value of the enum is used in this file.
enum class TensorProto_DataType;
} // namespace ONNX_NAMESPACE

namespace popart {
namespace onnxutil {
// Converts an ONNX TensorProto data type (given as an integer) to the
// corresponding popart::DataType.
DataType getDataType(int);
// Converts a popart::DataType to the corresponding ONNX TensorProto data type.
ONNX_NAMESPACE::TensorProto_DataType getTPDataType(DataType data_type);
} // namespace onnxutil
} // namespace popart

namespace poptorch {
namespace popart_compiler {

// Returns true if the POPTORCH_IPU_MODEL environment variable is set to a
// non-zero integer, false if it is unset or set to 0.
// The value is parsed with std::stoi, which throws if it does not start with
// an integer.
bool ipuModelEnvironmentVariableIsEnabled() {
  const char *raw_value = std::getenv("POPTORCH_IPU_MODEL");
  if (raw_value == nullptr) {
    // Not set: the IPU model is not used.
    return false;
  }

  const bool enabled = std::stoi(raw_value) != 0;
  logging::info("From POPTORCH_IPU_MODEL environment variable: Ipu model: {}",
                enabled ? "Enabled" : "Disabled");
  return enabled;
}

// Returns true if the POPTORCH_SMALL_IPU_MODEL environment variable is set to
// a non-zero integer and the full IPU model is not enabled.
// The value is parsed with std::stoi, which throws if it does not start with
// an integer.
bool ipuSmallModelEnvironmentVariableIsEnabled() {
  // POPTORCH_IPU_MODEL takes precedence over the small model.
  if (ipuModelEnvironmentVariableIsEnabled()) {
    return false;
  }

  const char *raw_value = std::getenv("POPTORCH_SMALL_IPU_MODEL");
  if (raw_value == nullptr) {
    return false;
  }

  const bool enabled = std::stoi(raw_value) != 0;
  logging::info("From POPTORCH_SMALL_IPU_MODEL environment variable: small "
                "Ipu model: {}",
                enabled ? "Enabled" : "Disabled");
  return enabled;
}

// Returns the content of the POPTORCH_IPU_MODEL_VERSION environment variable,
// or "ipu2" if the variable is not set. The value is returned as is, without
// any validation.
std::string getIpuModelVersion() {
  const char *requested_version = std::getenv("POPTORCH_IPU_MODEL_VERSION");
  if (requested_version == nullptr) {
    return "ipu2"; // Default to MK2 if unspecified
  }
  return std::string(requested_version);
}

// Returns the number of tiles per IPU to use for the given IPU model version.
//
// - "ipu1" has 1216 tiles, "ipu2" and "ipu21" have 1472 tiles.
// - If the small IPU model is enabled the result is always 4.
// - For a version containing "ipu:" which is not matched above, 0 is returned.
// - Any other version raises an error.
int getNumTilesPerIpu(const std::string &ipu_model_version) {
  // 0 is used as the "version not recognised" marker.
  int num_tiles_per_ipu = 0;

  if (ipu_model_version == "ipu1") {
    num_tiles_per_ipu = 1216; // MK1
  } else if (ipu_model_version == "ipu2") {
    num_tiles_per_ipu = 1472; // MK2
  } else if (ipu_model_version == "ipu21") {
    num_tiles_per_ipu = 1472; // C600
  }

  // The small model overrides whatever the version requires.
  if (ipuSmallModelEnvironmentVariableIsEnabled()) {
    num_tiles_per_ipu = 4;
  }

  ERROR_ON_MSG((ipu_model_version.find("ipu:") == std::string::npos) &&
                   (num_tiles_per_ipu == 0),
               "Invalid IPU model version. Valid versions: ipu1, ipu2, ipu21.");
  return num_tiles_per_ipu;
}

// Round up the number of IPUs, if required, to the minimum number which needs
// to be reserved: the smallest power of 2 which is greater than or equal to
// num_ipus (1 if num_ipus is 0).
std::uint64_t roundUpNumIPUs(std::uint64_t num_ipus) {
  std::uint64_t power_of_two = 1;
  // Keep doubling until the requested number of IPUs fits.
  for (; power_of_two < num_ipus; power_of_two *= 2) {
  }
  return power_of_two;
}

// Returns true if the POPTORCH_WAIT_FOR_IPU environment variable is set to a
// non-zero integer, false if it is unset or set to 0.
// The value is parsed with std::stoi, which throws if it does not start with
// an integer.
bool waitIfIpuIsUnavailable() {
  const char *raw_value = std::getenv("POPTORCH_WAIT_FOR_IPU");
  if (raw_value == nullptr) {
    // Default: do not wait.
    return false;
  }

  const bool should_wait = std::stoi(raw_value) != 0;
  logging::info("From POPTORCH_WAIT_FOR_IPU environment variable: If no IPU "
                "is available: {}",
                should_wait ? "Wait" : "Fail & exit");
  return should_wait;
}

// Blocks the calling thread for 15 seconds, then returns true.
// Always returning true lets the call be used directly as a condition.
bool waitForAWhile() {
  constexpr std::chrono::seconds delay{15};
  logging::trace("No IPU available, sleeping for {} seconds", delay.count());
  std::this_thread::sleep_for(delay);
  return true;
}

// Returns the version number of the IPU hardware, based on the first device
// with num_ipus IPUs found on the system.
//
// Returns:
//   0  if an IPU model is enabled or if no matching device was found.
//   -1 if the architecture string of the device is not of the expected form.
//   N  if the architecture string is "ipuN".
std::int64_t ipuHardwareVersion(std::uint64_t num_ipus) {
  // No hardware is involved when running on a model.
  const bool using_model = ipuModelEnvironmentVariableIsEnabled() ||
                           ipuSmallModelEnvironmentVariableIsEnabled();
  if (using_model) {
    return 0;
  }

  auto devices = popart::DeviceManager::createDeviceManager().enumerateDevices(
      popart::SyncPattern::Full, num_ipus);
  if (devices.empty()) {
    return 0;
  }
  const std::string arch = devices.front()->getTarget().getTargetArchString();

  // The architecture string must be 'ipu' followed by one or more non-zero
  // digits.
  constexpr std::size_t prefix_length = 3;
  bool is_valid =
      arch.size() > prefix_length && arch.compare(0, prefix_length, "ipu") == 0;
  for (std::size_t pos = prefix_length; is_valid && pos < arch.size(); ++pos) {
    const char digit = arch[pos];
    is_valid = digit >= '1' && digit <= '9';
  }

  if (is_valid) {
    // Parse everything after the 'ipu' prefix.
    return std::atoi(arch.c_str() + prefix_length);
  }

  logging::warn("Unknown IPU version: {} (Expected 'ipuX' "
                " where X is one or more strictly positive digits)",
                arch);
  return -1;
}

// Returns a newly allocated, null-terminated copy of the characters of str.
// The buffer contains str.size() + 1 characters.
std::unique_ptr<char[]> stringToUniquePtr(const std::string &str) {
  const std::size_t length = str.size();
  // One extra character for the terminator.
  auto buffer = std::make_unique<char[]>(length + 1);
  str.copy(buffer.get(), length);
  buffer[length] = '\0';
  return buffer;
}

// Converts the name of a data type to the integer value of the matching ONNX
// TensorProto data type, going through the equivalent popart::DataType.
int64_t dtypeIntFromOnnxStr(const char *onnx_type) {
  const auto popart_type = popart::dataTypeFromString(onnx_type);
  const auto onnx_enum = popart::onnxutil::getTPDataType(popart_type);
  return static_cast<int64_t>(onnx_enum);
}

// Converts the integer value of an ONNX TensorProto data type to the name of
// the equivalent popart::DataType.
// The returned pointer refers to a string owned by PopART's data type info
// map and must not be freed by the caller.
const char *onnxStrFromDtypeInt(int64_t dtype) {
  const auto popart_type = popart::onnxutil::getDataType(dtype);

  // The info map is static so the c_str() remains valid
  const auto &type_infos = popart::getDataTypeInfoMap();
  const auto &type_info = type_infos.at(popart_type);
  return type_info.name().c_str();
}

// Converts a PopTorch PopartType to the equivalent poplar::Type, going
// through the matching popart::DataType.
poplar::Type poplarTypeFromPoptorch(PopartType type) {
  return popart::popx::popType(popartTypeFromPoptorch(type));
}

// Converts a PopTorch PopartType to the popart::DataType with the same name.
// Raises an error if the value is not one of the enumerators handled below.
popart::DataType popartTypeFromPoptorch(PopartType type) {
// Every supported PopartType maps to the popart::DataType enumerator with the
// same name.
#define POPTORCH_SAME_NAME_CASE(Name)                                          \
  case PopartType::Name:                                                       \
    return popart::DataType::Name

  switch (type) {
    POPTORCH_SAME_NAME_CASE(UINT8);
    POPTORCH_SAME_NAME_CASE(INT8);
    POPTORCH_SAME_NAME_CASE(UINT16);
    POPTORCH_SAME_NAME_CASE(INT16);
    POPTORCH_SAME_NAME_CASE(INT32);
    POPTORCH_SAME_NAME_CASE(INT64);
    POPTORCH_SAME_NAME_CASE(UINT32);
    POPTORCH_SAME_NAME_CASE(UINT64);
    POPTORCH_SAME_NAME_CASE(BOOL);
    POPTORCH_SAME_NAME_CASE(FLOAT);
    POPTORCH_SAME_NAME_CASE(FLOAT16);
    POPTORCH_SAME_NAME_CASE(BFLOAT16);
    POPTORCH_SAME_NAME_CASE(FLOAT8_143);
    POPTORCH_SAME_NAME_CASE(FLOAT8_152);
    POPTORCH_SAME_NAME_CASE(DOUBLE);
    POPTORCH_SAME_NAME_CASE(COMPLEX64);
    POPTORCH_SAME_NAME_CASE(COMPLEX128);
    POPTORCH_SAME_NAME_CASE(STRING);
    POPTORCH_SAME_NAME_CASE(UNDEFINED);
  default:
    ERROR("Unsupported type in popartTypeFromPoptorchType");
  }

#undef POPTORCH_SAME_NAME_CASE

  // Fallback value after the error in the default case.
  return popart::DataType::UNDEFINED;
}
} // namespace popart_compiler
} // namespace poptorch


================================================
FILE: popart_compiler/source/custom_operations/Embedding.cpp
================================================
// Copyright (c) 2021, Graphcore Ltd, All rights reserved.

#include <map>
#include <memory>
#include <set>
#include <tuple>
#include <vector>

#include "popart_compiler/CustomOps.hpp"
#include <popart/op.hpp>
#include <popart/op/gather.hpp>
#include <popart/opmanager.hpp>
#include <popart/opserialiser.hpp>
#include <popart/popx/irlowering.hpp>
#include <popart/popx/op/gatherx.hpp>
#include <popart/popx/op/sliceplanx.hpp>
#include <popart/popx/opx.hpp>
#include <popart/popx/opxmanager.hpp>
#include <popart/vendored/optional.hpp>

#include <popops/DynamicSlice.hpp>
#include <popops/Zero.hpp>

namespace poptorch {
namespace poptorch_custom_ops {

class EmbeddingGradOp;

// EmbeddingOp needs to be convertible to popart::GatherOp so that the tied
// gather pattern can match this implementation.
// Forward op of the embedding lookup: a gather along axis 0 of the weight
// tensor (input 0) using the indices tensor (input 1), with an optional
// padding index which is forwarded to the gradient op.
class EmbeddingOp : public popart::GatherOp {
public:
  // padding_idx: optional index of the row of the weight used for padding.
  // available_memory_proportion_: optional hint forwarded to the GatherOp
  //                               base class.
  EmbeddingOp(const popart::OperatorIdentifier &_opid,
              const nonstd::optional<int64_t> &padding_idx,
              const nonstd::optional<float> &available_memory_proportion_,
              const popart::Op::Settings &settings_)
      : popart::GatherOp(_opid, /*axis=*/0, /*groupSize=*/1, settings_,
                         available_memory_proportion_),
        _padding_idx(padding_idx) {}

  // Returns a copy of this op.
  std::unique_ptr<popart::Op> clone() const final {
    return std::make_unique<EmbeddingOp>(*this);
  }

  // The backward pass consists of a single EmbeddingGradOp.
  // NOTE: EmbeddingGradOp is only forward declared at this point in the file;
  // keep this in mind before changing how the op is created / stored here.
  std::vector<std::unique_ptr<popart::Op>> getGradOps() final {
    std::vector<std::unique_ptr<popart::Op>> result;
    result.emplace_back(std::make_unique<EmbeddingGradOp>(*this));
    return result;
  }

  // Indices of the inputs and output of the op.
  static popart::InIndex weightInIndex() { return 0; }
  static popart::InIndex indicesInIndex() { return 1; }
  static popart::OutIndex outIndex() { return 0; }

  // Adds the padding index to the attributes of the GatherOp so that it is
  // part of the serialised description of the op.
  void appendOutlineAttributes(popart::OpSerialiserBase &os) const final {
    popart::GatherOp::appendOutlineAttributes(os);
    os.appendAttribute("padding_idx", paddingIndex());
  }

  // Returns the padding index, if one was provided.
  nonstd::optional<int64_t> paddingIndex() const { return _padding_idx; }

private:
  // Not set if the op was created without a padding index.
  nonstd::optional<int64_t> _padding_idx;
};

class EmbeddingGradOp : public popart::Op {
public:
  explicit EmbeddingGradOp(const EmbeddingOp &fwd_op)
      : popart::Op(poptorch_custom_ops::embedding_grad, fwd_op.getSettings()),
        _padding_idx(fwd_op.paddingIndex()),
        _available_memory_proportion(fwd_op.getAvailableMemoryProportion()),
        _wieght_info(fwd_op.inInfo(EmbeddingOp::weightInIndex())) {}

  std::unique_ptr<popart::Op> clone() const final {
    return std::make_unique<EmbeddingGradOp>(*this);
  }

  const std::vector<popart::GradInOutMapper> &gradInputInfo() const final {
    static const std::vector<popart::GradInOutMapper> info = {
        {gradInIndex(), EmbeddingOp::outIndex(), popart::GradOpInType::GradOut},
        {indicesInIndex(), EmbeddingOp::indicesInIndex(),
         popart::GradOpInType::In}};

    return info;
  }

  const std::map<int, int> &gradOutToNonGradIn() const final {
    static const std::map<int, int> out = {
        {gradOutIndex(), EmbeddingOp::weightInIndex()}};

    return out;
  }

  void setup() final { outInfo(gradOutIndex()) = _wieght_info; }

  static popart::InIndex gradInIndex() { return 0; }
  static popart::InIndex indicesInIndex() { return 1; }
  static popart::OutIndex gradOutIndex() { return 0; }

  void appendOutlineAttributes(popart::OpSerialiserBase &os) const final {
    popart::Op::appendOutlineAttributes(os);
    os.appendAttribute("padding_idx", paddingIndex());
    os.appendAttribute(popart::sAvailMemAttribute, availableMemoryProportion());
  }

  float getSubgraphValue() const final { return getLowSubgraphValue(); }

  nonstd::optional<int64_t> paddingIndex() const { return _padding_idx; }

  nonstd::optional<float> availableMemoryProportion() const {
    return _available_memory_proportion;
  }

private:
  nonstd::optional<int64_t> _padding_idx;
  nonstd::optional<float> _available_memory_proportion;
  popart::TensorInfo _wieght_info;
};

// Registration of the embedding op with PopART. The objects below are
// initialised in order when the library is loaded: the data type lists must
// stay defined before the op definition which uses them.
namespace {
// Data types accepted for the weight input (and produced as output).
popart::OpDefinition::DataTypes weight_dtypes = {
    popart::DataType::UINT8,   popart::DataType::UINT16,
    popart::DataType::UINT32,  popart::DataType::UINT64,
    popart::DataType::INT8,    popart::DataType::INT16,
    popart::DataType::INT32,   popart::DataType::INT64,
    popart::DataType::FLOAT16, popart::DataType::FLOAT};

// Data types accepted for the indices input: integers only.
popart::OpDefinition::DataTypes indices_dtypes = {
    popart::DataType::UINT8,  popart::DataType::UINT16,
    popart::DataType::UINT32, popart::DataType::UINT64,
    popart::DataType::INT8,   popart::DataType::INT16,
    popart::DataType::INT32,  popart::DataType::INT64};

// Inputs, outputs and attributes of the embedding op.
popart::OpDefinition
    embedding_def({popart::OpDefinition::Inputs({{"weight", weight_dtypes},
                                                 {"indices", indices_dtypes}}),
                   popart::OpDefinition::Outputs({{"output", weight_dtypes}}),
                   popart::OpDefinition::Attributes({
                       {"padding_idx", {"*"}},
                   })});

// Factory creating an EmbeddingOp from the attributes of the node.
popart::OpCreator<EmbeddingOp> embedding_creator(
    popart::OpDefinitions({{poptorch_custom_ops::embedding, embedding_def}}),
    [](const popart::OpCreatorInfo &info) {
      // Both attributes are optional: they are left unset if not present.
      nonstd::optional<int64_t> padding_idx;

      if (info.attributes.hasAttribute("padding_idx")) {
        padding_idx = info.attributes.getAttribute<popart::Attributes::Int>(
            "padding_idx");
      }

      nonstd::optional<float> available_memory_proportion;

      if (info.attributes.hasAttribute(popart::sAvailMemAttribute)) {
        available_memory_proportion =
            info.attributes.getAttribute<popart::Attributes::Float>(
                popart::sAvailMemAttribute);
      }

      return std::unique_ptr<popart::Op>(new EmbeddingOp(
          info.opid, padding_idx, available_memory_proportion, info.settings));
    },
    true);

} // namespace

// Poplar implementation of EmbeddingOp, using popops::multiSlice with a
// slice plan created when the Opx is constructed.
class EmbeddingOpx : public popart::popx::Opx {
public:
  // Checks that op is an EmbeddingOp and creates the slice plan shared by
  // grow() and createInputTensor().
  EmbeddingOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp<EmbeddingOp>(op, {poptorch_custom_ops::embedding});

    // We always want the EmbeddingOpx to layout its inputs
    inputCreatorPriority = std::numeric_limits<double>::max();

    auto plan_options = popart::popx::createSlicePlanOptions(
        popart::popx::SlicePlanUsedFor::Slice,
        getOp<EmbeddingOp>().getAvailableMemoryProportion());

    _plan = popart::popx::createSlicePlan(
        graph(), inInfo(EmbeddingOp::weightInIndex()),
        inInfo(EmbeddingOp::indicesInIndex()), plan_options, /*axis=*/0);
  }

  // Adds the lookup to prog: slices the rows of the weight selected by the
  // indices and reshapes the result to the shape of the output.
  void grow(poplar::program::Sequence &prog) const final {
    auto table = getInTensor(EmbeddingOp::weightInIndex());

    // Assume non-negative indices.
    // multiSlice expects the indices as a [num_lookups, 1] unsigned tensor.
    auto lookup_indices = getInTensor(EmbeddingOp::indicesInIndex())
                              .reinterpret(poplar::UNSIGNED_INT)
                              .flatten()
                              .expand({1});

    auto rows = popops::multiSlice(graph(), table, lookup_indices, {0}, {1},
                                   prog, _plan, poplar::OptionFlags());

    setOutTensor(EmbeddingOp::outIndex(),
                 rows.reshape(outInfo(EmbeddingOp::outIndex()).shape_szt()));
  }

  // Creates the weight or indices input with a layout matching the slice
  // plan. Throws a popart::error for any other input index.
  poplar::Tensor
  createInputTensor(popart::InIndex index,
                    const poplar::DebugNameAndId &dnai) const final {
    if (index == EmbeddingOp::weightInIndex()) {
      const auto &weight_info = inInfo(index);
      return popops::createSliceableTensor(
          graph(), popart::popx::popType(weight_info), weight_info.shape_szt(),
          {0}, {1}, _plan, poplar::OptionFlags(), dnai);
    }

    if (index == EmbeddingOp::indicesInIndex()) {
      const auto &indices_info = inInfo(index);
      const auto num_lookups = static_cast<std::size_t>(indices_info.nelms());
      auto indices = popops::createIndicesTensor(
          graph(), {0}, num_lookups, _plan, poplar::OptionFlags(), dnai);

      // Convert the tensor to the type and shape expected for the input.
      indices = indices.reinterpret(popart::popx::popType(indices_info));
      return indices.reshape(indices_info.shape_szt());
    }

    throw popart::error(
        "EmbeddingOpx::createInputTensor : Invalid index = {}", index);
  }

  // Both inputs of the op can be created by this Opx.
  popart::popx::InputCreatorType
  getInputCreatorType(popart::InIndex index) const final {
    const bool can_create = index == EmbeddingOp::weightInIndex() ||
                            index == EmbeddingOp::indicesInIndex();
    if (can_create) {
      return popart::popx::InputCreatorType::CanCreate;
    }

    return Opx::getInputCreatorType(index);
  }

  // No other tensor is needed to create the inputs.
  std::set<popart::TensorId>
  mustExistBeforeCreate(popart::InIndex index) const final {
    (void)index; // unused
    return {};
  }

private:
  // Plan used for the slice and to lay out the inputs.
  popops::SlicePlan _plan;
};

// Poplar implementation of EmbeddingGradOp: accumulates the incoming gradient
// into a zero-initialised tensor with the shape of the weight using
// popops::multiUpdateAdd, then zeroes the row of the padding index (if any).
class EmbeddingGradOpx : public popart::popx::Opx {
public:
  // Checks that op is an EmbeddingGradOp, stores its padding index and
  // creates the slice plan shared by grow() and createInputTensor().
  EmbeddingGradOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp<EmbeddingGradOp>(op, {poptorch_custom_ops::embedding_grad});

    // We always want the EmbeddingGradOpx to layout its inputs
    inputCreatorPriority = std::numeric_limits<double>::max();

    // Bind by reference: only two attributes are read from the op, there is
    // no need to copy it.
    const auto &grad_op = getOp<EmbeddingGradOp>();
    _padding_idx = grad_op.paddingIndex();

    auto options = popart::popx::createSlicePlanOptions(
        popart::popx::SlicePlanUsedFor::UpdateAdd,
        grad_op.availableMemoryProportion());

    _plan = popart::popx::createSlicePlan(
        graph(), outInfo(EmbeddingGradOp::gradOutIndex()),
        inInfo(EmbeddingGradOp::indicesInIndex()), options, /*axis=*/0);
  }

  // Adds the computation of the gradient of the weight to prog.
  void grow(poplar::program::Sequence &prog) const final {
    auto grad_in = getInTensor(EmbeddingGradOp::gradInIndex());
    auto indices = getInTensor(EmbeddingGradOp::indicesInIndex());
    auto output_shape = outInfo(EmbeddingGradOp::gradOutIndex()).shape_szt();

    // The output starts as zeros: rows which are never looked up get no
    // gradient.
    auto out = popops::createSliceableTensor(
        graph(), grad_in.elementType(), output_shape, {0}, {1}, _plan,
        poplar::OptionFlags(), debugContext("embedding_grad_out"));

    popops::zero(graph(), out, prog, debugContext("zero"));

    // The updates are added unscaled.
    auto scale = graph().addConstant(grad_in.elementType(), {}, 1.0f,
                                     debugContext("const_1"));
    graph().setTileMapping(scale, 0);

    // Reuse the helper of the GatherGradOpx to get the target, update and
    // indices tensors in the form passed to multiUpdateAdd.
    auto inputs = popart::popx::GatherGradOpx::handleNDMultiUpdate(
        out, grad_in, indices, 0, 1);
    auto &target_nd = std::get<0>(inputs);
    auto &update_nd = std::get<1>(inputs);
    auto &indices_nd = std::get<2>(inputs);

    popops::multiUpdateAdd(graph(), target_nd, update_nd, indices_nd, scale,
                           {0}, {1}, prog, _plan, poplar::OptionFlags(),
                           debugContext("embedding_grad"));

    // The row of the padding index must not receive any gradient.
    if (_padding_idx) {
      auto start = static_cast<std::size_t>(*_padding_idx);
      auto padding = out.slice(start, start + 1, 0);
      popops::zero(graph(), padding, prog, debugContext("zero_padding_idx"));
    }

    setOutTensor(EmbeddingGradOp::gradOutIndex(), out);
  }

  // Creates the gradient or indices input with a layout matching the slice
  // plan. Throws a popart::error for any other input index.
  poplar::Tensor
  createInputTensor(popart::InIndex index,
                    const poplar::DebugNameAndId &dnai) const final {
    if (index != EmbeddingGradOp::gradInIndex() &&
        index != EmbeddingGradOp::indicesInIndex()) {
      throw popart::error(
          "EmbeddingGradOpx::createInputTensor : Invalid index = {}", index);
    }

    if (index == EmbeddingGradOp::gradInIndex()) {
      const auto &grad_info = inInfo(index);
      auto grad_in = popops::createSliceableTensor(
          graph(), popart::popx::popType(grad_info), grad_info.shape_szt(), {0},
          {1}, _plan, poplar::OptionFlags(), dnai);

      return grad_in;
    }

    const auto &indices_info = inInfo(index);
    auto num_lookups = static_cast<std::size_t>(indices_info.nelms());
    auto indices = popops::createIndicesTensor(graph(), {0}, num_lookups, _plan,
                                               poplar::OptionFlags(), dnai);

    // Convert the tensor to the type and shape expected for the input.
    indices = indices.reinterpret(popart::popx::popType(indices_info));
    indices = indices.reshape(indices_info.shape_szt());
    return indices;
  }

  // Both inputs of the op can be created by this Opx.
  popart::popx::InputCreatorType
  getInputCreatorType(popart::InIndex index) const final {
    if (index == EmbeddingGradOp::gradInIndex() ||
        index == EmbeddingGradOp::indicesInIndex()) {
      return popart::popx::InputCreatorType::CanCreate;
    }

    return Opx::getInputCreatorType(index);
  }

  // No other tensor is needed to create the inputs.
  std::set<popart::TensorId>
  mustExistBeforeCreate(popart::InIndex index) const final {
    (void)index; // unused
    return {};
  }

private:
  // Padding index of the op, if any.
  nonstd::optional<int64_t> _padding_idx;
  // Plan used for the update and to lay out the inputs.
  popops::SlicePlan _plan;
};

// Registration of the Poplar implementations: associates each operator
// identifier with the Opx class used to lower it.
namespace {
popart::popx::OpxCreator<EmbeddingOpx>
    embedding_opx(poptorch_custom_ops::embedding);
popart::popx::OpxCreator<EmbeddingGradOpx>
    embedding_grad_opx(poptorch_custom_ops::embedding_grad);
} // namespace

} // namespace poptorch_custom_ops
} // namespace poptorch


================================================
FILE: popart_compiler/source/custom_operations/FastGatherLastDim.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include <functional>

#include <ostream>
#include <popart/popx/opxmanager.hpp>
#include <poputil/VertexTemplates.hpp>
#include <poputil/exceptions.hpp>

#include "FastGatherLastDim.hpp"
#include "popart_compiler/CodeletsCompilation.hpp"
#include "popart_compiler/CustomOps.hpp"
#include "popart_compiler/Utils.hpp"

namespace poptorch {
namespace poptorch_custom_ops {

// Creates the op. The gather is always done on the last dimension, so the
// axis is set to -1; debug_str is stored for use in error messages.
FastGatherLastDimOp::FastGatherLastDimOp(
    const popart::OperatorIdentifier &opid_,
    const popart::Op::Settings &settings_, const std::string &debug_str)
    : popart::Op(opid_, settings_) {
  _axis = -1;
  _debug_str = debug_str;
}

// The backward pass consists of a single FastGatherLastDimGradOp created
// from this op.
std::vector<std::unique_ptr<popart::Op>> FastGatherLastDimOp::getGradOps() {
  std::vector<std::unique_ptr<popart::Op>> grad_ops;
  grad_ops.emplace_back(std::make_unique<FastGatherLastDimGradOp>(*this));
  return grad_ops;
}

// Returns a copy of this op.
std::unique_ptr<popart::Op> FastGatherLastDimOp::clone() const {
  return std::make_unique<FastGatherLastDimOp>(*this);
}

void FastGatherLastDimOp::setup() {
  if (popart_compiler::ipuModelEnvironmentVariableIsEnabled() ||
      popart_compiler::ipuSmallModelEnvironmentVariableIsEnabled()) {
    throw popart::error(
        "FastGatherLastDimOp requires hardware but IPU model is enabled");
  }

  popart::Shape data_shape = this->inInfo(0).shape();
  popart::Shape idx_shape = this->inInfo(1).shape();
  popart::Shape out_shape = data_shape;

  // idx rank and data rank should be the same
  if (data_shape.size() != idx_shape.size()) {
    throw popart::error(
        "FastGatherLastDimOp::setup(), "
        "Input and Index tensors do not have same rank in Op {}",
        this->getDebugStr());
  }

  // idx should have same dimensions as data except for last dim
  const int data_rank = static_cast<int>(data_shape.size());
  for (unsigned i = 0; i < data_shape.size() - 1; i++) {
    if (idx_shape[i] != data_shape[i]) {
      throw popart::error("FastGatherLastDimOp::setup(), "
                          "Index tensor must have same dimensions as Input "
                          "except for last dim. Op {}",
                          this->getDebugStr());
    }
  }

  int axis = this->_axis;
  if (axis < 0) {
    axis = data_rank + axis;
  }
  for (unsigned i = 0; i < data_shape.size(); i++) {
    if (static_cast<unsigned>(axis) != i) {
      out_shape[i] = data_shape[i];
    }
  }

  out_shape[axis] = idx_shape[axis];
  this->_in_shape = data_shape;
  this->_out_shape = out_shape;
  this->outInfo(0) = {this->inInfo(0).dataType(), out_shape};
}

// Register the forward op with popart.

// Element types accepted for the data input (also used for the output).
static popart::OpDefinition::DataTypes fast_gather_op_data_tensor_type = {
    popart::DataType::FLOAT16, popart::DataType::FLOAT};
// Element types accepted for the index input.
// NOTE(review): INT16 is listed here, but the codelet sources only explicitly
// instantiate the vertices for `int` indices (FastGatherVertex<float, int>,
// FastGatherVertex<half, int> and the matching grad vertices) — confirm that
// INT16 indices are really supported end to end.
static popart::OpDefinition::DataTypes fast_gather_op_idx_tensor_type = {
    popart::DataType::INT32, popart::DataType::INT16};

// Op signature: inputs "data" and "index", one output "out" with the data
// element types, and no declared attributes.
static popart::OpDefinition fast_gather_op_def(
    {popart::OpDefinition::Inputs({
         {"data", fast_gather_op_data_tensor_type},
         {"index", fast_gather_op_idx_tensor_type},
     }),
     popart::OpDefinition::Outputs({{"out", fast_gather_op_data_tensor_type}}),
     popart::OpDefinition::Attributes({})});

// Factory building a FastGatherLastDimOp from the creator info. The optional
// string attribute "debug_str" is read with "fast_gather_last_dim" as the
// fallback value and forwarded to the op constructor.
static popart::OpCreator<FastGatherLastDimOp> fast_gather_op_creator(
    popart::OpDefinitions({{poptorch_custom_ops::fast_gather_last_dim,
                            fast_gather_op_def}}),
    [](const popart::OpCreatorInfo &info) -> std::unique_ptr<popart::Op> {
      popart::OperatorIdentifier const &opid = info.opid;
      popart::Op::Settings const &settings = info.settings;
      popart::Attributes const &attr = info.attributes;
      std::string const debug_str =
          attr.getAttribute<popart::Attributes::String>("debug_str",
                                                        "fast_gather_last_dim");
      return std::unique_ptr<popart::Op>(
          new FastGatherLastDimOp(opid, settings, debug_str));
    },
    // Presumably the OpCreator "is public" flag — TODO confirm against
    // popart/opmanager.hpp.
    true);

// Builds the Opx for the forward gather: checks that `op` is a
// FastGatherLastDimOp, compiles the forward codelet source if needed and adds
// the resulting codelets to the graph.
FastGatherLastDimOpx::FastGatherLastDimOpx(popart::Op *op,
                                           popart::popx::Devicex *devicex)
    : popart::popx::Opx(op, devicex) {
  verifyOp<FastGatherLastDimOp>(op, poptorch_custom_ops::fast_gather_last_dim);

  // Get around the ABI issues: the path comes back as a managed raw buffer
  // rather than a std::string. The holder stays alive until the path has been
  // copied into a std::string.
  auto codelet_path_holder = popart_compiler::compileCustomCodeletIfNeeded(
      "FastGatherLastDimFwdCodelets.inc.cpp", /*hw_only_codelet=*/true);
  const std::string codelet_path(
      static_cast<const char *>(codelet_path_holder.get()));
  graph().addCodelets(codelet_path);
}

// Adds the forward gather to `prog`: input 0 is the data tensor, input 1 the
// index tensor, and output 0 is set to the gathered result.
void FastGatherLastDimOpx::grow(poplar::program::Sequence &prog) const {
  const poplar::Tensor data = getInTensor(0);
  const poplar::Tensor indices = getInTensor(1);

  // Output shape computed by the op's setup(), converted element by element
  // to the std::size_t dimensions expected by addGraphProg.
  const popart::Shape op_out_shape =
      getOp<FastGatherLastDimOp>().getOutShape();
  const std::vector<std::size_t> out_dims(op_out_shape.begin(),
                                          op_out_shape.end());

  setOutTensor(0, addGraphProg(graph(), prog, data, indices, out_dims));
}

// Builds the forward gather in `graph` and appends it to `prog`.
//
// The data, index and output tensors are each viewed as 2D
// {alloc_cnt, last dimension}, where alloc_cnt is the product of all leading
// dimensions. Rows are distributed over the tiles in contiguous runs; each
// tile that owns at least one row gets one FastGatherVertex working on its
// rows. The inputs are first copied into freshly mapped clones so the vertex
// operands live on the tile that runs the vertex.
//
// graph          graph to add variables, the compute set and vertices to.
// prog           sequence receiving the input copies and the compute set.
// data_tensor    tensor to gather from.
// idx_tensor     indices into the last dimension of data_tensor.
// fwd_out_shape  shape of the result; must have the rank of idx_tensor.
//
// Returns the newly created output tensor.
poplar::Tensor FastGatherLastDimOpx::addGraphProg(
    poplar::Graph &graph, poplar::program::Sequence &prog,
    const poplar::Tensor &data_tensor, const poplar::Tensor &idx_tensor,
    const std::vector<std::size_t> &fwd_out_shape) {

  poplar::Tensor output_tensor =
      graph.addVariable(data_tensor.elementType(), fwd_out_shape, "sel_out");
  const unsigned num_tiles = graph.getTarget().getNumTiles();
  const unsigned last_dim = idx_tensor.rank() - 1;

  // Number of rows: the product of every dimension but the last.
  std::size_t alloc_cnt = 1;
  for (unsigned i = 0; i < last_dim; i++) {
    alloc_cnt *= fwd_out_shape[i];
  }
  const std::size_t in_row_size = data_tensor.dim(last_dim);
  const std::size_t out_row_size = fwd_out_shape[last_dim];

  poplar::ComputeSet const gather_cs = graph.addComputeSet("FastGatherCS");
  // First row and number of rows owned by each tile.
  std::vector<unsigned> tile_start(num_tiles, 0);
  std::vector<unsigned> tile_count(num_tiles, 0);

  poplar::Tensor const data_tensor_clone = graph.clone(data_tensor);
  poplar::Tensor const data_tensor_reshape =
      data_tensor_clone.reshape({alloc_cnt, in_row_size});

  poplar::Tensor const idx_tensor_clone = graph.clone(idx_tensor);
  poplar::Tensor const idx_tensor_reshape =
      idx_tensor_clone.reshape({alloc_cnt, out_row_size});
  poplar::Tensor const result_tensor_reshape =
      output_tensor.reshape({alloc_cnt, out_row_size});

  for (std::size_t i = 0; i < alloc_cnt; ++i) {
    // The tile index never decreases with i, so every tile receives one
    // contiguous run of rows.
    std::size_t const idx = (i * num_tiles) / alloc_cnt;
    graph.setTileMapping(data_tensor_reshape[i], idx);
    graph.setTileMapping(idx_tensor_reshape[i], idx);
    graph.setTileMapping(result_tensor_reshape[i], idx);
    if (tile_count[idx] == 0) {
      // First row mapped to this tile.
      tile_start[idx] = static_cast<unsigned>(i);
    }
    tile_count[idx] += 1;
  }
  prog.add(poplar::program::Copy(data_tensor, data_tensor_clone));
  prog.add(poplar::program::Copy(idx_tensor, idx_tensor_clone));

  for (unsigned i = 0; i < num_tiles; ++i) {
    if (0 == tile_count[i]) {
      continue;
    }
    const unsigned row_begin = tile_start[i];
    const unsigned row_end = row_begin + tile_count[i];

    poplar::VertexRef const gather_vertex = graph.addVertex(
        gather_cs,
        poputil::templateVertex("FastGatherVertex", data_tensor.elementType(),
                                idx_tensor.elementType()),
        {{"data_", data_tensor_reshape.slice(row_begin, row_end)},
         {"idx_", idx_tensor_reshape.slice(row_begin, row_end)},
         {"result_", result_tensor_reshape.slice(row_begin, row_end)}});
    graph.setTileMapping(gather_vertex, i);
    graph.setInitialValue(gather_vertex["dst_shape_"], fwd_out_shape);
  }

  prog.add(poplar::program::Execute(gather_cs));

  return output_tensor;
}

// Builds the gradient op from its forward op: reuses the forward op's
// settings and records its input shape and debug string. The axis is the last
// dimension (-1), as in the forward op.
FastGatherLastDimGradOp::FastGatherLastDimGradOp(
    const FastGatherLastDimOp &fwdOp)
    : popart::Op(poptorch_custom_ops::fast_gather_last_dim_grad,
                 fwdOp.getSettings()),
      _axis(-1), _fwd_in_shape(fwdOp.getInShape()),
      _debug_str(fwdOp.getDebugStr()) {}

// Returns a heap-allocated copy of this gradient op, produced by the copy
// constructor.
std::unique_ptr<popart::Op> FastGatherLastDimGradOp::clone() const {
  return std::make_unique<FastGatherLastDimGradOp>(*this);
}

// Builds the Opx for the gather gradient: checks that `op` is a
// FastGatherLastDimGradOp, compiles the backward codelet source if needed and
// adds the resulting codelets to the graph.
FastGatherLastDimGradOpx::FastGatherLastDimGradOpx(
    popart::Op *op, popart::popx::Devicex *devicex)
    : popart::popx::Opx(op, devicex) {
  verifyOp<FastGatherLastDimGradOp>(
      op, poptorch_custom_ops::fast_gather_last_dim_grad);

  // Get around the ABI issues: the path comes back as a managed raw buffer
  // rather than a std::string. The holder stays alive until the path has been
  // copied into a std::string.
  auto codelet_path_holder = popart_compiler::compileCustomCodeletIfNeeded(
      "FastGatherLastDimBwdCodelets.inc.cpp", /*hw_only_codelet=*/true);
  const std::string codelet_path(
      static_cast<const char *>(codelet_path_holder.get()));
  graph().addCodelets(codelet_path);
}

// Adds the gather gradient to `prog`. Input 0 is the gradient of the forward
// output and input 1 the forward index tensor. Output 0 is a tensor with the
// forward input's shape, initialised to zero and then updated in place by the
// gradient vertices.
void FastGatherLastDimGradOpx::grow(poplar::program::Sequence &prog) const {
  poplar::Tensor const grad_out = getInTensor(0);
  poplar::Tensor const indices = getInTensor(1);

  const popart::Shape fwd_in_shape =
      getOp<FastGatherLastDimGradOp>().getFwdInShape();
  const std::size_t rank = fwd_in_shape.size();
  const std::vector<std::size_t> fwd_in_dims(fwd_in_shape.begin(),
                                             fwd_in_shape.end());

  // Start from a scalar zero, give it one singleton dimension per dimension
  // of the forward input, then broadcast each dimension to its full size.
  auto zero = getScalarVariable(grad_out.elementType(), "zero");
  graph().setInitialValue(zero, 0);
  auto zeros = zero;
  for (std::size_t d = 0; d < rank; ++d) {
    zeros = zeros.expand({0});
  }
  for (unsigned d = 0; d < rank; ++d) {
    zeros = zeros.broadcast(static_cast<unsigned>(fwd_in_shape[d]), d);
  }

  // Writable copy of the broadcast zeros; it is accumulated into in place.
  auto grad_in = cloneNcopy(prog, zeros);

  // The returned tensor is the one passed in, so the result is not needed.
  addGraphProg(graph(), prog, grad_out, grad_in, fwd_in_dims, indices);

  setOutTensor(0, grad_in);
}

// Builds the gather gradient in `graph` and appends it to `prog`.
//
// The output gradient, the indices and the input gradient are each viewed as
// 2D {alloc_cnt, last dimension}, where alloc_cnt is the product of all
// leading dimensions of the output gradient. Rows are distributed over the
// tiles in contiguous runs; each tile that owns at least one row gets one
// FastGatherGradVertex that updates its rows of grad_input_tensor in place.
//
// graph               graph to add the compute set and vertices to.
// prog                sequence receiving the input copies and the compute set.
// grad_output_tensor  gradient of the forward output.
// grad_input_tensor   tensor the vertices accumulate into; it is re-mapped
//                     row by row onto the tiles running the vertices.
// fwd_in_shape        shape of the forward input (and of grad_input_tensor).
// idx_tensor          forward index tensor.
//
// Returns grad_input_tensor.
poplar::Tensor FastGatherLastDimGradOpx::addGraphProg(
    poplar::Graph &graph, poplar::program::Sequence &prog,
    const poplar::Tensor &grad_output_tensor, poplar::Tensor &grad_input_tensor,
    const std::vector<std::size_t> &fwd_in_shape,
    const poplar::Tensor &idx_tensor) {

  const unsigned num_tiles = graph.getTarget().getNumTiles();
  const unsigned last_dim = grad_output_tensor.rank() - 1;

  // Number of rows: the product of every dimension but the last.
  std::size_t alloc_cnt = 1;
  for (unsigned i = 0; i < last_dim; i++) {
    alloc_cnt *= grad_output_tensor.dim(i);
  }
  const auto grad_output_shape = grad_output_tensor.shape();
  const std::size_t grad_out_row_size = grad_output_shape[last_dim];
  const std::size_t grad_in_row_size = fwd_in_shape[last_dim];

  poplar::ComputeSet const gather_grad_cs =
      graph.addComputeSet("FastGatherGradCS");
  // First row and number of rows owned by each tile.
  std::vector<unsigned> tile_start(num_tiles, 0);
  std::vector<unsigned> tile_count(num_tiles, 0);

  poplar::Tensor const grad_output_tensor_clone =
      graph.clone(grad_output_tensor);

  poplar::Tensor const grad_output_tensor_reshape =
      grad_output_tensor_clone.reshape({alloc_cnt, grad_out_row_size});

  poplar::Tensor const idx_tensor_clone = graph.clone(idx_tensor);
  poplar::Tensor const idx_tensor_reshape =
      idx_tensor_clone.reshape({alloc_cnt, grad_out_row_size});

  poplar::Tensor const grad_input_tensor_reshape =
      grad_input_tensor.reshape({alloc_cnt, grad_in_row_size});

  for (std::size_t i = 0; i < alloc_cnt; ++i) {
    // The tile index never decreases with i, so every tile receives one
    // contiguous run of rows.
    std::size_t const idx = (i * num_tiles) / alloc_cnt;
    graph.setTileMapping(grad_output_tensor_reshape[i], idx);
    graph.setTileMapping(idx_tensor_reshape[i], idx);
    graph.setTileMapping(grad_input_tensor_reshape[i], idx);
    if (tile_count[idx] == 0) {
      // First row mapped to this tile.
      tile_start[idx] = static_cast<unsigned>(i);
    }
    tile_count[idx] += 1;
  }
  prog.add(poplar::program::Copy(idx_tensor, idx_tensor_clone));
  prog.add(poplar::program::Copy(grad_output_tensor, grad_output_tensor_clone));

  for (unsigned i = 0; i < num_tiles; ++i) {
    if (0 == tile_count[i]) {
      continue;
    }
    const unsigned row_begin = tile_start[i];
    const unsigned row_end = row_begin + tile_count[i];

    poplar::VertexRef const gather_vertex = graph.addVertex(
        gather_grad_cs,
        poputil::templateVertex("FastGatherGradVertex",
                                grad_output_tensor.elementType(),
                                idx_tensor.elementType()),
        {{"grad_out_", grad_output_tensor_reshape.slice(row_begin, row_end)},
         {"idx_", idx_tensor_reshape.slice(row_begin, row_end)},
         {"grad_in_", grad_input_tensor_reshape.slice(row_begin, row_end)}});
    graph.setTileMapping(gather_vertex, i);
    graph.setInitialValue(gather_vertex["grad_out_shape_"], grad_output_shape);
    graph.setInitialValue(gather_vertex["grad_in_shape_"], fwd_in_shape);
  }

  prog.add(poplar::program::Execute(gather_grad_cs));

  return grad_input_tensor;
}

// File-local OpxCreator objects pairing each Opx class with the operator
// identifier it implements: FastGatherLastDimOpx for `fast_gather_last_dim`
// and FastGatherLastDimGradOpx for `fast_gather_last_dim_grad`. The objects
// are never referenced by name; constructing them is what matters (presumably
// this registers the Opx with popart's Opx manager — confirm against
// popart/popx/opxmanager.hpp).
namespace {
popart::popx::OpxCreator<FastGatherLastDimOpx>
    fast_gather_last_dim_opx(poptorch_custom_ops::fast_gather_last_dim);
popart::popx::OpxCreator<FastGatherLastDimGradOpx>
    fast_gather_last_dim_grad_opx(
        poptorch_custom_ops::fast_gather_last_dim_grad);
} // namespace

} // namespace poptorch_custom_ops
} // namespace poptorch


================================================
FILE: popart_compiler/source/custom_operations/FastGatherLastDim.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#ifndef GUARD_POPTORCH_STATICGATHER_HPP
#define GUARD_POPTORCH_STATICGATHER_HPP

#include <map>
#include <memory>
#include <string>
#include <vector>

#include <popart/names.hpp>
#include <popart/op.hpp>
#include <popart/opmanager.hpp>
#include <popart/popx/devicex.hpp>
#include <popart/popx/opx.hpp>

namespace poptorch {
namespace poptorch_custom_ops {

// popart op gathering values along the last dimension of a data tensor
// (input 0) using an index tensor (input 1). Shape validation and output
// shape inference are done in setup().
class FastGatherLastDimOp : public popart::Op {
public:
  // debug_str is stored and reported in the error messages raised by setup().
  FastGatherLastDimOp(const popart::OperatorIdentifier &opid_,
                      const popart::Op::Settings &settings_,
                      const std::string &debug_str);

  // Copyable (clone() relies on it) but not assignable.
  FastGatherLastDimOp(const FastGatherLastDimOp &) = default;
  FastGatherLastDimOp &operator=(const FastGatherLastDimOp &) = delete;
  ~FastGatherLastDimOp() override = default;

  // Returns a single FastGatherLastDimGradOp built from this op.
  std::vector<std::unique_ptr<Op>> getGradOps() final;
  std::unique_ptr<Op> clone() const final;
  // Validates the input shapes and sets the info of output 0.
  void setup() final;
  float getSubgraphValue() const final { return getHighSubgraphValue(); }
  // Gather axis; the constructor always sets it to -1 (last dimension).
  int64_t getAxis() const { return _axis; }
  // Shapes recorded by setup(); empty until setup() has run.
  popart::Shape getInShape() const { return _in_shape; }
  popart::Shape getOutShape() const { return _out_shape; }
  const std::string &getDebugStr() const { return _debug_str; }

private:
  int64_t _axis;            // gather axis, -1 meaning the last dimension
  popart::Shape _in_shape;  // shape of the data input, set by setup()
  popart::Shape _out_shape; // shape of the output, set by setup()
  std::string _debug_str;   // label used in error messages
};

// Poplar lowering of FastGatherLastDimOp. The constructor adds the forward
// codelets to the graph and grow() emits the gather program.
class FastGatherLastDimOpx : public popart::popx::Opx {
public:
  FastGatherLastDimOpx(popart::Op *, popart::popx::Devicex *);
  ~FastGatherLastDimOpx() override = default;

  // Reads inputs 0 (data) and 1 (indices) and sets output 0.
  void grow(poplar::program::Sequence &prog) const final;

private:
  // Creates the output tensor of shape fwd_out_shape, adds the gather
  // vertices to `graph` and the required programs to `prog`, and returns the
  // output tensor.
  static poplar::Tensor
  addGraphProg(poplar::Graph &graph, poplar::program::Sequence &prog,
               const poplar::Tensor &data_tensor,
               const poplar::Tensor &idx_tensor,
               const std::vector<std::size_t> &fwd_out_shape);
};

class FastGatherLastDimGradOp : public popart::Op {
public:
  explicit FastGatherLastDimGradOp(const FastGatherLastDimOp &fwdOp);

  std::unique_ptr<Op> clone() const final;
  virtual void setup() {
    this->outInfo(0) = {this->inInfo(0).dataType(), _fwd_in_shape};
  }

  /* Describes the relationship of the inputs of the grad op to the
     inputs/outputs of the non-grad op */
  virtual const std::vector<popart::GradInOutMapper> &gradInputInfo() const {
    static const std::vector<popart::GradInOutMapper> in_info = {
        // The input of grad op at index 0 is the gradient of the output at
        // index 0 of the non-grad op
        {0, 0, popart::GradOpInType::GradOut},

        // The input of grad op at index 1 is the input at index 1
        // of the non-grad op
        {1, 1, popart::GradOpInType::In}};
    return in_info;
  }

  /* Describes the relationship of the outputs of the grad op to the
     inputs/outputs of the non-grad op */
  virtual const std::map<int, int> &gradOutToNonGradIn() const {
    static const std::map<int, int> out_info = {
        // The output at index 0 is dLhs, i.e the gradient of the input at index
        // 0 of non-grad op
        {0, 0},
    };
    return out_info;
  }

  float getSubgraphValue() const final { return getLowSubgraphValue(); }
  const std::string &getDebugStr() const { return _debug_str; }
  popart::Shape getFwdInShape() const { return _fwd_in_shape; }

private:
  int64_t _axis;
  popart::Shape _fwd_in_shape;
  std::string _debug_str;
};

// Poplar lowering of FastGatherLastDimGradOp. The constructor adds the
// backward codelets to the graph and grow() emits the gradient program.
class FastGatherLastDimGradOpx : public popart::popx::Opx {
public:
  FastGatherLastDimGradOpx(popart::Op *op, popart::popx::Devicex *devicex);
  ~FastGatherLastDimGradOpx() override = default;

  // Reads inputs 0 (output gradient) and 1 (indices) and sets output 0.
  void grow(poplar::program::Sequence &prog) const final;

private:
  // Adds the gradient vertices to `graph` and the required programs to
  // `prog`. grad_input_tensor is passed by non-const reference, handed to the
  // vertices as their in/out field, and returned.
  static poplar::Tensor
  addGraphProg(poplar::Graph &graph, poplar::program::Sequence &prog,
               const poplar::Tensor &grad_output_tensor,
               poplar::Tensor &grad_input_tensor,
               const std::vector<std::size_t> &fwd_in_shape,
               const poplar::Tensor &idx_tensor);
};

} // namespace poptorch_custom_ops
} // namespace poptorch

#endif


================================================
FILE: popart_compiler/source/custom_operations/FastGatherLastDimBwdCodelets.inc.cpp
================================================
// Copyright (c) 2022, Graphcore Ltd, All rights reserved.
#ifdef __IPU__
#include <ipu_vector_math>
#else
  #error Not supported on IPU Model
#endif
#include <poplar/HalfFloat.hpp>
#include <poplar/Vertex.hpp>

using namespace poplar;

static constexpr auto ONE_PTR = poplar::VectorLayout::ONE_PTR;

// Traits mapping a (float type, index type) pair to the vector types the
// vertex uses to read several values or indices at once:
//   FVType - vector of FT values (float2 or half4),
//   IVType - vector of IT indices,
//   kZeroV - an all-zero FVType constant.
// The primary template is empty, so only the pairs specialised below can be
// used.
template<typename FT, typename IT>
struct FloatDef{
};

// float values with int indices: two of each per vector.
template<>
struct FloatDef<float, int>{
  typedef   float2    FVType;
  typedef   int2      IVType;

  static inline constexpr float2   kZeroV       = { 0.0f, 0.0f };
};

// float values with short indices: two of each per vector.
template<>
struct FloatDef<float, short>{
  typedef   float2    FVType;
  typedef   short2    IVType;
  static inline constexpr float2   kZeroV       = { 0.0f, 0.0f };
};

// half values with int indices: four values per FVType but only two indices
// per IVType, so the half code path reads two IVType per FVType.
template<>
struct FloatDef<half, int>{
  typedef   half4     FVType;
  typedef   int2      IVType;
  static inline constexpr half4   kZeroV       = { 0.0f, 0.0f, 0.0f, 0.0f };
};

// half values with short indices: four of each per vector.
template<>
struct FloatDef<half, short>{
  typedef   half4     FVType;
  typedef   short4    IVType;
  static inline constexpr half4   kZeroV       = { 0.0f, 0.0f, 0.0f, 0.0f };
};

// Traits giving the type of the vertex's in/out gradient field for a float
// type: a Vector of InOut rows, each row a ONE_PTR vector declared with
// alignment argument 8. The primary template is empty; only float and half
// are specialised.
template<typename FT>
struct OutputDef{
};

template<>
struct OutputDef<float>{
  typedef Vector<InOut<Vector<float, ONE_PTR, 8>>>  OutputType;
};

template<>
struct OutputDef<half>{
  typedef Vector<InOut<Vector<half, ONE_PTR, 8>>>   OutputType;
};


template <class FloatType, typename IdxType> class FastGatherGradVertex : public Vertex {
public:
  FastGatherGradVertex() ;

  Vector<Input<Vector<FloatType, ONE_PTR, 8>>>  grad_out_;
  Vector<Input<Vector<IdxType, ONE_PTR, 8>>>    idx_;
  //Vector<InOut<Vector<FloatType, ONE_PTR, 8>>>  grad_in_;
  typename OutputDef<FloatType>::OutputType         grad_in_;

  const Vector<int>                             grad_out_shape_;
  const Vector<int>                             grad_in_shape_;

  template<typename FT, typename IT, typename std::enable_if<std::is_same<FT, float>::value, void>::type* = nullptr>
  static void run(Vector<Input<Vector<FT, ONE_PTR, 8>>> const&       grad_out,
                  Vector<Input<Vector<IdxType, ONE_PTR, 8>>> const&  idx,
                  typename OutputDef<FT>::OutputType&           grad_in,
                  Vector<int> const&                                 grad_out_shape,
                  Vector<int> const&                                 grad_in_shape)
  {
    int  c                      = grad_out.size();
    int  grad_out_dim_size      = grad_out_shape[grad_out_shape.size() - 1];
    int  grad_out_dim_size_half = grad_out_dim_size >> 1;
    int  grad_out_dim_size2     = grad_out_dim_size_half << 1;
    int  grad_in_dim_size       = grad_in_shape[grad_out_shape.size() - 1];
    int  grad_in_dim_size_half  = grad_in_dim_size >> 1;
    int  grad_in_dim_size2      = grad_in_dim_size_half << 1;
    for(int i = 0 ; i < c ; i ++)
    {
      typename FloatDef<FT, IT>::FVType const*  cur_grad_out_ptr2 = (typename FloatDef<FT, IT>::FVType*)(&(grad_out[i][0]));
      typename FloatDef<FT, IT>::IVType const*  cur_idx_ptr2      = (typename FloatDef<FT, IT>::IVType*)(&(idx[i][0]));
      typename FloatDef<FT, IT>::FVType*        cur_grad_in_ptr2  = (typename FloatDef<FT, IT>::FVType*)(&(grad_in[i][0]));

      FT const*   cur_grad_out_ptr  = (FT*)cur_grad_out_ptr2;
      IT const*   cur_idx_ptr       = (IT const*)cur_idx_ptr2;
      FT*         cur_grad_in_ptr   = (FT*)cur_grad_in_ptr2;
      int         j                 = 0;
      for(j = 0 ; j < grad_out_dim_size_half ; j ++)
      {
        typename FloatDef<FT, IT>::FVType  cur_grad_out = cur_grad_out_ptr2[j];
        typename FloatDef<FT, IT>::IVType  idx          = cur_idx_ptr2[j];
        cur_grad_in_ptr[idx[0]] += cur_grad_out[0];
        cur_grad_in_ptr[idx[1]] += cur_grad_out[1];
      }
      if(0 != (grad_out_dim_size & 1))
      {
        FT   cur_grad_out     = cur_grad_out_ptr[grad_out_dim_size2];
        IT   idx              = cur_idx_ptr[grad_out_dim_size2];
        cur_grad_in_ptr[idx] += cur_grad_out;
      }
    }
  };

  template<typename FT, typename IT, typename std::enable_if<std::is_same<FT, half>::value, void>::type* = nullptr>
  static void run(Vector<Input<Vector<FT, ONE_PTR, 8>>> const&  grad_out,
                  Vector<Input<Vector<IT, ONE_PTR, 8>>> const&  idx,
                  typename OutputDef<FT>::OutputType&      grad_in,
                  Vector<int> const&                            grad_out_shape,
                  Vector<int> const&                            grad_in_shape)
  {
    int  c                      = grad_out.size();
    int  grad_out_dim_size      = grad_out_shape[grad_out_shape.size() - 1];
    int  grad_out_dim_size_q    = grad_out_dim_size >> 2;
    int  grad_out_dim_size4     = grad_out_dim_size_q << 2;
    int  grad_in_dim_size       = grad_in_shape[grad_out_shape.size() - 1];
    int  grad_in_dim_size_q     = grad_in_dim_size >> 2;
    int  grad_in_dim_size4      = grad_out_dim_size_q << 2;
    for(int i = 0 ; i < c ; i ++)
    {
      typename FloatDef<FT, IT>::FVType const*   cur_grad_out_ptr4 = (typename FloatDef<FT, IT>::FVType*)(&(grad_out[i][0]));
      typename FloatDef<FT, IT>::IVType const*   cur_idx_ptr2      = (typename FloatDef<FT, IT>::IVType const*)(&(idx[i][0]));
      typename FloatDef<FT, IT>::FVType*         cur_grad_in_ptr4  = (typename FloatDef<FT, IT>::FVType*)(&(grad_in[i][0]));

      FT const*   cur_grad_out_ptr  = (FT*)cur_grad_out_ptr4;
      IT const*   cur_idx_ptr       = (IT const*)cur_idx_ptr2;
      FT*         cur_grad_in_ptr   = (FT*)cur_grad_in_ptr4;
      int          j                = 0;
      for(j = 0 ; j < grad_out_dim_size_q ; j ++)
      {
        typename FloatDef<FT, IT>::FVType   cur_grad_out = cur_grad_out_ptr4[j];
        typename FloatDef<FT, IT>::IVType   idx0         = cur_idx_ptr2[2 * j];
        typename FloatDef<FT, IT>::IVType   idx1         = cur_idx_ptr2[2 * j + 1];
        cur_grad_in_ptr[idx0[0]] += cur_grad_out[0];
        cur_grad_in_ptr[idx0[1]] += cur_grad_out[1];
        cur_grad_in_ptr[idx1[0]] += cur_grad_out[2];
        cur_grad_in_ptr[idx1[1]] += cur_grad_out[3];
      }
      for(j = grad_out_dim_size4 ; j < grad_out_dim_size ; j ++)
      {
        FT    cur_grad_out  = cur_grad_out_ptr[j];
        IT    idx           = cur_idx_ptr[j];
        cur_grad_in_ptr[idx] += cur_grad_out;
      }
    }
  }

  bool compute() {
    run<FloatType, IdxType>(grad_out_, idx_, grad_in_, grad_out_shape_, grad_in_shape_);
    return true;
  }
};

// Explicit instantiations of the gradient vertex. Only `int` indices are
// instantiated here, for float and half values.
template class FastGatherGradVertex<float, int>;
template class FastGatherGradVertex<half, int>;


================================================
FILE: popart_compiler/source/custom_operations/FastGatherLastDimFwdCodelets.inc.cpp
================================================
// Copyright (c) 2022, Graphcore Ltd, All rights reserved.
#ifdef __IPU__
#include <ipu_vector_math>
#else
  #error Not supported on IPU Model
#endif
#include <poplar/HalfFloat.hpp>
#include <poplar/Vertex.hpp>

using namespace poplar;

static constexpr auto ONE_PTR = poplar::VectorLayout::ONE_PTR;

// Traits mapping a (float type, index type) pair to the vector types the
// vertex uses to read several values or indices at once:
//   FVType - vector of FT values (float2 or half4),
//   IVType - vector of IT indices,
//   kZeroV - an all-zero FVType constant.
// The primary template is empty, so only the pairs specialised below can be
// used.
template<typename FT, typename IT>
struct FloatDef{
};

// float values with int indices: two of each per vector.
template<>
struct FloatDef<float, int>{
  typedef   float2    FVType;
  typedef   int2      IVType;

  static inline constexpr float2   kZeroV       = { 0.0f, 0.0f };
};

// float values with short indices: two of each per vector.
template<>
struct FloatDef<float, short>{
  typedef   float2    FVType;
  typedef   short2    IVType;
  static inline constexpr float2   kZeroV       = { 0.0f, 0.0f };
};

// half values with int indices: four values per FVType but only two indices
// per IVType, so the half code path reads two IVType per FVType.
template<>
struct FloatDef<half, int>{
  typedef   half4     FVType;
  typedef   int2      IVType;
  static inline constexpr half4   kZeroV       = { 0.0f, 0.0f, 0.0f, 0.0f };
};

// half values with short indices: four of each per vector.
template<>
struct FloatDef<half, short>{
  typedef   half4     FVType;
  typedef   short4    IVType;
  static inline constexpr half4   kZeroV       = { 0.0f, 0.0f, 0.0f, 0.0f };
};

template <typename FloatType, typename IdxType> class FastGatherVertex : public Vertex {
public:
  FastGatherVertex() ;

  Vector<Input<Vector<FloatType, ONE_PTR, 8>>>   data_;
  Vector<Input<Vector<IdxType, ONE_PTR, 8>>>     idx_;
  Vector<Output<Vector<FloatType, ONE_PTR, 8>>>  result_;

  const Vector<int>                              dst_shape_;

  template<typename FT, typename IT, typename std::enable_if<std::is_same<FT, float>::value, void>::type* = nullptr>
  static void run(Vector<Input<Vector<FT, ONE_PTR, 8>>> const&   data,
                  Vector<Input<Vector<IT, ONE_PTR, 8>>> const&   idx,
                  Vector<Output<Vector<FT, ONE_PTR, 8>>>&        result,
                  Vector<int> const&                             dst_shape)
  {
    int           c                = data.size();
    int           out_dim_size     = dst_shape[dst_shape.size() - 1];
    int           out_dim_size_v_r = out_dim_size >> 1;
    int           out_dim_size_v   = out_dim_size_v_r << 1;
    for(int i = 0 ; i < c ; i ++)
    {
      typename FloatDef<FT, IT>::FVType const*  cur_data_ptrv  = (typename FloatDef<FT, IT>::FVType const*)(&(data[i][0]));
      typename FloatDef<FT, IT>::IVType const*  cur_idx_ptrv   = (typename FloatDef<FT, IT>::IVType const*)(&(idx[i][0]));
      typename FloatDef<FT, IT>::FVType*        cur_out_ptrv   = (typename FloatDef<FT, IT>::FVType*)(&(result[i][0]));
      float const*   cur_data_ptr   = (float const*)cur_data_ptrv;
      int const*     cur_idx_ptr    = (int const*)cur_idx_ptrv;
      float*         cur_out_ptr    = (float*)cur_out_ptrv;
      int            j              = 0;
      for(j = 0 ; j < out_dim_size_v_r ; j ++)
      {
        typename FloatDef<FT, IT>::IVType  idx      = cur_idx_ptrv[j];
        typename FloatDef<FT, IT>::FVType  cur_val  = { cur_data_ptr[idx[0]], cur_data_ptr[idx[1]] };
        cur_out_ptrv[j]  = cur_val;
      }
      if(0 != (out_dim_size & 1))
      {
        int idx = cur_idx_ptr[out_dim_size_v];
        cur_out_ptr[out_dim_size_v] = cur_data_ptr[idx];
      }
    }
  };

  template<typename FT, typename IT, typename std::enable_if<std::is_same<FT, half>::value, void>::type* = nullptr>
  static void run(Vector<Input<Vector<FT, ONE_PTR, 8>>> const&   data,
                  Vector<Input<Vector<IT, ONE_PTR, 8>>> const&   idx,
                  Vector<Output<Vector<FT, ONE_PTR, 8>>>&        result,
                  Vector<int> const&                             dst_shape)
  {
    int           c                = data.size();
    int           out_dim_size     = dst_shape[dst_shape.size() - 1];
    int           out_dim_size_v_r = out_dim_size >> 2;
    int           out_dim_size_v   = out_dim_size_v_r << 2;
    for(int i = 0 ; i < c ; i ++)
    {
      typename FloatDef<FT, IT>::FVType const*  cur_data_ptrv = (typename FloatDef<FT, IT>::FVType const*)(&(data[i][0]));
      typename FloatDef<FT, IT>::IVType const*  cur_idx_ptrv  = (typename FloatDef<FT, IT>::IVType const*)(&(idx[i][0]));
      typename FloatDef<FT, IT>::FVType*        cur_out_ptrv  = (typename FloatDef<FT, IT>::FVType*)(&(result[i][0]));
      FT const*                                 cur_data_ptr  = (FT const*)cur_data_ptrv;
      IT const*                                 cur_idx_ptr   = (IT const*)cur_idx_ptrv;
      FT*                                       cur_out_ptr   = (FT*)cur_out_ptrv;
      int            j              = 0;
      for(j = 0 ; j < out_dim_size_v_r ; j ++)
      {
        typename FloatDef<FT, IT>::IVType   idx0      = cur_idx_ptrv[2 * j];
        typename FloatDef<FT, IT>::IVType   idx1      = cur_idx_ptrv[2 * j + 1];
        typename FloatDef<FT, IT>::FVType   cur_val   = { cur_data_ptr[idx0[0]],
                                                          cur_data_ptr[idx0[1]],
                                                          cur_data_ptr[idx1[0]],
                                                          cur_data_ptr[idx1[1]] };
        cur_out_ptrv[j]   = cur_val;
      }
      for(j = out_dim_size_v ; j < out_dim_size ; j ++)
      {
        IT idx = cur_idx_ptr[j];
        cur_out_ptr[j] = cur_data_ptr[idx];
      }
    }
  };

  bool compute() {
    run<FloatType, IdxType>(data_, idx_, result_, dst_shape_);
    return true;
  }
};

// Explicit instantiations of the forward vertex. Only `int` indices are
// instantiated here, for float and half values.
template class FastGatherVertex<float, int>;
template class FastGatherVertex<half, int>;


================================================
FILE: popart_compiler/source/custom_operations/HostOp.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.

#include <memory>
#include <popart/builder.hpp>
#include <popart/devicemanager.hpp>
#include <popart/iarray.hpp>

#include <popart/ir.hpp>

#include <popart/logging.hpp>
#include <popart/op.hpp>
#include <popart/opmanager.hpp>
#include <popart/opserialiser.hpp>
#include <popart/optimizer.hpp>
#include <popart/patterns/pattern.hpp>
#include <popart/popx/opx.hpp>
#include <popart/popx/opxmanager.hpp>
#include <popart/session.hpp>
#include <popart/shapeinference.hpp>
#include <popart/tensordata.hpp>
#include <popart/tensorinfo.hpp>
#include <popart/tensornames.hpp>

#include <popart/names.hpp>
#include <popart/operators.hpp>

#include "popart_compiler/CompilerImpl.hpp"
#include "popart_compiler/CustomOps.hpp"

namespace poptorch {
namespace poptorch_custom_ops {

// Name of the integer op attribute through which the pointer to the
// callback metadata is passed to the host op.
const char host_op_metadata_attr[] = "func_info";

} // namespace poptorch_custom_ops

/*
 * A popart custom operation to handle Host operations. Takes a callback and
 * sets up the IPU->CPU communication for the needed tensors.
 */
namespace popart_compiler {

namespace {

// Builds the popart TensorInfo (element type and shape) of output `i` of a
// host callback from the types and shapes recorded in its metadata.
popart::TensorInfo shapeInferOutput(detail::CallbackInternalMetadata *func_info,
                                    std::uint32_t i) {
  // Element type, translated from the poptorch type to the popart one.
  const popart::DataType type =
      popartTypeFromPoptorch(func_info->output_types[i]);

  // The metadata stores dimensions as std::size_t; popart::Shape has its own
  // element type, so the dimensions are converted one by one while copying.
  const std::vector<std::size_t> &dims = func_info->output_shapes[i];
  popart::Shape popart_dims(dims.begin(), dims.end());

  return popart::TensorInfo{type, popart_dims};
}

// Recovers the callback metadata pointer from the op attributes. The pointer
// is stored as an integer attribute named by host_op_metadata_attr; it is
// read back as a 64-bit integer, converted to std::intptr_t and then
// reinterpreted as a CallbackInternalMetadata pointer. Both intermediate
// values are logged at trace level.
// NOTE(review): nothing here checks that the integer is a valid pointer; the
// result is only meaningful if the attribute was written from a live
// CallbackInternalMetadata object — confirm against the code that sets it.
detail::CallbackInternalMetadata *
getMetadataFromAttributeMap(const popart::Attributes &attrs) {
  // Pointer smuggled in via an integer.
  std::int64_t as_int = attrs.getAttribute<popart::Attributes::Int>(
      poptorch_custom_ops::host_op_metadata_attr);

  logging::trace("Pointer retrieved by CPU op {}", as_int);

  std::intptr_t as_ptr = static_cast<std::intptr_t>(as_int);

  logging::trace("Casted from {} to {}", as_int, as_ptr);

  // Cast to the correct type.
  // NOLINTNEXTLINE performance-no-int-to-ptr
  return reinterpret_cast<detail::CallbackInternalMetadata *>(as_ptr);
}

} // namespace
/*
  Popart custom op which uses the metadata gathered by the compiler to setup
  poplar tensors and copy into/from them from/to host.
*/
class HostOp : public popart::Op {
public:
  // `info` is the callback metadata describing the input/output types and
  // shapes of the host function; only the pointer is stored, not a copy.
  HostOp(const popart::OperatorIdentifier &_opid,
         detail::CallbackInternalMetadata *info,
         const popart::Op::Settings &settings_)
      : popart::Op(_opid, settings_), func_info(info) {}

  // Declare the type and shape of every output tensor to popart, one entry
  // per output type listed in the metadata.
  void setup() override {
    const std::size_t num_outputs = func_info->output_types.size();
    for (std::uint32_t out_idx = 0; out_idx < num_outputs; ++out_idx) {
      outInfo(out_idx) = shapeInferOutput(func_info, out_idx);
    }
  }

  // Copy-construct a new HostOp; the clone shares the same metadata pointer.
  std::unique_ptr<Op> clone() const final {
    return std::make_unique<HostOp>(*this);
  }

  float getSubgraphValue() const final { return getLowSubgraphValue(); }

  // Callback metadata; read by HostOpx when lowering the op.
  detail::CallbackInternalMetadata *func_info;
};

// Poplar lowering of HostOp: registers a poplar host function named after the
// callback handle and emits a program step that calls it.
class HostOpx : public popart::popx::Opx {
public:
  HostOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp<HostOp>(op, poptorch_custom_ops::host_op);

    // Keep the metadata pointer owned by the op we are lowering.
    func_info = dynamic_cast<HostOp *>(op)->func_info;
  }

  void grow(poplar::program::Sequence &sequence) const override {
    poplar::Graph &graph = this->graph();

    // Input/output counts come from the callback metadata.
    const std::uint32_t num_inputs = func_info->input_types.size();
    const std::uint32_t num_outputs = func_info->output_types.size();

    // Inputs: reuse the poplar tensors popart has already created and
    // describe each one (element type + element count) to the host function.
    std::vector<poplar::Tensor> inputs;
    std::vector<poplar::Graph::HostFunctionArgument> input_args;
    inputs.reserve(num_inputs);
    input_args.reserve(num_inputs);
    for (std::uint32_t idx = 0; idx < num_inputs; ++idx) {
      poplar::Tensor in_tensor = getInTensor(idx);
      inputs.push_back(in_tensor);
      input_args.emplace_back(in_tensor.elementType(),
                              in_tensor.numElements());
    }

    // Outputs: allocate a linearly mapped variable per output using the
    // type and shape recorded in the metadata, then register it with popart.
    std::vector<poplar::Tensor> outputs;
    std::vector<poplar::Graph::HostFunctionArgument> output_args;
    outputs.reserve(num_outputs);
    output_args.reserve(num_outputs);
    for (std::uint32_t idx = 0; idx < num_outputs; ++idx) {
      const poplar::Type out_type =
          poplarTypeFromPoptorch(func_info->output_types[idx]);
      const std::vector<std::size_t> &out_shape =
          func_info->output_shapes[idx];

      std::string var_name =
          func_info->handle + "::out" + std::to_string(idx);
      poplar::Tensor out_tensor =
          graph.addVariable(out_type, out_shape,
                            poplar::VariableMappingMethod::LINEAR,
                            std::move(var_name));

      outputs.push_back(out_tensor);
      output_args.emplace_back(out_tensor.elementType(),
                               out_tensor.numElements());

      // Tell popart this is the output.
      setOutTensor(idx, out_tensor);
    }

    // Register the host function under the callback handle and call it.
    poplar::HostFunction host_fn =
        graph.addHostFunction(func_info->handle, input_args, output_args);
    sequence.add(poplar::program::Call(host_fn, inputs, outputs));
  }

  // Callback metadata taken from the HostOp in the constructor.
  detail::CallbackInternalMetadata *func_info;
};

} // namespace popart_compiler
} // namespace poptorch

// Registers the factory popart uses to build a HostOp from an op's attributes.
static popart::OpCreator<poptorch::popart_compiler::HostOp> host_op_creator(
    {{poptorch::poptorch_custom_ops::host_op, {}}},
    [](const popart::OpCreatorInfo &info) -> std::unique_ptr<popart::Op> {
      using poptorch::popart_compiler::HostOp;

      // The callback metadata pointer was stored in the attribute map when
      // the op was created; recover it here.
      auto *metadata = poptorch::popart_compiler::getMetadataFromAttributeMap(
          info.attributes);

      return std::make_unique<HostOp>(info.opid, metadata, info.settings);
    },
    true);

// Registers HostOpx as the poplar lowering for the host_op operator id.
static popart::popx::OpxCreator<poptorch::popart_compiler::HostOpx>
    host_opx_creator(poptorch::poptorch_custom_ops::host_op);

// Registers the shape inference function for host_op: every output's
// type/shape is read from the callback metadata.
static popart::RegisterShapeInferenceFunction host_op_shape_inference(
    poptorch::poptorch_custom_ops::host_op,
    [](popart::ShapeInferenceContext &ctx) {
      // Recover the metadata pointer stored in the op's attribute map.
      auto *metadata = poptorch::popart_compiler::getMetadataFromAttributeMap(
          ctx.getAttributes());

      // One TensorInfo per output type listed in the metadata.
      const std::size_t num_outputs = metadata->output_types.size();
      for (std::uint32_t out_idx = 0; out_idx < num_outputs; ++out_idx) {
        ctx.outInfo(out_idx) =
            poptorch::popart_compiler::shapeInferOutput(metadata, out_idx);
      }
    });


================================================
FILE: popart_compiler/source/custom_operations/TorchSoftplus.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.

#include "TorchSoftplus.hpp"
#include "popart_compiler/CustomOps.hpp"

#include <popart/opmanager.hpp>
#include <popart/opserialiser.hpp>
#include <popart/popx/devicex.hpp>
#include <popart/popx/op/softplusx.hpp>
#include <popart/popx/opxmanager.hpp>
#include <poplar/Graph.hpp>
#include <popnn/NonLinearity.hpp>
#include <popops/ElementWise.hpp>

namespace poptorch {
namespace poptorch_custom_ops {
// Construct the out-of-place softplus op, storing the `beta` and `threshold`
// parameters used by the forward and gradient lowerings.
TorchSoftplusOp::TorchSoftplusOp(const popart::OperatorIdentifier &opid_,
                                 float beta, float threshold,
                                 const popart::Op::Settings &opSettings)
    : popart::ElementWiseUnaryOp(opid_, opSettings), _beta(beta),
      _threshold(threshold) {}

// Return a copy-constructed duplicate of this op.
std::unique_ptr<popart::Op> TorchSoftplusOp::clone() const {
  return std::make_unique<TorchSoftplusOp>(*this);
}

// Return the gradient ops for this forward op: a single TorchSoftplusGradOp
// built from this op.
std::vector<std::unique_ptr<popart::Op>> TorchSoftplusOp::getGradOps() {
  std::vector<std::unique_ptr<popart::Op>> grad_ops;
  grad_ops.reserve(1);
  grad_ops.push_back(std::make_unique<TorchSoftplusGradOp>(*this));
  return grad_ops;
}

// Advertise the in-place variant of this op with a default priority of 10.
std::vector<std::tuple<popart::OperatorIdentifier, float>>
TorchSoftplusOp::inplacePriorityDefault() const {
  // see T6768: choosing default inplace priorities
  std::vector<std::tuple<popart::OperatorIdentifier, float>> priorities;
  priorities.emplace_back(poptorch_custom_ops::torch_softplus_inplace, 10.0f);
  return priorities;
}

// Build the in-place variant for `operator_id`. Only the torch_softplus_inplace
// identifier is handled here; anything else is delegated to popart::Op.
std::unique_ptr<popart::Op> TorchSoftplusOp::getInplaceVariant(
    const popart::OperatorIdentifier &operator_id) const {
  const bool is_softplus_inplace =
      operator_id == poptorch_custom_ops::torch_softplus_inplace;
  if (!is_softplus_inplace) {
    return popart::Op::getInplaceVariant(operator_id);
  }
  return std::make_unique<TorchSoftplusInplaceOp>(*this);
}

// Serialise the base popart::Op attributes, then append "beta" and
// "threshold" so both parameters are part of the op's serialised attributes.
void TorchSoftplusOp::appendOutlineAttributes(
    popart::OpSerialiserBase &os) const {
  popart::Op::appendOutlineAttributes(os);

  const float beta_value = _beta;
  const float threshold_value = _threshold;
  os.appendAttribute("beta", beta_value);
  os.appendAttribute("threshold", threshold_value);
}

// Build the in-place op from its out-of-place counterpart, reusing its
// settings and copying `beta` / `threshold`.
TorchSoftplusInplaceOp::TorchSoftplusInplaceOp(const TorchSoftplusOp &op)
    : popart::ElementWiseInplaceUnaryOp(
          poptorch_custom_ops::torch_softplus_inplace, op.getSettings()),
      _beta(op.beta()), _threshold(op.threshold()) {}

// Return a copy-constructed duplicate of this in-place op.
std::unique_ptr<popart::Op> TorchSoftplusInplaceOp::clone() const {
  return std::make_unique<TorchSoftplusInplaceOp>(*this);
}

// Serialise the base popart::Op attributes, then append "beta" and
// "threshold" so both parameters are part of the op's serialised attributes.
void TorchSoftplusInplaceOp::appendOutlineAttributes(
    popart::OpSerialiserBase &os) const {
  popart::Op::appendOutlineAttributes(os);

  const float beta_value = _beta;
  const float threshold_value = _threshold;
  os.appendAttribute("beta", beta_value);
  os.appendAttribute("threshold", threshold_value);
}

// Build the gradient op from the forward op, copying `beta` / `threshold` so
// the backward pass uses the same parameters as the forward pass.
TorchSoftplusGradOp::TorchSoftplusGradOp(const TorchSoftplusOp &fwd_op)
    : popart::ElementWiseNonLinearUnaryGradOp(
          poptorch_custom_ops::torch_softplus_grad, fwd_op),
      _beta(fwd_op.beta()), _threshold(fwd_op.threshold()) {}

// Return a copy-constructed duplicate of this gradient op.
std::unique_ptr<popart::Op> TorchSoftplusGradOp::clone() const {
  return std::make_unique<TorchSoftplusGradOp>(*this);
}

// Serialise the base popart::Op attributes, then append "beta" and
// "threshold" so both parameters are part of the op's serialised attributes.
void TorchSoftplusGradOp::appendOutlineAttributes(
    popart::OpSerialiserBase &os) const {
  popart::Op::appendOutlineAttributes(os);

  const float beta_value = _beta;
  const float threshold_value = _threshold;
  os.appendAttribute("beta", beta_value);
  os.appendAttribute("threshold", threshold_value);
}

namespace {
// Element types declared for both the input and the output of the op.
popart::OpDefinition::DataTypes dtypes = {
    popart::DataType::UINT8,   popart::DataType::UINT16,
    popart::DataType::UINT32,  popart::DataType::UINT64,
    popart::DataType::INT8,    popart::DataType::INT16,
    popart::DataType::INT32,   popart::DataType::INT64,
    popart::DataType::FLOAT16, popart::DataType::FLOAT};

// Op definition: one input, one output, and the "beta" / "threshold"
// attributes.
popart::OpDefinition
    softplus_def({popart::OpDefinition::Inputs({{"input", dtypes}}),
                  popart::OpDefinition::Outputs({{"output", dtypes}}),
                  popart::OpDefinition::Attributes({{"beta", {"*"}},
                                                    {"threshold", {"*"}}})});

// Factory registered with popart to build a TorchSoftplusOp from an op's
// attribute map.
popart::OpCreator<TorchSoftplusOp> softplus_creator(
    popart::OpDefinitions({{poptorch_custom_ops::torch_softplus,
                            softplus_def}}),
    [](const popart::OpCreatorInfo &info) {
      // When an attribute is absent, fall back to the torch.nn.Softplus
      // defaults (beta = 1, threshold = 20). A threshold fallback of 1 would
      // make the op return x unchanged for every beta * x > 1, where softplus
      // is still noticeably different from the identity.
      float const beta =
          info.attributes.getAttribute<popart::Attributes::Float>("beta", 1.0);
      float const threshold =
          info.attributes.getAttribute<popart::Attributes::Float>("threshold",
                                                                  20.0);
      return std::unique_ptr<popart::Op>(
          new TorchSoftplusOp(info.opid, beta, threshold, info.settings));
    },
    true);

} // namespace

namespace pe = popops::expr;

// Build the element-wise compute object for a softplus op of concrete type T
// (T must expose beta() and threshold()). Throws popart::error when `op` is
// not actually a T.
template <class T>
std::unique_ptr<popart::popx::EwuComputex> create(popart::Op *op) {
  if (auto *softplus_op = dynamic_cast<T *>(op)) {
    return TorchSoftplusComputex::get(softplus_op->beta(),
                                      softplus_op->threshold());
  }

  throw popart::error("Invalid torch softplus operator.");
}

// Out-of-place lowering: hands the base class a TorchSoftplusComputex built
// from the op's beta/threshold, then verifies the op type/identifier.
TorchSoftplusOpx::TorchSoftplusOpx(popart::Op *op,
                                   popart::popx::Devicex *devicex)
    : ElementWiseUnaryOutplaceOpx(op, devicex, create<TorchSoftplusOp>(op)) {
  verifyOp<TorchSoftplusOp>(op, {poptorch_custom_ops::torch_softplus});
}

// Apply softplus to `tensor` in place by building a single popops expression
// and running it with popops::mapInPlace. `prefix` is unused.
void TorchSoftplusComputex::inplace(poplar::program::Sequence &prog,
                                    poplar::Graph &graph,
                                    const poplar::Tensor &tensor,
                                    const poplar::DebugNameAndId &dnai,
                                    const std::string &prefix) const {
  // Torch Softplus definition:
  //  1/beta * log[1 + exp(beta * x)] for beta * x <= threshold
  //                                x for beta * x > threshold
  //
  // To avoid overflow when evaluating the exp, we use the following equivalent
  // formula for softplus:
  // 1/beta * log[1 + exp(-abs(beta * x))] + max(x, 0)
  (void)prefix; // unused input parameter
  // Each step of the expression is kept alive in `exprs`; later steps are
  // built from `*exprs.back()`, so the order of the push_backs matters.
  using ExprPtr = std::unique_ptr<pe::Expr>;
  std::vector<ExprPtr> exprs;
  exprs.push_back(std::make_unique<pe::PlaceHolder>(pe::_1));

  // beta * x (the multiplication is skipped when beta == 1).
  if (_beta != 1.0f) {
    exprs.push_back(std::make_unique<pe::Mul>(pe::Const(_beta), *exprs.back()));
  }

  // Remember beta * x: it is reused for the threshold comparison below.
  auto &bx = *exprs.back();

  // log1p(exp(-|beta * x|))
  exprs.push_back(std::make_unique<pe::Exp>(-pe::Abs(*exprs.back())));
  exprs.push_back(std::make_unique<pe::Log1p>(*exprs.back()));

  // Scale by 1/beta (the division is skipped when beta == 1).
  if (_beta != 1.0f) {
    exprs.push_back(
        std::make_unique<pe::Divide>(*exprs.back(), pe::Const(_beta)));
  }

  // 1/beta * log1p(exp(-|beta * x|)) + max(x, 0)
  exprs.push_back(std::make_unique<pe::Add>(*exprs.back(),
                                            pe::Max(pe::_1, pe::Const(0.0f))));

  // beta * x <= threshold ? 1/beta * log1p(exp(-|beta * x|)) + max(x, 0) : x
  exprs.push_back(std::make_unique<pe::Select>(*exprs.back(), pe::_1,
                                               bx <= pe::Const(_threshold)));

  popops::mapInPlace(graph, *exprs.back(), {tensor}, prog,
                     {dnai, "torch_softplus"});
}

// Factory: create a TorchSoftplusComputex for the given beta and threshold,
// returned through the EwuComputex base pointer.
std::unique_ptr<popart::popx::EwuComputex>
TorchSoftplusComputex::get(float beta, float threshold) {
  return std::make_unique<TorchSoftplusComputex>(beta, threshold);
}

// In-place lowering: hands the base class a TorchSoftplusComputex built from
// the in-place op's beta/threshold, then verifies the op type/identifier.
TorchSoftplusInplaceOpx::TorchSoftplusInplaceOpx(popart::Op *op,
                                                 popart::popx::Devicex *devicex)
    : ElementWiseUnaryInplaceOpx(op, devicex,
                                 create<TorchSoftplusInplaceOp>(op)) {
  verifyOp<TorchSoftplusInplaceOp>(op,
                                   poptorch_custom_ops::torch_softplus_inplace);
}

// Gradient lowering: verifies the op type/identifier, then caches the
// gradient op's beta and threshold for use in grow().
TorchSoftplusGradOpx::TorchSoftplusGradOpx(popart::Op *op,
                                           popart::popx::Devicex *devicex)
    : Opx(op, devicex), _beta(), _threshold() {
  verifyOp<TorchSoftplusGradOp>(op, poptorch_custom_ops::torch_softplus_grad);

  const auto &softplus_grad = getOp<TorchSoftplusGradOp>();
  _beta = softplus_grad.beta();
  _threshold = softplus_grad.threshold();
}

// Emit the softplus gradient: builds one popops expression over the incoming
// gradient (_1) and the forward input (_2), evaluates it with popops::map and
// registers the result as the op's output.
void TorchSoftplusGradOpx::grow(poplar::program::Sequence &prog) const {
  // The derivative of the softplus activation function is:
  //
  // exp(beta*x)/(exp(beta*x) + 1) = 1/(exp(-beta*x) + 1) = sigmoid(beta*x)
  //
  // To match the Torch definition:
  //
  // grad_out = grad_in * sigmoid(beta*x) for beta * x <= threshold
  //            grad_in                   for beta * x > threshold
  const auto &grad_in = getInTensor(TorchSoftplusGradOp::getGradInIndex());
  const auto &fwd_input = getInTensor(TorchSoftplusGradOp::getFwdArgInIndex());

  // Each step of the expression is kept alive in `exprs`; later steps are
  // built from `*exprs.back()`, so the order of the push_backs matters.
  using ExprPtr = std::unique_ptr<pe::Expr>;
  std::vector<ExprPtr> exprs;
  // _2 is the forward input x (second tensor passed to popops::map below).
  exprs.push_back(std::make_unique<pe::PlaceHolder>(pe::_2));

  // beta * x (the multiplication is skipped when beta == 1).
  if (_beta != 1.0f) {
    exprs.push_back(std::make_unique<pe::Mul>(pe::Const(_beta), *exprs.back()));
  }

  // Remember beta * x: used by both the sigmoid and the threshold comparison.
  auto &bx = *exprs.back();

  // grad_in * sigmoid(beta*x)
  exprs.push_back(std::make_unique<pe::Mul>(pe::_1, pe::Sigmoid(bx)));

  // beta * x <= threshold ? grad_in * sigmoid(beta*x) : grad_in
  exprs.push_back(std::make_unique<pe::Select>(*exprs.back(), pe::_1,
                                               bx <= pe::Const(_threshold)));

  auto output = popops::map(graph(), *exprs.back(), {grad_in, fwd_input}, prog,
                            debugContext("torch_softplus_grad"));

  setOutTensor(TorchSoftplusGradOp::getOutIndex(), output);
}

namespace {
// Register the poplar lowerings for the forward, in-place and gradient
// softplus operator identifiers.
popart::popx::OpxCreator<TorchSoftplusOpx>
    softplus_opx(poptorch_custom_ops::torch_softplus);
popart::popx::OpxCreator<TorchSoftplusInplaceOpx>
    softplus_inplace_opx(poptorch_custom_ops::torch_softplus_inplace);
popart::popx::OpxCreator<TorchSoftplusGradOpx>
    softplus_grad_opx(poptorch_custom_ops::torch_softplus_grad);
} // namespace

} // namespace poptorch_custom_ops
} // namespace poptorch


================================================
FILE: popart_compiler/source/custom_operations/TorchSoftplus.hpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#ifndef GUARD_POPTORCH_SOFTPLUS_HPP
#define GUARD_POPTORCH_SOFTPLUS_HPP

#include <memory>
#include <string>
#include <tuple>
#include <vector>

#include <popart/names.hpp>
#include <popart/op/elementwise.hpp>
#include <popart/popx/op/elementwisex.hpp>

namespace poptorch {
namespace poptorch_custom_ops {

// Out-of-place softplus op carrying the Torch `beta` and `threshold`
// parameters.
class TorchSoftplusOp : public popart::ElementWiseUnaryOp {
public:
  TorchSoftplusOp(const popart::OperatorIdentifier &opid, float beta,
                  float threshold, const popart::Op::Settings &settings);

  // Copy of this op.
  std::unique_ptr<popart::Op> clone() const final;
  // The gradient ops of this op (a single TorchSoftplusGradOp).
  std::vector<std::unique_ptr<popart::Op>> getGradOps() final;

  // In-place variants offered by this op, with their default priorities.
  std::vector<std::tuple<popart::OperatorIdentifier, float>>
  inplacePriorityDefault() const final;

  // Build the in-place variant matching `id`.
  std::unique_ptr<popart::Op>
  getInplaceVariant(const popart::OperatorIdentifier &id) const final;

  // Serialises "beta" and "threshold" on top of the base op attributes.
  void appendOutlineAttributes(popart::OpSerialiserBase &os) const final;

  float beta() const { return _beta; }
  float threshold() const { return _threshold; }

private:
  // Scale applied to the input inside the softplus.
  float _beta;
  // Inputs with beta * x above this value are passed through unchanged.
  float _threshold;
};

// In-place variant of TorchSoftplusOp; built from the out-of-place op and
// carrying the same `beta` and `threshold`.
class TorchSoftplusInplaceOp : public popart::ElementWiseInplaceUnaryOp {
public:
  // Copies settings, beta and threshold from `op`.
  explicit TorchSoftplusInplaceOp(const TorchSoftplusOp &op);
  // Copy of this op.
  std::unique_ptr<popart::Op> clone() const final;

  // Serialises "beta" and "threshold" on top of the base op attributes.
  void appendOutlineAttributes(popart::OpSerialiserBase &os) const final;

  float beta() const { return _beta; }
  float threshold() const { return _threshold; }

private:
  // Scale applied to the input inside the softplus.
  float _beta;
  // Inputs with beta * x above this value are passed through unchanged.
  float _threshold;
};

// Gradient op for TorchSoftplusOp; copies `beta` and `threshold` from the
// forward op it is built from.
class TorchSoftplusGradOp : public popart::ElementWiseNonLinearUnaryGradOp {
public:
  // Copies beta and threshold from the forward op.
  explicit TorchSoftplusGradOp(const TorchSoftplusOp &fwd_op);
  // Copy of this op.
  std::unique_ptr<popart::Op> clone() const final;

  // Serialises "beta" and "threshold" on top of the base op attributes.
  void appendOutlineAttributes(popart::OpSerialiserBase &os) const final;

  float beta() const { return _beta; }
  float threshold() const { return _threshold; }

private:
  // Scale applied to the forward input inside the softplus.
  float _beta;
  // Forward inputs with beta * x above this value get the gradient passed
  // through unchanged.
  float _threshold;
};

// Element-wise compute object implementing the softplus expression; shared by
// the out-of-place and in-place lowerings.
class TorchSoftplusComputex : public popart::popx::EwuComputex {
public:
  TorchSoftplusComputex(float beta, float threshold)
      : _beta(beta), _threshold(threshold) {}

  // Applies softplus to `tensor` in place, adding the work to `prog`.
  // `prefix` is not used by the implementation.
  void inplace(poplar::program::Sequence &prog, poplar::Graph &graph,
               const poplar::Tensor &tensor, const poplar::DebugNameAndId &dnai,
               const std::string &prefix) const final;

  // Factory returning a new TorchSoftplusComputex through the base pointer.
  static std::unique_ptr<popart::popx::EwuComputex> get(float beta,
                                                        float threshold);

private:
  // Scale applied to the input inside the softplus.
  float _beta;
  // Inputs with beta * x above this value are passed through unchanged.
  float _threshold;
};

// Poplar lowering of the out-of-place TorchSoftplusOp.
class TorchSoftplusOpx : public popart::popx::ElementWiseUnaryOutplaceOpx {
public:
  // `op` must be a TorchSoftplusOp.
  TorchSoftplusOpx(popart::Op *op, popart::popx::Devicex *devicex);
};

// Poplar lowering of the in-place TorchSoftplusInplaceOp.
class TorchSoftplusInplaceOpx
    : public popart::popx::ElementWiseUnaryInplaceOpx {
public:
  // `op` must be a TorchSoftplusInplaceOp.
  TorchSoftplusInplaceOpx(popart::Op *op, popart::popx::Devicex *devicex);
};

// Poplar lowering of TorchSoftplusGradOp.
class TorchSoftplusGradOpx : public popart::popx::Opx {
public:
  // `op` must be a TorchSoftplusGradOp; its beta/threshold are cached here.
  TorchSoftplusGradOpx(popart::Op *op, popart::popx::Devicex *devicex);
  // Adds the gradient computation to `prog` and sets the output tensor.
  void grow(poplar::program::Sequence &prog) const final;

private:
  // Copied from the gradient op in the constructor.
  float _beta;
  float _threshold;
};

} // namespace poptorch_custom_ops
} // namespace poptorch

#endif


================================================
FILE: popart_compiler/source/custom_operations/UpsampleBilinear2d.cpp
================================================
// Copyright (c) 2021, Graphcore Ltd, All rights reserved.
#include <popart/builder.hpp>
#include <popart/ndarraywrapper.hpp>
#include <popart/op.hpp>
#include <popart/opmanager.hpp>
#include <popart/opserialiser.hpp>
#include <popart/popx/opx.hpp>
#include <popart/popx/opxmanager.hpp>

#include <popops/ElementWise.hpp>
#include <popops/Reduce.hpp>

#include <poputil/TileMapping.hpp>
#include <poputil/Util.hpp>
#include <poputil/VertexTemplates.hpp>

#include "popart_compiler/CodeletsCompilation.hpp"
#include "popart_compiler/CompilerImpl.hpp"
#include "popart_compiler/CustomOps.hpp"

namespace {

// Interpolation parameters along one axis for a single output position, as
// produced by computeSourceIndexAndLambda().
struct BilinearParams {
  // Index of the first source element.
  size_t input0;
  // Index of the second source element (equal to input0 at the upper edge).
  size_t input1;
  // Weight applied to the element at input0 (1 - lambda1).
  float lambda0;
  // Weight applied to the element at input1.
  float lambda1;
};

// Map a destination index to a (fractional) source index.
//
// With align_corners the mapping is simply scale * dst_index. Otherwise pixel
// centres are aligned: scale * (dst_index + 0.5) - 0.5, which can be negative
// near the lower edge.
//
// [Note] Follows the OpenCV resize logic: linear modes interpolate between two
// pixels, so a negative source index would use pixel 0 for both and can be
// clamped to 0 without changing the result. Cubic mode may need the pixels
// [-1, 0, 1, 2], so the weights depend on the unclamped value and negative
// indices are returned as-is when `cubic` is set.
// TODO(mihailp): Our current linear mode impls use unbound indices
// where we should and then remove this cubic flag.
float areaPixelComputeSourceIndex(float scale, size_t dst_index,
                                  bool align_corners, bool cubic) {
  if (!align_corners) {
    const float centred_idx =
        static_cast<float>(scale * (dst_index + 0.5) - 0.5);
    if (centred_idx < 0 && !cubic) {
      return 0.0f;
    }
    return centred_idx;
  }
  return scale * dst_index;
}

// Compute, for one axis, which two source elements contribute to the output
// element `output_index` and with which weights.
//
//   scale         - upsampling factor along this axis.
//   output_index  - position along the upsampled axis.
//   input_size    - length of the source axis.
//   align_corners - when true, the first and last elements of source and
//                   output are aligned; otherwise pixel centres are aligned.
//
// Returns {input0, input1, lambda0, lambda1} with lambda0 + lambda1 == 1.
BilinearParams computeSourceIndexAndLambda(const float scale,
                                           size_t output_index,
                                           size_t input_size,
                                           bool align_corners) {
  if (scale == 1.0) {
    // scale_factor = 1, simply copy
    return {output_index, output_index, 1.0, 0.0};
  }

  float ratio = 1.0f / scale;
  if (align_corners) {
    // Use the real (floored) output length, the same way bilinearMap()
    // derives the output dimensions, so that the last output element maps
    // exactly onto the last input element even when scale * input_size is not
    // an integer. A single-element output maps to source index 0, which also
    // avoids dividing by zero.
    const float output_size = std::floor(input_size * scale);
    ratio = (output_size > 1.0f)
                ? static_cast<float>(static_cast<float>(input_size - 1) /
                                     (output_size - 1.0))
                : 0.0f;
  }

  const float real_input_index = areaPixelComputeSourceIndex(
      ratio, output_index, align_corners, /*cubic=*/false);
  const size_t index0 = static_cast<int64_t>(real_input_index);
  // Only step to the next element when one exists; at the upper edge both
  // contributions come from the same element.
  const size_t offset = (index0 + 1 < input_size) ? 1 : 0;
  const float lambda1 = real_input_index - index0;

  return {index0, index0 + offset, 1.0f - lambda1, lambda1};
}

// Add a vertex named `vertexName` to compute set `cs`, connect each entry of
// `vars` (single tensors) and `vectors` (tensor lists) to the vertex field of
// the same name, give it a perf estimate of 1 and place it on `tile`.
poplar::VertexRef connectVertex(
    poplar::Graph &graph, poplar::ComputeSet &cs, // NOLINT
    const std::string &vertexName,                // NOLINT
    const std::unordered_map<std::string, poplar::Tensor> &vars,
    const std::unordered_map<std::string, std::vector<poplar::Tensor>> &vectors,
    size_t tile) {
  poplar::VertexRef vertex = graph.addVertex(cs, vertexName);

  // Fields backed by a single tensor.
  for (const auto &field : vars) {
    const std::string &field_name = field.first;
    graph.connect(vertex[field_name], field.second);
  }
  // Fields backed by a list of tensors.
  for (const auto &field : vectors) {
    const std::string &field_name = field.first;
    graph.connect(vertex[field_name], field.second);
  }

  graph.setPerfEstimate(vertex, 1);
  graph.setTileMapping(vertex, tile);
  return vertex;
}

// Four-float key and the map from such a key to a tensor.
// NOTE(review): presumably the four bilinear weights of an output pixel, used
// to share weight tensors — no use is visible in this part of the file;
// confirm against the callers.
using WeightKey = std::tuple<float, float, float, float>;
using WeightMap = std::map<WeightKey, poplar::Tensor>;

// Per-tile collection of tensors and weights.
// NOTE(review): field names suggest the four neighbouring input pixels
// (i00..i11), the output and the matching weights — confirm against the code
// that fills this structure.
struct TileInputs {
  std::vector<poplar::Tensor> i00, i01, i10, i11, output;
  std::vector<std::vector<float>> weights;
  std::vector<poplar::Tensor> weights_t;
};

// TileInputs keyed by a size_t (presumably the tile number — TODO confirm).
using MultipleTileMap = std::map<size_t, TileInputs>;

// Bilinear upsampling of a rank-4 `input` by `scale_factor` along dimensions
// 2 and 3, implemented as a single popops::map.
//
// For every output pixel the four contributing input pixels and their weights
// are computed on the host. The input pixels are gathered into four tensors
// (i00, i01, i10, i11), the weights become four graph constants
// (w00, w01, w10, w11), and the output is
//   i00 * w00 + i01 * w01 + i10 * w10 + i11 * w11.
// The result has the input's first two dimensions and the scaled (floored)
// last two dimensions.
poplar::Tensor bilinearMap(poplar::Graph &graph,            // NOLINT
                           poplar::program::Sequence &prog, // NOLINT
                           const poplar::Tensor &input, float scale_factor,
                           const bool align_corners = false,
                           const poplar::DebugContext &dc = {}) {
  namespace pe = popops::expr;
  poputil::PoplibsOpDebugInfo const di(dc, DI_ARGS(input, scale_factor));

  const auto in_shape = input.shape();
  assert(in_shape.size() == 4); // NOLINT

  // Output shape: dimensions 2 and 3 scaled and rounded down.
  auto out_shape = in_shape;
  out_shape[2] =
      static_cast<std::size_t>(std::floor(out_shape[2] * scale_factor));
  out_shape[3] =
      static_cast<std::size_t>(std::floor(out_shape[3] * scale_factor));
  const std::size_t out_h = out_shape[2];
  const std::size_t out_w = out_shape[3];

  // View of the input as [dim2][dim3][dim0 * dim1] so a pixel can be indexed
  // directly by its two spatial coordinates.
  auto pixels = input.dimShuffle({2, 3, 0, 1})
                    .reshape({in_shape[2], in_shape[3],
                              in_shape[0] * in_shape[1]});

  // Per-output-pixel source tensors and weights, in row-major (h, w) order.
  std::vector<poplar::Tensor> src00, src01, src10, src11;
  std::vector<float> weight00, weight01, weight10, weight11;
  for (size_t h = 0; h < out_h; ++h) {
    const BilinearParams row = computeSourceIndexAndLambda(
        scale_factor, h, in_shape[2], align_corners);
    for (size_t w = 0; w < out_w; ++w) {
      const BilinearParams col = computeSourceIndexAndLambda(
          scale_factor, w, in_shape[3], align_corners);

      weight00.push_back(row.lambda0 * col.lambda0);
      weight01.push_back(row.lambda0 * col.lambda1);
      weight10.push_back(row.lambda1 * col.lambda0);
      weight11.push_back(row.lambda1 * col.lambda1);

      src00.push_back(pixels[row.input0][col.input0]);
      src01.push_back(pixels[row.input0][col.input1]);
      src10.push_back(pixels[row.input1][col.input0]);
      src11.push_back(pixels[row.input1][col.input1]);
    }
  }

  // Concatenate the gathered pixels into one [out_h][out_w][dim0][dim1]
  // tensor.
  const auto gather = [&](const std::vector<poplar::Tensor> &sources) {
    poplar::Tensor const gathered = poplar::concat(sources).reshape(
        {out_h, out_w, out_shape[0], out_shape[1]});
    return gathered;
  };
  poplar::Tensor const i00 = gather(src00);
  poplar::Tensor const i01 = gather(src01);
  poplar::Tensor const i10 = gather(src10);
  poplar::Tensor const i11 = gather(src11);

  // Turn a weight list into an [out_h][out_w][1][1] constant of the input's
  // element type, mapped linearly across the tiles.
  const auto make_weight = [&](const std::vector<float> &values,
                               const char *label) {
    const poplar::ArrayRef<float> values_ref{values};
    poplar::Tensor const weight = graph.addConstant(
        input.elementType(), {out_h, out_w, 1, 1}, values_ref, {di, label});
    poputil::mapTensorLinearly(graph, weight);
    return weight;
  };
  poplar::Tensor const w00 = make_weight(weight00, "w00");
  poplar::Tensor const w01 = make_weight(weight01, "w01");
  poplar::Tensor const w10 = make_weight(weight10, "w10");
  poplar::Tensor const w11 = make_weight(weight11, "w11");

  // Weighted sum of the four neighbours.
  poplar::Tensor const output = popops::map(
      graph,
      pe::_1 * pe::_2 + pe::_3 * pe::_4 + pe::_5 * pe::_6 + pe::_7 * pe::_8,
      {i00, w00, i01, w01, i10, w10, i11, w11}, prog, {di, "mapUpsampling"});

  // Undo the initial shuffle: back to [dim0][dim1][out_h][out_w].
  return output.dimShuffle({2, 3, 0, 1});
}

// Key of the gradient map built by computeGradMap(): the (row, column) source
// indices returned by computeSourceIndexAndLambda().
using GradMultipleKey = std::pair<size_t, size_t>;
// One contribution to a key: the row weight (lambda0) and column weight
// (lambda1) — multiplied together by computeInputsWeights() — and the (h, w)
// position, in the tensor being iterated, that the contribution comes from.
struct GradMultipleVal {
  float lambda0, lambda1;
  size_t h, w;
};
// All contributions for every key, in the order they were generated.
using GradMultipleMap = std::map<GradMultipleKey, std::vector<GradMultipleVal>>;

// Build the map used by the gradient: for every position (h, w) in an
// in_height x in_width grid, find the four source positions it interpolates
// from (using out_height / out_width as the source sizes) and record, under
// each source position, the pair of axis weights together with (h, w).
//
// Entries are appended in (h, w) row-major order and, for each (h, w), in the
// order (input0,input0), (input0,input1), (input1,input0), (input1,input1).
GradMultipleMap computeGradMap(size_t in_height, size_t in_width,
                               size_t out_height, size_t out_width,
                               float scale_factor, bool align_corners) {
  GradMultipleMap contributions;
  for (size_t h = 0; h < in_height; ++h) {
    const BilinearParams row =
        computeSourceIndexAndLambda(scale_factor, h, out_height, align_corners);
    for (size_t w = 0; w < in_width; ++w) {
      const BilinearParams col = computeSourceIndexAndLambda(
          scale_factor, w, out_width, align_corners);

      // Append one contribution of (h, w) to the source position
      // (src_row, src_col).
      const auto record = [&](size_t src_row, size_t src_col, float row_weight,
                              float col_weight) {
        contributions[{src_row, src_col}].push_back(
            GradMultipleVal{row_weight, col_weight, h, w});
      };

      record(row.input0, col.input0, row.lambda0, col.lambda0);
      record(row.input0, col.input1, row.lambda0, col.lambda1);
      record(row.input1, col.input0, row.lambda1, col.lambda0);
      record(row.input1, col.input1, row.lambda1, col.lambda1);
    }
  }
  return contributions;
}

// Turn the contributions recorded for one source position into parallel lists
// of input slices and weights.
//
// Each contribution's weight is lambda0 * lambda1; contributions whose weight
// is not strictly positive are dropped. Consecutive contributions coming from
// the same (h, w) are merged by summing their weights, so each listed input
// slice `inputTensor[h][w]` appears once per run.
std::pair<std::vector<poplar::Tensor>, std::vector<float>>
computeInputsWeights(const std::vector<GradMultipleVal> &vals,
                     const poplar::Tensor &inputTensor) {
  std::vector<poplar::Tensor> inputs;
  std::vector<float> weights;

  // Position of the most recently emitted entry; starts at the maximum
  // size_t value as a "nothing emitted yet" marker.
  size_t last_h = static_cast<size_t>(-1);
  size_t last_w = static_cast<size_t>(-1);

  for (const auto &contribution : vals) {
    const float weight = contribution.lambda0 * contribution.lambda1;
    if (!(weight > 0.0f)) {
      continue;
    }

    const bool same_position =
        contribution.h == last_h && contribution.w == last_w;
    if (same_position) {
      // Same source as the previous entry: accumulate into it.
      weights.back() += weight;
      continue;
    }

    weights.push_back(weight);
    inputs.push_back(inputTensor[contribution.h][contribution.w]);
    last_w = contribution.w;
    last_h = contribution.h;
  }
  return std::make_pair(inputs, weights);
}

// Add BilinearGradVertex vertices on `tile` for the first interval in
// `intervals`, splitting it into chunks that never cross a block boundary,
// where a block is output.shape()[2] consecutive elements of the flattened
// output. One vertex is added per chunk.
//
// Only `*intervals.begin()` is processed here.
// NOTE(review): `output` is indexed as [row][col][block], so it is presumably
// rank 3 — confirm against the caller.
void splitIntervalMultiple(
    poplar::Graph &graph, poplar::ComputeSet &cs, // NOLINT
    size_t tile, const std::vector<poplar::Interval> &intervals,
    const poplar::Tensor &input, poplar::Tensor &output, // NOLINT
    const GradMultipleMap &m, const poplar::DebugContext &di) {
  const auto &full_interval = *intervals.begin();
  size_t start_block = full_interval.begin();
  const size_t block_size = output.shape()[2];
  while (start_block < full_interval.end()) {
    // End of this chunk: the next block boundary or the end of the interval,
    // whichever comes first.
    const size_t end_block =
        std::min(start_block + block_size - (start_block % block_size),
                 full_interval.end());
    std::vector<std::size_t> start_coords =
        poputil::unflattenIndex(output.shape(), start_block);
    std::vector<std::size_t> end_coords =
        poputil::unflattenIndex(output.shape(), end_block - 1);
    // The chunk must lie within a single (row, col) position.
    assert(start_coords[0] == end_coords[0]); // NOLINT
    assert(start_coords[1] == end_coords[1]); // NOLINT
    // Contributions recorded for this (row, col) position.
    const auto iter = m.find({start_coords[0], start_coords[1]});
    assert(iter != m.end()); // NOLINT
    std::vector<poplar::Tensor> inputs;
    std::vector<float> weights;
    std::tie(inputs, weights) = computeInputsWeights(iter->second, input);
    // The weights become a constant placed on the same tile as the vertex.
    poplar::Tensor weights_t = graph.addConstant(
        input.elementType(), {weights.size()}, poplar::ArrayRef<float>(weights),
        {di, "upsamplingGradWeights"});
    graph.setTileMapping(weights_t, tile);
    // One row per contributing input slice, block_size elements per row.
    poplar::Tensor const full_input_t =
        poplar::concat(inputs).reshape({inputs.size(), block_size});
    // Keep only the columns covered by this chunk.
    poplar::Tensor const input_t = full_input_t.slice(
        {0, start_coords[2]}, {inputs.size(), end_coords[2] + 1});
    graph.setTileMapping(input_t, tile);
    poplar::Interval const interval{start_block, end_block};
    // The vertex receives the input transposed and flattened, i.e. grouped by
    // output element rather than by contributing slice.
    (void)connectVertex(
        graph, cs,
        poputil::templateVertex("BilinearGradVertex", input.elementType()),
        {{"out", output.flatten().slice(interval)},
         {"w", weights_t},
         {"input", input_t.transpose().flatten()}},
        {}, tile);

    start_block = end_block;
  }
}

// Add BilinearGradVertex vertices on `tile` for intervals that all belong to
// one (row, col) position of `output`: the weights and gathered inputs are
// computed once from the first interval, then the intervals are divided with
// poputil::splitRegionsBetweenWorkers and one vertex is added per resulting
// region.
//
// NOTE(review): `output` is indexed as [row][col][block], so it is presumably
// rank 3 — confirm against the caller.
void splitInterval(poplar::Graph &graph, poplar::ComputeSet &cs, // NOLINT
                   size_t tile, const std::vector<poplar::Interval> &intervals,
                   const poplar::Tensor &input,
                   poplar::Tensor &output, // NOLINT
                   const GradMultipleMap &m, const poplar::DebugContext &di) {
  // Third argument is 1 (presumably the grain size — TODO confirm against
  // the poputil documentation).
  const auto regions =
      poputil::splitRegionsBetweenWorkers(graph.getTarget(), intervals, 1);
  const size_t block_size = output.shape()[2];
  const auto &full_interval = *intervals.begin();
  std::vector<std::size_t> start_coords =
      poputil::unflattenIndex(output.shape(), full_interval.begin());
  std::vector<std::size_t> end_coords =
      poputil::unflattenIndex(output.shape(), full_interval.end() - 1);
  // The first interval must lie within a single (row, col) position.
  assert(start_coords[0] == end_coords[0]); // NOLINT
  assert(start_coords[1] == end_coords[1]); // NOLINT
  // Contributions recorded for this (row, col) position.
  const auto iter = m.find({start_coords[0], start_coords[1]});
  assert(iter != m.end()); // NOLINT
  std::vector<poplar::Tensor> inputs;
  std::vector<float> weights;
  std::tie(inputs, weights) = computeInputsWeights(iter->second, input);
  // The weights constant is shared by every vertex added below.
  poplar::Tensor weights_t = graph.addConstant(
      input.elementType(), {weights.size()}, poplar::ArrayRef<float>(weights),
      {di, "upsamplingGradWeights"});
  graph.setTileMapping(weights_t, tile);
  // One row per contributing input slice, block_size elements per row.
  poplar::Tensor const full_input_t =
      poplar::concat(inputs).reshape({inputs.size(), block_size});
  for (const auto &r : regions) {
    // Each region is expected to hold exactly one interval.
    assert(r.size() == 1); // NOLINT
    const auto &interval = *r.begin();
    start_coords = poputil::unflattenIndex(output.shape(), interval.begin());
    end_coords = poputil::unflattenIndex(output.shape(), interval.end() - 1);
    assert(start_coords[0] == end_coords[0]); // NOLINT
    assert(start_coords[1] == end_coords[1]); // NOLINT
    // Keep only the columns covered by this region's interval.
    poplar::Tensor const input_t = full_input_t.slice(
        {0, start_coords[2]}, {inputs.size(), end_coords[2] + 1});
    graph.setTileMapping(input_t, tile);
    // The vertex receives the input transposed and flattened, i.e. grouped by
    // output element rather than by contributing slice.
    (void)connectVertex(
        graph, cs,
        poputil::templateVertex("BilinearGradVertex", input.elementType()),
        {{"out", output.flatten().slice(interval)},
         {"w", weights_t},
         {"input", input_t.transpose().flatten()}},
        {}, tile);
  }
}

// Handles a tile whose interval covers several whole output pixels: the
// interval is shared out between the workers in multiples of one pixel and
// a BilinearGradMultipleVertex is created for every resulting region.
//
// graph / cs  - graph and compute set receiving the vertices.
// tile        - tile on which the constants, inputs and vertices are placed.
// intervals   - the flattened range of `output` owned by this tile.
// input       - tensor handed to computeInputsWeights() for each pixel.
// output      - tensor written by the vertices (dimension 2 is one pixel).
// m           - per-pixel contribution map, keyed by the pixel coordinates.
// di          - debug context used to name the created constants.
void splitIntervalMultiplePixels(poplar::Graph &graph,   // NOLINT
                                 poplar::ComputeSet &cs, // NOLINT
                                 size_t tile,
                                 const std::vector<poplar::Interval> &intervals,
                                 const poplar::Tensor &input,
                                 poplar::Tensor &output, // NOLINT
                                 const GradMultipleMap &m,
                                 const poplar::DebugContext &di) {
  // One pixel spans `pixel_len` consecutive elements of the flattened output,
  // so it is used as the grain size when sharing work between workers.
  const size_t pixel_len = output.shape()[2];
  const auto worker_regions = poputil::splitRegionsBetweenWorkers(
      graph.getTarget(), intervals, pixel_len);
  for (const auto &region : worker_regions) {
    assert(region.size() == 1); // NOLINT
    const auto &span = region.front();
    assert((span.size() % pixel_len) == 0); // NOLINT

    // Accumulated over every pixel of the span, in pixel order.
    std::vector<poplar::Tensor> all_inputs;
    std::vector<float> all_weights;
    // Number of (input, weight) pairs contributed by each pixel.
    std::vector<uint32_t> limits;
    for (size_t pixel_begin = span.begin(); pixel_begin < span.end();
         pixel_begin += pixel_len) {
      const std::vector<std::size_t> first_coords =
          poputil::unflattenIndex(output.shape(), pixel_begin);
      const std::vector<std::size_t> last_coords =
          poputil::unflattenIndex(output.shape(), pixel_begin + pixel_len - 1);
      // A pixel must not straddle two (row, column) positions.
      assert(first_coords[0] == last_coords[0]); // NOLINT
      assert(first_coords[1] == last_coords[1]); // NOLINT
      const auto entry = m.find({first_coords[0], first_coords[1]});
      assert(entry != m.end()); // NOLINT

      std::vector<poplar::Tensor> pixel_inputs;
      std::vector<float> pixel_weights;
      std::tie(pixel_inputs, pixel_weights) =
          computeInputsWeights(entry->second, input);
      limits.push_back(pixel_weights.size());
      all_weights.insert(all_weights.end(), pixel_weights.begin(),
                         pixel_weights.end());
      all_inputs.insert(all_inputs.end(), pixel_inputs.begin(),
                        pixel_inputs.end());
    }

    poplar::Tensor weights_t = graph.addConstant(
        input.elementType(), {all_weights.size()},
        poplar::ArrayRef<float>(all_weights), {di, "upsamplingGradWeights"});
    graph.setTileMapping(weights_t, tile);
    poplar::Tensor limits_t = graph.addConstant(
        poplar::UNSIGNED_INT, {limits.size()},
        poplar::ArrayRef<unsigned int>(limits), {di, "upsamplingGradLimits"});
    graph.setTileMapping(limits_t, tile);

    // One row per contributing input; the transpose below makes the
    // contributions of a single output element contiguous for the vertex.
    poplar::Tensor const stacked_inputs =
        poplar::concat(all_inputs).reshape({all_inputs.size(), pixel_len});
    graph.setTileMapping(stacked_inputs, tile);
    (void)connectVertex(graph, cs,
                        poputil::templateVertex("BilinearGradMultipleVertex",
                                                input.elementType()),
                        {{"out", output.flatten().slice(span)},
                         {"w", weights_t},
                         {"limits", limits_t},
                         {"input", stacked_inputs.transpose().flatten()}},
                        {}, tile);
  }
}

// Creates the gradient vertices for the single interval of `output` mapped to
// `tile`, picking a splitting strategy from the number of blocks the interval
// touches (a block being `output.shape()[2]` consecutive elements):
//   - exactly 1 block  -> splitInterval
//   - up to 6 blocks   -> splitIntervalMultiple
//   - more than 6      -> splitIntervalMultiplePixels
//
// graph / cs  - graph and compute set receiving the vertices.
// tile        - tile being processed.
// intervals   - must contain exactly one interval.
// input       - forwarded to the chosen strategy.
// output      - tensor being written; only its shape is inspected here.
// m / di      - contribution map and debug context, forwarded unchanged.
void processTile(poplar::Graph &graph, poplar::ComputeSet &cs, // NOLINT
                 size_t tile, const std::vector<poplar::Interval> &intervals,
                 const poplar::Tensor &input, poplar::Tensor &output, // NOLINT
                 const GradMultipleMap &m, const poplar::DebugContext &di) {
  assert(intervals.size() == 1); // NOLINT
  const poplar::Interval &interval = intervals.front();
  const size_t block_size = output.shape()[2];
  // Round the start down to a block boundary so that a partially covered
  // first block is counted as a whole block.
  const size_t block_start = interval.begin() - (interval.begin() % block_size);
  const size_t aligned_size = interval.end() - block_start;
  // Integer ceil-division: exact for every size, unlike the float round trip
  // which loses precision once the sizes exceed 2^24.
  const size_t nb_blocks = (aligned_size + block_size - 1) / block_size;
  // NOTE(review): the threshold of 6 presumably matches the number of worker
  // contexts per tile - confirm before changing it.
  if (nb_blocks == 1) {
    splitInterval(graph, cs, tile, intervals, input, output, m, di);
  } else if (nb_blocks <= 6) {
    splitIntervalMultiple(graph, cs, tile, intervals, input, output, m, di);
  } else {
    splitIntervalMultiplePixels(graph, cs, tile, intervals, input, output, m,
                                di);
  }
}

using Mapping = std::vector<std::vector<poplar::Interval>>;

std::vector<Mapping> splitMapping(const Mapping &m, uint32_t partitions,
                                  uint32_t block_size) {
  if (partitions == 1) {
    return {m};
  }
  std::vector<Mapping> res(partitions);
  for (const auto &m_i : m) {
    const auto regions = poputil::splitRegions(m_i, block_size, partitions);
    for (size_t j = 0; j < regions.size(); ++j) {
      res[j].push_back(regions[j]);
    }
  }
  return res;
}

// Builds the program computing the gradient of a bilinear upsampling with
// respect to its input and returns the newly created gradient tensor.
//
// graph / prog   - graph to extend and sequence the compute sets are added to.
// grad_output    - rank-4 gradient of the upsampled output.
// scale_factor   - upsampling factor; dimensions 2 and 3 of the result are
//                  floor(dimension / scale_factor).
// align_corners  - forwarded to computeGradMap().
// nb_splits      - number of compute sets to split the work into; 0 selects
//                  a value from the average number of pixels per tile.
// dc             - debug context.
poplar::Tensor bilinearMapGrads(poplar::Graph &graph,            // NOLINT
                                poplar::program::Sequence &prog, // NOLINT
                                const poplar::Tensor &grad_output,
                                float scale_factor, bool align_corners,
                                uint32_t nb_splits = 0,
                                const poplar::DebugContext &dc = {}) {
  poputil::PoplibsOpDebugInfo const di(dc, DI_ARGS(grad_output, scale_factor));
  const auto out_shape = grad_output.shape();
  assert(out_shape.size() == 4); // NOLINT

  // Only the two spatial dimensions shrink; dimensions 0 and 1 are shared by
  // both tensors, so their product is the length of one pixel in either.
  auto in_shape = out_shape;
  in_shape[2] =
      static_cast<std::size_t>(std::floor(out_shape[2] / scale_factor));
  in_shape[3] =
      static_cast<std::size_t>(std::floor(out_shape[3] / scale_factor));
  const size_t pixel_len = out_shape[0] * out_shape[1];

  auto grad_input =
      graph.addVariable(grad_output.elementType(), in_shape,
                        {di, "gradientsInput_" + std::to_string(in_shape[2])});
  // View with the spatial dimensions outermost and dimensions 0 and 1 fused.
  auto grad_input_shuffled =
      grad_input.dimShuffle({2, 3, 0, 1})
          .reshape({in_shape[2], in_shape[3], pixel_len});

  const size_t nb_pixels = in_shape[2] * in_shape[3];
  const size_t num_tiles = graph.getTarget().getNumTiles();
  const size_t num_workers = graph.getTarget().getNumWorkerContexts();
  // Keep whole pixels together only when a tile receives more pixels than it
  // has workers.
  const size_t grain_size =
      (nb_pixels / num_tiles > num_workers) ? pixel_len : size_t{1};
  poputil::mapTensorLinearly(graph, grad_input_shuffled, 1, grain_size);

  auto grad_output_shuffled =
      grad_output.dimShuffle({2, 3, 0, 1})
          .reshape({out_shape[2], out_shape[3], pixel_len});
  const GradMultipleMap m =
      computeGradMap(out_shape[2], out_shape[3], in_shape[2], in_shape[3],
                     scale_factor, align_corners);
  const auto full_mapping = graph.getTileMapping(grad_input_shuffled);

  if (nb_splits == 0) { // try to guess a good split
    const uint32_t blocks_per_tile = std::ceil(static_cast<float>(nb_pixels) /
                                               static_cast<float>(num_tiles));
    if (blocks_per_tile > 12) {
      nb_splits = 3;
    } else if (blocks_per_tile > 6) {
      nb_splits = 2;
    } else {
      nb_splits = 1;
    }
  }

  const auto mappings = splitMapping(full_mapping, nb_splits, pixel_len);
  for (size_t split = 0; split < mappings.size(); ++split) {
    poplar::ComputeSet compute_set =
        graph.addComputeSet({di, "upsamplingGrad_" + std::to_string(split) +
                                     "_" + std::to_string(in_shape[2])});
    const auto &mapping = mappings[split];
    for (size_t tile = 0; tile < mapping.size(); ++tile) {
      const auto &tile_intervals = mapping[tile];
      if (tile_intervals.empty()) {
        continue;
      }
      processTile(graph, compute_set, tile, tile_intervals,
                  grad_output_shuffled, grad_input_shuffled, m, di);
    }
    prog.add(poplar::program::Execute(compute_set, di));
  }
  return grad_input;
}

// For training with a custom Op, four classes need to be implemented,
// one for each of:
// {forward, gradient} x {Op, Opx}.
//
// If only inference is required, then two classes need to be implemented:
// {forward} x {Op, Opx}.
//
// The Op is a poplar/hardware agnostic description of the computation.
// The Opx is the poplar implementation of the Op.
//
// Training is supported here, so all four classes are implemented. They are
// forward-declared because the gradient Op is defined before the forward Op
// it is constructed from:
//
class UpsampleOp;
class UpsampleGradOp;
class UpsampleOpx;
class UpsampleGradOpx;

namespace {
// Minimal stand-in for std::make_unique (which only exists from C++14):
// constructs a T from the forwarded arguments and returns it wrapped in a
// std::unique_ptr.
template <typename T, typename... Args>
std::unique_ptr<T> makeUnique(Args &&...args) {
  T *const raw = new T(std::forward<Args>(args)...);
  return std::unique_ptr<T>(raw);
}
} // namespace

// The gradient Op
class UpsampleGradOp : public popart::Op {
public:
  explicit UpsampleGradOp(const UpsampleOp &fwdOp);

  std::unique_ptr<Op> clone() const final {
    return makeUnique<UpsampleGradOp>(*this);
  }

  // The output popart Tensor has the same inputInfo and numerical type
  // (i.e. the same TensorInfo) as the input Tensor. This function is
  // required for inputInfo/type inference
  //
  void setup() final {
    auto input_info = inInfo(0);
    assert(input_info.rank() == 4); // NOLINT
    auto batch_size = input_info.dim(0);
    auto channels = input_info.dim(1);
    auto height = input_info.dim(2);
    auto width = input_info.dim(3);
    const int64_t output_height =
        static_cast<int64_t>(std::floor(height / _scalingFactor));
    const int64_t output_width =
        static_cast<int64_t>(std::floor(width / _scalingFactor));

    outInfo(0).set(input_info.dataType(),
                   {batch_size, channels, output_height, output_width});
  }

  // function describing the inputs and output(s) of UpsampleGradOp
  // The Gradient Op which we are implementing (UpsampleGradOp) has 2 inputs.
  // The input at index 0 is:
  // the gradient of the 0'th output Tensor of the UpsampleOp.
  // The input at index 1 is :
  // the 0'th output Tensor of the UpsampleOp.
  // Supposing the UpsampleOp has input Tensor T0 and output Tensor T1,
  //
  //   input at index 0 (T0)
  //          |
  //        UpsampleOp
  //          |
  //   output at index 0 (T1)
  //
  // Then the picture described by the map below looks like,
  //
  //
  //    input at index 0 (gradient of T1)
  //         |   input at index 1 (T1)
  //         |     |
  //         |     |
  //        UpsampleGradOp
  //            |
  //            |
  //   output at index 0 (gradient of T0)
  //
  const std::vector<popart::GradInOutMapper> &gradInputInfo() const override {
    static const std::vector<popart::GradInOutMapper> in_info = {
        {0, 0, popart::GradOpInType::GradOut},
        {1, 0, popart::GradOpInType::Out}};
    return in_info;
  }

  // The Grad Op only has one output, at index 0. The output at index 0
  // is the gradient of the input at index 0 of the UpsampleOp
  const std::map<int, int> &gradOutToNonGradIn() const override {
    static const std::map<int, int> out_info = {{0, 0}};
    return out_info;
  }

  // an estimate of how valuable sub-graph matching will be
  float getSubgraphValue() const final { return getLowSubgraphValue(); }

  float getScalingFactor() const { return _scalingFactor; }
  bool getAlignCorners() const { return _alignCorners; }
  // Implementation defined below
  void appendAttributes(popart::OpSerialiserBase &os) const override;

  // Implementation defined below
  void appendOutlineAttributes(popart::OpSerialiserBase &os) const override;

private:
  float _scalingFactor;
  bool _alignCorners;
};

// The forward Op
class UpsampleOp : public popart::Op {
public:
  UpsampleOp(const popart::OperatorIdentifier &_opid, float scalingFactor,
             bool alignCorners, const popart::Op::Settings &settings_)
      : popart::Op(_opid, settings_), _scalingFactor{scalingFactor},
        _alignCorners(alignCorners) {}

  // same comment as for UpsampleGradOp, for running shape/type inference
  // "statically"
  void setup() override {
    auto input_info = inInfo(0);
    assert(input_info.rank() == 4); // NOLINT
    auto batch_size = input_info.dim(0);
    auto channels = input_info.dim(1);
    auto height = input_info.dim(2);
    auto width = input_info.dim(3);
    const int64_t output_height =
        static_cast<int64_t>(std::floor(height * _scalingFactor));
    const int64_t output_width =
        static_cast<int64_t>(std::floor(width * _scalingFactor));

    outInfo(0).set(input_info.dataType(),
                   {batch_size, channels, output_height, output_width});
  }

  std::unique_ptr<Op> clone() const final {
    return makeUnique<UpsampleOp>(*this);
  }

  // There is only one Gradient Op for UpsampleOp, a UpsampleGradOp
  // It is possible to have multiple Gradient Ops
  // (Conv has 2 in popart, one for weights and one for activations)
  //
  std::vector<std::unique_ptr<popart::Op>> getGradOps() override {
    std::vector<std::unique_ptr<Op>> upops;        // NOLINT
    upops.emplace_back(new UpsampleGradOp(*this)); // NOLINT
    return upops;
  }
  void appendAttributes(popart::OpSerialiserBase &os) const override {
    Op::appendAttributes(os);
    os.appendAttribute("scaling_factor", getScalingFactor());
    os.appendAttribute("align_corners", getAlignCorners());
  }

  void appendOutlineAttributes(popart::OpSerialiserBase &os) const override {
    Op::appendOutlineAttributes(os);
    os.appendAttribute("scaling_factor", getScalingFactor());
    os.appendAttribute("align_corners", getAlignCorners());
  }

  // an estimate of how valuable sub-graph matching will be
  float getSubgraphValue() const final { return getLowSubgraphValue(); }

  float getScalingFactor() const { return _scalingFactor; }
  bool getAlignCorners() const { return _alignCorners; }

private:
  float _scalingFactor;
  bool _alignCorners;
};

// Describe the inputs and outputs that are supported by the operation.
// NOTE: these three globals depend on each other and rely on being
// initialised in declaration order (t, then upsample_op_def, then
// upsample_op_creator), so they must stay in this order.
popart::OpDefinition::DataTypes t = {popart::DataType::FLOAT16,
                                     popart::DataType::FLOAT};

// Definition of the forward op: one "input", one "output" (both restricted to
// the data types in `t`) and the two attributes read by the creator below.
popart::OpDefinition upsample_op_def(
    {popart::OpDefinition::Inputs({{"input", t}}),
     popart::OpDefinition::Outputs({{"output", t}}),
     popart::OpDefinition::Attributes({{"scaling_factor", {"*"}},
                                       {"align_corners", {"*"}}})});

// Registers the factory that builds an UpsampleOp from the attributes of an
// upsample_bilinear2d node.
// NOTE(review): the trailing `true` is presumably popart's "is public" flag -
// confirm against the OpCreator documentation.
popart::OpCreator<UpsampleOp> upsample_op_creator(
    popart::OpDefinitions({{poptorch::poptorch_custom_ops::upsample_bilinear2d,
                            upsample_op_def}}),
    [](const popart::OpCreatorInfo &info) {
      // default scalingFactor is 2.0
      float const scaling_factor =
          info.attributes.getAttribute<popart::Attributes::Float>(
              "scaling_factor", 2.0f);
      // align_corners is stored as an integer attribute (default 0) and is
      // converted to bool when passed to the UpsampleOp constructor.
      int const align_corners =
          info.attributes.getAttribute<popart::Attributes::Int>("align_corners",
                                                                0);
      return std::make_unique<UpsampleOp>(info.opid, scaling_factor,
                                          align_corners, info.settings);
    },
    true);

// forward Opx (poplar implementation of the forward Op)
class UpsampleOpx : public popart::popx::Opx {
public:
  // Checks that `op` is an UpsampleOp and registers the custom codelets used
  // by the upsampling vertices with the graph.
  UpsampleOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    // not strictly necessary, we check that op is castable to a UpsampleOp *.
    verifyOp<UpsampleOp>(op,
                         poptorch::poptorch_custom_ops::upsample_bilinear2d);

    // Get around the ABI issues: the path comes back as a managed C buffer
    // and is copied into a std::string before the buffer is released.
    auto managed_ptr = poptorch::popart_compiler::compileCustomCodeletIfNeeded(
        "UpsampleBilinear2dCodelets.inc.cpp", /*hw_only_codelet=*/false);
    const char *compiled_codelet_path =
        static_cast<const char *>(managed_ptr.get());
    graph().addCodelets(std::string(compiled_codelet_path));
  }

  // Upsamples input 0 and registers the result as output 0. All the work is
  // delegated to bilinearMap(), which appends the required programs to
  // `prog`.
  void grow(poplar::program::Sequence &prog) const final {
    auto op = getOp<UpsampleOp>();
    const float scaling_factor = op.getScalingFactor();
    const bool align_corners = op.getAlignCorners();
    auto input = getInTensor(0);

    setOutTensor(
        0, bilinearMap(graph(), prog, input, scaling_factor, align_corners));
  }
};

// backward Opx (poplar implementation of the backward Op)
class UpsampleGradOpx : public popart::popx::Opx {
public:
  // Checks that `op` is an UpsampleGradOp.
  UpsampleGradOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp<UpsampleGradOp>(
        op, poptorch::poptorch_custom_ops::upsample_bilinear2d_grad);
  }

  // Computes the gradient with respect to the input of the forward Op and
  // registers it as output 0.
  void grow(poplar::program::Sequence &prog) const final {
    // UpsampleGradOp::gradInputInfo() maps input 0 to the gradient of the
    // forward output (GradOut) and input 1 to the forward output itself
    // (Out). The backward pass must be fed the gradient, i.e. input 0; the
    // forward output is not needed by the computation.
    const auto &grad_out = getInTensor(0);

    auto op = getOp<UpsampleGradOp>();
    const float scaling_factor = op.getScalingFactor();
    const bool align_corners = op.getAlignCorners();
    setOutTensor(0, bilinearMapGrads(graph(), prog, grad_out, scaling_factor,
                                     align_corners));
  }
};

// Builds the gradient Op from its forward Op, inheriting the forward Op's
// settings, scaling factor and align-corners flag.
UpsampleGradOp::UpsampleGradOp(const UpsampleOp &fwdOp)
    : popart::Op(poptorch::poptorch_custom_ops::upsample_bilinear2d_grad,
                 fwdOp.settings),
      _scalingFactor(fwdOp.getScalingFactor()),
      _alignCorners(fwdOp.getAlignCorners()) {}

// Serialises the base attributes followed by the two upsampling parameters.
void UpsampleGradOp::appendAttributes(popart::OpSerialiserBase &os) const {
  Op::appendAttributes(os);
  os.appendAttribute("scaling_factor", _scalingFactor);
  os.appendAttribute("align_corners", _alignCorners);
}

// Serialises the base outline attributes followed by the two upsampling
// parameters.
void UpsampleGradOp::appendOutlineAttributes(
    popart::OpSerialiserBase &os) const {
  Op::appendOutlineAttributes(os);
  os.appendAttribute("scaling_factor", _scalingFactor);
  os.appendAttribute("align_corners", _alignCorners);
}

// Static registration of the poplar implementations: associates the forward
// and gradient operator identifiers with UpsampleOpx and UpsampleGradOpx
// respectively.
popart::popx::OpxCreator<UpsampleOpx>
    upsample_opx_creator(poptorch::poptorch_custom_ops::upsample_bilinear2d);
popart::popx::OpxCreator<UpsampleGradOpx> upsample_grad_opx_creator(
    poptorch::poptorch_custom_ops::upsample_bilinear2d_grad);

} // namespace


================================================
FILE: popart_compiler/source/custom_operations/UpsampleBilinear2dCodelets.inc.cpp
================================================
// Copyright (c) 2021, Graphcore Ltd, All rights reserved.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <limits>

#ifdef __IPU__
#include <ipu_vector_math>
#endif

#include <poplar/HalfFloat.hpp>
#include <poplar/Vertex.hpp>

// Shorthand for the ONE_PTR vector layout used by the vertex fields whose
// size is never queried.
static constexpr auto ONE_PTR = poplar::VectorLayout::ONE_PTR;

// Forward vertex: every output element is the weighted sum of 4 consecutive
// elements of `inputs`, using the same 4 weights for all the outputs.
template <typename T> class BilinearMultipleVertex : public poplar::Vertex {
public:
  // 4 values per output element, stored back to back.
  poplar::Input<poplar::Vector<T, ONE_PTR>> inputs;
  poplar::Output<poplar::Vector<T>> out;
  // The 4 weights shared by every output element.
  poplar::Input<poplar::Vector<T, ONE_PTR>> w;

  bool compute() {
    const unsigned int nb_outputs = out.size();
    for (unsigned int i = 0, base = 0; i < nb_outputs; ++i, base += 4) {
      out[i] = inputs[base] * w[0] + inputs[base + 1] * w[1] +
               inputs[base + 2] * w[2] + inputs[base + 3] * w[3];
    }
    return true;
  }
};

template class BilinearMultipleVertex<float>;
template class BilinearMultipleVertex<half>;

// Gradient vertex for a single pixel: every output element is the weighted
// sum of `w.size()` consecutive elements of `input`, all the output elements
// sharing the same weights. The sum is accumulated in float.
template <typename T> class BilinearGradVertex : public poplar::Vertex {
public:
  // w.size() values per output element, stored back to back.
  poplar::Input<poplar::Vector<T>> input;
  poplar::Input<poplar::Vector<T>> w;
  poplar::Output<poplar::Vector<T>> out;

  bool compute() {
    const unsigned int nb_taps = w.size();
    const unsigned int nb_outputs = out.size();
    unsigned int base = 0;
    for (unsigned int i = 0; i < nb_outputs; ++i) { // b x c
      float acc = 0.0f;
      for (unsigned int tap = 0; tap < nb_taps; ++tap) {
        acc += float(input[base + tap] * w[tap]);
      }
      out[i] = acc;
      base += nb_taps;
    }
    return true;
  }
};

template class BilinearGradVertex<float>;
template class BilinearGradVertex<half>;

// Gradient vertex for several pixels. `limits[p]` is the number of
// (input, weight) pairs contributing to pixel `p`; the weights of all the
// pixels are stored back to back in `w`. `input` is walked sequentially: for
// each position inside a pixel it holds the contributions of every pixel in
// turn. The sums are accumulated in float.
template <typename T> class BilinearGradMultipleVertex : public poplar::Vertex {
public:
  poplar::Input<poplar::Vector<T>> input;
  poplar::Input<poplar::Vector<T>> w;
  poplar::Input<poplar::Vector<unsigned int>> limits;
  // Pixel-major output: element `i` of pixel `p` is out[p * block_size + i].
  poplar::Output<poplar::Vector<T>> out;

  bool compute() {
    const unsigned int nb_pixels = limits.size();
    const size_t block_size = out.size() / limits.size();
    unsigned int in_pos = 0; // never reset: `input` is consumed in order
    for (unsigned int i = 0; i < block_size; ++i) { // b x c
      unsigned int w_pos = 0; // the weights restart for every element
      for (unsigned int pixel = 0; pixel < nb_pixels; ++pixel) {
        const unsigned int nb_taps = limits[pixel];
        float acc = 0.0f;
        for (unsigned int tap = 0; tap < nb_taps; ++tap) {
          acc += float(input[in_pos + tap] * w[w_pos + tap]);
        }
        out[pixel * block_size + i] = acc;
        in_pos += nb_taps;
        w_pos += nb_taps;
      }
    }
    return true;
  }
};

template class BilinearGradMultipleVertex<float>;
template class BilinearGradMultipleVertex<half>;


================================================
FILE: popart_compiler/source/include/popart_compiler/CompilerImpl.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#pragma once

#include <algorithm>
#include <atomic>
#include <list>
#include <map>
#include <memory>
#include <set>
#include <stack>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>

#include <vector>

#include <popart/builder.hpp>
#include <popart/iarray.hpp>
#include <popart/istepio.hpp>
#include <popart/session.hpp>
#include <popart/stepio.hpp>
#include <popart/voiddata.hpp>
#include <poplar/Tensor.hpp>

#include "popart_compiler/Compiler.hpp"
#include "popart_compiler/CompilerOptions.hpp"
#include "popart_compiler/MultiConvBuilder.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace popart_compiler {

class Compiler;

namespace detail {

/*
  We use this structure to maintain all the information related to a CPU
  callback. This is used by the custom op to create the poplar tensors and by
  the compiler to create the poplar callbacks.
*/
struct CallbackInternalMetadata {
  // We need a unique ID for each so we can track how many we've added.
  static std::uint32_t number_of_added_ops;

  // The thing we are calling back.
  std::function<void()> the_callback;

  // Pointers to the buffers on host.
  std::vector<void *> input_pointers;
  std::vector<void *> output_pointers;

  // The names of the operation which we give on creation. The custom op needs
  // to see these to create the operation and the compiler needs it to attach
  // the callbacks.
  std::string handle;

  // Type and shape info for the input and outputs.
  std::vector<PopartType> input_types;
  std::vector<std::vector<std::size_t>> input_shapes;
  std::vector<PopartType> output_types;
  std::vector<std::vector<std::size_t>> output_shapes;

  // The callbacks are called in random order so we need to track how many have
  // copied their data to make sure we only call the host function once all of
  // them have copied it.
  // Explicitly starts at 0: before C++20 a default-initialised std::atomic
  // holds an indeterminate value.
  std::atomic<std::uint32_t> number_of_input_streams_inited{0};
};

// popart::IStepIO implementation backed by maps from tensor id to
// popart::IArray. It also keeps, per tensor, the timestamps recorded for the
// in / inComplete / out / outComplete callbacks.
class StepIO : public popart::IStepIO {
public:
  // Per-tensor bookkeeping kept while a step is running.
  // NOTE(review): offset / end_offset presumably delimit the part of `array`
  // still to be consumed and replica_idx the replica being served - confirm
  // against the implementation of get().
  struct ArrayInfo {
    popart::IArray &array;
    int64_t offset;
    int64_t end_offset;
    int64_t replica_idx;
  };

  using ArrayType = popart::IArray;
  using AccessorType = popart::StepIONS::IArrayAccessor;
  using TensorArrayMap = std::map<popart::TensorId, ArrayType &>;
  using TensorTimestamps = std::map<popart::TensorId, std::vector<double>>;
  using TensorArrayInfo = std::map<popart::TensorId, ArrayInfo>;
  using TensorStepDataInfo = std::map<popart::TensorId, popart::TensorInfo>;

  StepIO() = default;

  // popart::IStepIO interface.
  popart::ConstVoidData in(popart::TensorId id, int64_t num_elems, // NOLINT
                           bool prefetch, bool /*isBroadcast*/) override;
  void inComplete(popart::TensorId id, int64_t num_elems,
                  bool) override; // NOLINT
  popart::MutableVoidData out(popart::TensorId id,
                              int64_t num_elems) override; // NOLINT
  void outComplete(popart::TensorId id) override;          // NOLINT

  void computeStepDataInfo(const popart::TensorId &id, popart::IArray *array);
  // Supplies the input and output arrays to use (defined out of line).
  void populate(const TensorArrayMap &inputs, const TensorArrayMap &outputs);

  // Shared implementation helper: T is the void-data type to return and `map`
  // the bookkeeping map (inputs or outputs) to look `id` up in.
  template <typename T>
  T get(const popart::TensorId &id, TensorArrayInfo *map, int64_t num_elems,
        bool is_input);
  static void timestamp(TensorTimestamps *time, const popart::TensorId &id);

  // Overridden as a no-op: no element count check is performed.
  void assertNumElements(
      const popart::popx::Executablex & /*unused*/) const override {}

  // Timestamp accessors. They all use std::map::at(), which throws
  // std::out_of_range when no timestamp was recorded for `id`.
  const std::vector<double> &
  getInputTimestamps(const popart::TensorId &id) const {
    return _in_times.at(id);
  }

  const std::vector<double> &
  getInputCompleteTimestamps(const popart::TensorId &id) const {
    return _in_complete_times.at(id);
  }

  const std::vector<double> &
  getOutputTimestamps(const popart::TensorId &id) const {
    return _out_times.at(id);
  }

  const std::vector<double> &
  getOutputCompleteTimestamps(const popart::TensorId &id) const {
    return _out_complete_times.at(id);
  }

  void setInputGroupings(popart::CommGroupType type, int64_t input_group_size,
                         int64_t replica_count);

protected:
  TensorArrayInfo _inputs_info;
  TensorArrayInfo _outputs_info;
  TensorStepDataInfo _step_data_info;

  // One map per callback kind, see the accessors above.
  TensorTimestamps _in_times;
  TensorTimestamps _in_complete_times;
  TensorTimestamps _out_times;
  TensorTimestamps _out_complete_times;

  // NOTE(review): these three members have no default initialiser and the
  // defaulted constructor does not set them; presumably setInputGroupings()
  // must be called before they are read - confirm.
  popart::CommGroupType _input_cgt;
  int64_t _input_group_size;
  int64_t _replica_count;
};

// popart::IWeightsIO implementation storing one popart::MutableVoidData per
// registered tensor id, together with the ids in a separate vector.
class WeightsIO : public popart::IWeightsIO {
public:
  ~WeightsIO() override = default;
  // popart::IWeightsIO interface.
  bool contains(popart::TensorId id) const final;
  popart::MutableVoidData weight(popart::TensorId id) const final;
  // Registers a parameter with its tensor info (defined out of line).
  void registerParameter(const popart::TensorId &id,
                         const popart::TensorInfo &info);
  // NOTE(review): host_buffers is presumably expected in the same order as
  // parameterIds() - confirm against the implementation.
  void updateData(const std::vector<void *> &host_buffers);
  const std::vector<popart::TensorId> &parameterIds() const;

private:
  std::map<popart::TensorId, popart::MutableVoidData> _weights;
  // Kept alongside _weights; presumably the registration order - confirm.
  std::vector<popart::TensorId> _weights_order;
};

// Compare a ConstVoidData based on type, shape, and data.
// Function object exposing a "less than" call operator (defined out of line),
// i.e. the shape expected of an ordering predicate for ordered containers.
struct ConstVoidDataLessThan {
  bool operator()(const popart::ConstVoidData &lhs,
                  const popart::ConstVoidData &rhs) const;
};

struct CompilerImpl {
public:
  friend Compiler;

  // Creates the top level builder, makes it the active one and reserves
  // index 0 of the tensor tables for the "None" tensor.
  CompilerImpl()
      : op_builder(popart::Builder::create()),
        active_builder(op_builder.get()) {
    using_overlapped_io = false;
    // None tensor: empty id with an undefined type.
    ids.emplace_back("");
    ids_types.push_back(PopartType::UNDEFINED);
  }
  ~CompilerImpl();

  std::unique_ptr<popart::Builder> op_builder;

  // Op_builder is the top level graph. However to support subgraphs we switch
  // between adding ops from each of these subgraphs. All subgraphs are children
  // of the op_builder top level graph.
  popart::Builder *active_builder;

  // Stacks for subgraphs realizing true/false branch paths.
  std::stack<popart::Builder *> if_true_stack;
  std::stack<popart::Builder *> if_false_stack;

  std::map<popart::TensorId, popart::AnchorReturnType> anchors;

  std::vector<popart::TensorId> ids;
  std::vector<PopartType> ids_types;

  // Input tensors to the session.
  std::map<popart::TensorId, popart::IArray &> popart_incoming;

  // Output tensors for the session.
  std::map<popart::TensorId, popart::IArray &> popart_outgoing;
  std::map<popart::TensorId, std::vector<void *>> outgoing_duplicates;

  std::vector<popart::TensorId> inputs;
  std::vector<popart::TensorId> outputs;
  // Flat representation of the output shapes
  std::vector<OutputTypeShape> output_types;

  // A list to allocate our buffers in so they get released.
  std::list<std::unique_ptr<popart::IArray>> memory_manager;

  std::unique_ptr<popart::Session> session;

  StepIO stepio;
  WeightsIO weights;
  WeightsIO updatable_named_buffers;
  WeightsIO optim_state_tensors;

  bool is_training = false;

  // At least one use of overlapped host IO
  bool using_overlapped_io = false;

  // Record the final loss, it is guaranteed by previous passes to be just one
  // loss.
  popart::TensorId loss;

  // List of options which have been explicitely set by the user.
  std::set<std::string> options_set;

  popart::SessionOptions popart_options{};

  CompilerOptions options{};

  // We add operations using a state based system so the user would set the
  // active IPU and all subsequent operations will be added to that IPU until
  // stopped.
  // By default, the active IPU is 0 in case setActiveIpu is never used.
  // However, clearActiveIpu will set it to -1 making future use of
  // setActiveIpu compulsory.
  std::int64_t active_ipu{0};
  std::uint64_t active_stage{0};
  std::int64_t active_phase{0};
  // Keep track of what the maximum phase number used is.
  std::int64_t max_phase{0};

  // Number of ipus used (set by createDevice())
  std::uint64_t num_ipus{0};

  // Which IPUs are being used
  // Note that this does not take into account replication and so the number of
  // IPUs actually used is multiplied by popart_options.replicatedGraphCount.
  // Due to rounding and the issues with skipping an IPU in a range, the number
  // of IPUs required may increase further.
  std::unordered_set<std::uint64_t> used_ipus;

  // Keep the number of ipu switches to work out the number of pipeline stages
  // if relevant.
  std::uint64_t num_ipu_switches{0};

  // Store the last ipu used: this will always match active_ipu unless
  // active_ipu is set to -1.
  std::uint64_t last_ipu_used{0};

  // Map of the pytorch variable update group to the popart weight.
  std::map<std::uint64_t, std::vector<popart::TensorId>> grad_update_groups;

  std::unique_ptr<MultiConvBuilder> multi_conv_builder;

  // Dynamic container for all the callbacks to live in.
  std::list<CallbackInternalMetadata> callbacks;

  // Returns the number of pipeline stages in the model execution
  std::uint64_t numPipelineStages();

  popart::SourceLocation code_location;
  std::string torch_node;

  // General helpers.

  // Inserts memory into the list of tensors being output by the model.
  void addMemoryToOutput(TensorId id, void *ptr,
                         std::unique_ptr<popart::IArray> &&memory);

  // Domain helpers
  popart::TensorId reshape(const std::vector<popart::TensorId> &tensors,
                           const std::vector<int64_t> &shape);

  void addOutputTensor(const std::vector<popart::TensorId> &tensors);

  popart::TensorId
  addUntypedInputTensor(const std::vector<popart::TensorId> &tensors);

  std::vector<popart::TensorId> customOperation(
      const std::vector<popart::TensorId> &args, const std::string &op,
      const std::string &domain, std::int64_t version, std::int64_t num_outputs,
      const std::shared_ptr<std::vector<PopartAttribute>> &attributes);

  popart::TensorId
  recomputationCheckpoint(const std::vector<popart::TensorId> &tensors);

  popart::TensorId tensorConstant(const std::vector<popart::TensorId> &tensors,
                                  const PopartConstant &constant);

  TensorId hostSideTensorConstant(const std::vector<popart::TensorId> &tensors,
                                  HostSideConstant constant);

  popart::TensorId addNotInPlace(const std::vector<popart::TensorId> &in);

  popart::TensorId randomNormal(const std::vector<popart::TensorId> &tensors,
                                const std::vector<int64_t> &shape, float mean,
                                float scale, const std::string &dtype);

  popart::TensorId randomUniform(const std::vector<popart::TensorId> &tensors,
                                 const std::vector<int64_t> &shape, float high,
                                 float low, const std::string &dtype);

  popart::TensorId ones(const std::vector<popart::TensorId> &tensors,
                        const std::vector<int64_t> &shape,
                        const std::string &dtype);

  popart::TensorId zeros(const std::vector<popart::TensorId> &tensors,
                         const std::vector<int64_t> &shape,
                         const std::string &dtype);

  popart::TensorId zerosOrOnes(const std::vector<popart::TensorId> &tensors,
                               const std::vector<int64_t> &shape,
                               const std::string &dtype, bool zeros);

  popart::TensorId unfold(const std::vector<popart::TensorId> &tensors,
                          int64_t dimension, int64_t size, int64_t step);

  popart::TensorId prelu(std::vector<popart::TensorId> &tensors);

  void addMultiConvPart(const std::vector<popart::TensorId> &tensors,
                        const std::vector<int64_t> &dilations,
                        const std::vector<int64_t> &kernel_shape,
                        const std::vector<int64_t> &pads,
                        const std::vector<int64_t> &strides);

  std::vector<popart::TensorId> endMultiConv();

  void optimizerGroup(const std::vector<TensorId> &tensors, int64_t group) {
    std::vector<popart::TensorId> ins;
    std::transform(tensors.begin(), tensors.end(), std::back_inserter(ins),
                   [&](TensorId index) { return ids[index]; });

    grad_update_groups.insert({group, ins});
  }

  std::unique_ptr<popart::Optimizer>
  getPopartOptimizer(std::vector<Optimizer> optimizers);

  void updateUseModelConfig();
  std::string checkSystemConfig() const;
  // Assigns `value` to `option`, unless `name` is recorded in `options_set`
  // and `option` currently differs from `value`: in that case `option` is left
  // untouched and a warning is logged.
  //
  //   option          - storage of the option to update.
  //   value           - value to apply; converted to T for the comparison.
  //   name            - key looked up in `options_set`, also used for logging.
  //   value_as_string - printable form of `value` used in the log messages.
  template <typename T, typename U>
  void setOptionIfNotSet(T &option, U value, const std::string &name,
                         const std::string &value_as_string) {
    // The comparison is only evaluated when the option name was recorded.
    const bool keep_user_value =
        options_set.count(name) != 0 && option != static_cast<T>(value);

    if (!keep_user_value) {
      logging::debug("{} set to value {}", name, value_as_string);
      option = value;
      return;
    }

    logging::warn("{} forced by the user from default to {}, "
                  "ignoring value {}",
                  name, option, value_as_string);
  }

  // Convenience overload: the printable form of `value` used for logging is
  // produced with std::to_string before delegating to the 4-argument version.
  template <typename T, typename U>
  void setOptionIfNotSet(T &option, U value, const std::string &name) {
    const std::string printable_value = std::to_string(value);
    setOptionIfNotSet(option, value, name, printable_value);
  }

  void
  setExecutionStrategyAttributes(const std::set<popart::TensorId> &tensors);

  const HostSideConstant &getHostSideConstant(TensorId id) const;

  bool isHostSideConstant(TensorId id) const;

  /* must_attach is a special case for on_demand devices */
  std::shared_ptr<popart::DeviceInfo> createDevice(bool must_attach = false);
  bool waitIfUnavailable() const;
  void attachToDevice();
  void detachFromDevice();
  bool isAttachedToDevice() const;

  template <typename OptimizerType>
  void updateGroups(OptimizerType *optimizer,
                    const std::vector<Optimizer> &optimizers);
  std::string getPopartIR() const;

  std::set<popart::TensorId> getTensorNames() const;

  // Returns the PopART type for specified id
  PopartType getPopartType(TensorId id) const;

  // Caches all PopART types
  void cachePopartTypes();

  // Returns cached PopART type for the specified id
  // (read directly from ids_types[id]).
  // Caution: no bounds checking as this is called for each input, each run.
  // cachePopartTypes() must be called once first.
  PopartType getCachedPopartType(TensorId id) const { return ids_types[id]; }

  void setAttribute(const std::string &attribute, const std::string &key,
                    const std::string &value);
  void clearAttribute(const std::string &attribute, const std::string &key);

  popart::DebugContext getDebugContext(const std::string &name);

  // Mark named buffer as updatable
  void registerUpdatableNamedBuffer(const TensorId &id,
                                    const popart::TensorInfo &info);

private:
  // Raise an error if cycle logging is enabled
  void errorOnCycleLogging() const;

  // Keep all the PopART tensors in a cache to avoid adding duplicate constants,
  // wasting tile memory. This must also be mapped by builder as constants
  // exist only in one graph.
  std::map<popart::Builder *, std::map<popart::ConstVoidData, popart::TensorId,
                                       ConstVoidDataLessThan>>
      _constants_cache;
  std::vector<std::unique_ptr<char[]>> _constant_cloned_data;

  // Constants which are simply returned (possibly as part of a tuple/list) and
  // do not need to be input into Popart
  std::unordered_map<TensorId, HostSideConstant> _host_side_constants;
  std::shared_ptr<popart::DeviceInfo> _device;

  std::unordered_map<std::string, std::map<std::string, std::string>>
      _attribute_key_value_map;
};

} // namespace detail

popart::DataType popartTypeFromPoptorch(PopartType);

poplar::Type poplarTypeFromPoptorch(PopartType);

} // namespace popart_compiler
} // namespace poptorch


================================================
FILE: popart_compiler/source/include/popart_compiler/CompilerOptions.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#pragma once

#include <cstdint>
#include <limits>
#include <string>
#include <unordered_map>

#include <popart/commgroup.hpp>
#include <popart/patterns/patterns.hpp>
#include <popart/popx/devicexmanager.hpp>

#include "popart_compiler/PopartEnums.hpp"

namespace poptorch {
namespace popart_compiler {
namespace detail {

// Execution strategy selected for the model (see
// CompilerOptions::execution_mode).
// NOTE(review): N looks like a trailing sentinel (presumably the number of
// real modes) rather than a selectable mode -- confirm.
enum class ExecutionMode { Pipelined, Sharded, Phased, N };

// Tensor liveness policy used with phased execution: the effect of each value
// on the phase numbering is described next to
// CompilerOptions::tensors_liveness.
// To be kept in sync with the Liveness python enum in python/enums.py
// NOTE(review): N looks like a trailing sentinel (presumably the number of
// real values) -- confirm.
enum class Liveness {
  AlwaysLive,
  OffChipAfterFwd,
  OffChipAfterFwdNoOverlap,
  OffChipAfterEachPhase,
  N
};

// PopTorch-specific compilation options (as opposed to the options forwarded
// to PopART).
//
// Every member has a default member initialiser so that a default-constructed
// CompilerOptions never holds indeterminate values.
struct CompilerOptions {
  // A constant to tell the compiler to use the system ipu version
  constexpr static std::uint64_t use_system_ipu_version =
      std::numeric_limits<std::uint64_t>::max();

  // Make PopART save the initializers in a separate file.
  // (Needed to keep the ONNX protobuf below the 2GB limit when compiling
  // large models)
  std::string external_initializers_file;
  // Number of times the graph will be executed for each execution.
  std::uint64_t steps{0};
  // Strategy to adopt for returning the graph's output tensors.
  // Value-initialised, i.e. PopartOutputMode::Final (the enumerator with
  // value 0).
  PopartOutputMode output_mode{};
  // 'N' when output_mode == PopartOutputMode::EveryN
  std::uint64_t output_return_period{0};
  // True if running on the model, False otherwise.
  bool ipu_model{false};
  // Automatically round up the number of IPUs, if required, to the minimum
  // number required to be reserved
  bool auto_round_num_ipus{false};
  // Only used for offline compilation (DeviceConnectionType.Never): version
  // of the IPU should the Poplar compiler be targeting.
  std::uint64_t ipu_version{use_system_ipu_version};
  // ID of the specific IPU the user wants to use. (If not set we'll just
  // iterate over the IPUs present on the system and try to connect to one
  // that matches our requirements).
  std::uint64_t ipu_id{0};
  // Value-initialised (enumerator with value 0) rather than left
  // indeterminate.
  // NOTE(review): confirm the zero enumerators of these two PopART enums are
  // acceptable defaults.
  popart::DeviceConnectionType connection_type{};
  popart::SyncPattern sync_pattern{};
  std::uint64_t random_seed{0};

  // The frontend will unpack the user option and pass it directly in as
  // [IPU_ID] = Memory proportion for that IPU
  std::unordered_map<std::uint32_t, float> available_memory_proportion;

  // When running in distributed mode: number of processes the training is
  // split over.
  std::uint64_t num_distributed_processes{1};
  // In distributed mode: unique ID of this process in the
  // [0, num_distributed_processes] range
  std::uint64_t distributed_process_id{0};

  popart::Patterns patterns{popart::PatternsLevel::Default};
  ExecutionMode execution_mode{};

  // Phased execution options: see the python documentation for more
  // information about how to use them
  //
  // Here is how they translate into Popart options:
  // serial_phases_execution: True -> executionPhaseSettings.stages = 1
  //                          False-> executionPhaseSettings.stages = 2
  //
  // separate_backward_phase:
  //  False:
  //   fwd:       bwd:
  //   phase 0 -> phase 4
  //   phase 1 -> phase 3
  //   phase 2 -> phase 2
  //
  // (End of fwd and start of bwd are part of the same phase)
  //  True:
  //   fwd:       bwd:
  //   phase 0 -> phase 6
  //   phase 1 -> phase 5
  //   phase 2 -> phase 4
  //
  //  This is done by setting options.executionPhaseSettings.phases to N+1
  //
  //  Note that the bwd phases begin with phase 4 and not phase 3. This is
  //  because PopART requires the phase IDs of a fwd/bwd pair to have matching
  //  parity. Since the fwd phase ID is 2, the next phase ID with even parity
  //  is 4.
  //
  //  Furthermore, all odd phases must run on the same IPUs, and all even
  //  phases must also run on the same IPUs.
  //
  // tensors_liveness:
  //  Note: tensors have a liveness of [phase, phase+2]
  //  AlwaysLive:
  //   fwd:       bwd:
  //   phase 0 -> phase 6
  //   phase 1 -> phase 5
  //   phase 2 -> phase 4
  // Stride = 1
  //
  //  OffChipAfterFwd:
  //   fwd:       bwd:
  //   phase 0 -> phase 8
  //   phase 1 -> phase 7
  //   phase 2 -> phase 6
  // Stride = 1
  // (Gap between fwd and bwd > 2)
  //  This is done by incrementing options.executionPhaseSettings.phases by 3
  //
  //  OffChipAfterFwdNoOverlap:
  //   fwd:       bwd:
  //   phase 0 -> phase 12
  //   phase 2 -> phase 10
  //   phase 4 -> phase 8
  // Stride = 2
  // (Gap between fwd and bwd > 2, with no overlapping of load/store)
  //  This is done by incrementing options.executionPhaseSettings.phases by 3
  //  and multiplying the phase_id by 2.
  //
  //  OffChipAfterEachPhase: (Only for stage=1)
  //   fwd:       bwd:
  //   phase 0 -> phase 20
  //   phase 4 -> phase 16
  //   phase 8 -> phase 12
  // Stride = 4
  // (Gap between each phase > 2)
  // This is done by incrementing options.executionPhaseSettings.phases by 3
  // and multiplying the phase_id by 4.
  bool serial_phases_execution{false};
  bool separate_backward_phase{false};
  Liveness tensors_liveness{};

  // Debug name for the model
  std::string model_name;

  // (Not yet supported) Whether each buffer should be broadcasted from the
  // first to other replicas on each training step.
  bool broadcast_buffers{false};

  // Value-initialised rather than left indeterminate.
  // NOTE(review): confirm 0 / the zero enumerator are acceptable defaults for
  // the input communication group settings.
  std::int64_t input_group_size{0};
  popart::CommGroupType input_cgt{};
};

} // namespace detail
} // namespace popart_compiler
} // namespace poptorch


================================================
FILE: popart_compiler/source/include/popart_compiler/CustomOps.hpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.

#pragma once

/*
 * Host op represents an operation executed on the CPU. It is offloaded by
 * writing the tensors from the IPU into host buffers, triggering the
 * operation, then writing back to IPU tensors.
 *
 * This header defines namespace-scope objects, so it must only be processed
 * once per translation unit (hence the #pragma once above).
 */

#include <cstdint>
#include <popart/op.hpp>

extern "C" {

namespace poptorch {
namespace poptorch_custom_ops {

// Value passed as the third field of several OperatorIdentifier definitions
// below.
// NOTE(review): despite its name this is presumably the operator *version*
// (the domain string being "poptorch.custom_ops") -- confirm against
// popart::OperatorIdentifier.
constexpr std::uint32_t domain = 1;

// The number of input tensors the host op can consume (between min_inputs and
// max_inputs).
constexpr std::uint32_t min_inputs = 0;
constexpr std::uint32_t max_inputs = 64;

// Defined outside this header.
extern const char host_op_metadata_attr[];

// Identifier of the op executed on the host CPU.
const popart::OperatorIdentifier host_op = {"poptorch.custom_ops",
                                            "HostOp",
                                            domain,
                                            {min_inputs, max_inputs}}; // NOLINT

// Bilinear 2D upsampling and its gradient.
const popart::OperatorIdentifier upsample_bilinear2d = {
    "poptorch.custom_ops", "UpsampleBilinear2d", 1};
const popart::OperatorIdentifier upsample_bilinear2d_grad = {
    "poptorch.custom_ops", "UpsampleBilinear2dGrad", 1};

// Softplus: out-of-place, in-place and gradient variants.
const popart::OperatorIdentifier torch_softplus = {
    "poptorch.custom_ops", "TorchSoftplus", 1, {1}, 1};

const popart::OperatorIdentifier torch_softplus_inplace = {
    "poptorch.custom_ops", "TorchSoftplusInplace", 1, {1}, 1};

const popart::OperatorIdentifier torch_softplus_grad = {
    "poptorch.custom_ops", "TorchSoftplusGrad", 1, {1}, 1};

// Embedding and its gradient.
const popart::OperatorIdentifier embedding = {"poptorch.custom_ops",
                                              "Embedding", domain};

const popart::OperatorIdentifier embedding_grad = {"poptorch.custom_ops",
                                                   "EmbeddingGrad", domain};

// Gather along the last dimension and its gradient.
const popart::OperatorIdentifier fast_gather_last_dim = {
    "poptorch.custom_ops", "FastGatherLastDim", 1};

const popart::OperatorIdentifier fast_gather_last_dim_grad = {
    "poptorch.custom_ops", "FastGatherLastDimGrad", 1};

} // namespace poptorch_custom_ops
} // namespace poptorch
}


================================================
FILE: popart_compiler/source/include/popart_compiler/MultiConvBuilder.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#pragma once

#include <cstdint>
#include <string>
#include <vector>

#include <popart/builder.hpp>
#include <popart/op/convbase.hpp>
#include <popart/tensors.hpp>

#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace popart_compiler {
namespace detail {

// Collects the inputs, attributes and options of several convolutions so they
// can be emitted together as one PopART multiconv op by build().
class MultiConvBuilder {
public:
  // Appends one convolution (its input tensors and its attributes) to the
  // set of convolutions to be fused.
  void addConv(const std::vector<popart::TensorId> &inputs,
               const std::vector<int64_t> &dilations,
               const std::vector<int64_t> &kernel_shape,
               const std::vector<int64_t> &pads,
               const std::vector<int64_t> &strides) {
    _inputs.emplace_back(inputs);
    _dilations.emplace_back(dilations);
    _kernel_shape.emplace_back(kernel_shape);
    _pads.emplace_back(pads);
    _strides.emplace_back(strides);
  }

  // Sets the available memory proportions forwarded to the multiconv.
  void setAvailableMemoryProportions(const std::vector<float> &v) {
    _options.availableMemoryProportions = v;
  }

  // Translates numeric partials type codes into the names stored in the
  // options: 0 -> "float", 1 -> "half". Any other code is an error.
  void setPartialsTypes(const std::vector<int64_t> &partials_types) {
    std::vector<std::string> names;
    names.reserve(partials_types.size());

    for (const int64_t code : partials_types) {
      switch (code) {
      case 0:
        names.emplace_back("float");
        break;
      case 1:
        names.emplace_back("half");
        break;
      default: {
        ERROR("Invalid MultiConv partials_types");
      }
      }
    }

    _options.partialsTypes = names;
  }

  // Sets the conv dithering flags forwarded to the multiconv.
  void setEnableConvDithering(const std::vector<int64_t> &v) {
    _options.enableConvDithering = v;
  }

  // Translates a numeric plan type into the name stored in the options:
  // 0 -> "parallel", 1 -> "serial". Any other value is an error.
  void setPlanType(int64_t plan_type) {
    switch (plan_type) {
    case 0:
      _options.planType = "parallel";
      break;
    case 1:
      _options.planType = "serial";
      break;
    default: {
      ERROR("Invalid MultiConv plan_type");
    }
    }
  }

  void setPerConvReservedTiles(int n) { _options.perConvReservedTiles = n; }

  void setCycleBackOff(float v) { _options.cycleBackOff = v; }

  // Emits the multiconv op through the builder's aiGraphcoreOpset1 and
  // returns the ids of its outputs.
  // Note: the recorded kernel shapes are not passed to the op; the two `{}`
  // arguments are left empty.
  std::vector<popart::TensorId> build(popart::Builder *builder) const {
    auto graphcore_opset = builder->aiGraphcoreOpset1();
    std::vector<popart::TensorId> outputs = graphcore_opset.multiconv(
        _inputs, _dilations, {}, _pads, {}, _strides,
        _options.availableMemoryProportions, _options.partialsTypes,
        _options.planType, _options.perConvReservedTiles,
        _options.cycleBackOff, _options.enableConvDithering);
    return outputs;
  }

private:
  // One entry per convolution added with addConv(), in insertion order.
  std::vector<std::vector<popart::TensorId>> _inputs;
  std::vector<std::vector<int64_t>> _dilations;
  std::vector<std::vector<int64_t>> _kernel_shape;
  std::vector<std::vector<int64_t>> _pads;
  std::vector<std::vector<int64_t>> _strides;
  // Options shared by the whole multiconv.
  popart::MultiConvOptions _options = {{}, {}};
};

} // namespace detail
} // namespace popart_compiler
} // namespace poptorch


================================================
FILE: popart_compiler/source/include/popart_compiler/SessionOptionsImpl.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#pragma once

#include <functional>
#include <map>
#include <set>
#include <string>
#include <utility>

#include "popart/sessionoptions.hpp"
#include "popart_compiler/CompilerOptions.hpp"

namespace poptorch {
namespace popart_compiler {
namespace detail {

// Holds the PopART session options, the PopTorch compiler options and the
// tables of setter callbacks used to assign an option from its name.
struct SessionOptionsImpl {
  SessionOptionsImpl();

  // Setter callbacks keyed by option name, one table per value type.
  std::map<std::string, std::function<void(bool)>> bool_options;
  std::map<std::string, std::function<void(std::uint64_t)>> uint64_options;
  std::map<std::string, std::function<void(std::string)>> string_options;
  std::map<std::string, std::function<void(double)>> double_options;

  // Same, for options taking a (string, string) pair.
  std::map<std::string,
           std::function<void(std::pair<std::string, std::string>)>>
      container_options;

  // Names of all the options assigned through set().
  std::set<std::string> options_set;

  popart::SessionOptions popart_options;
  CompilerOptions poptorch_options;

  // Records the memory proportion to use for IPU `ipu`.
  void setMemoryProportion(std::uint32_t ipu, float memory) {
    auto &proportions = poptorch_options.available_memory_proportion;
    proportions[ipu] = memory;
  }

  // Looks up the setter registered for `key` in `options`, invokes it with
  // `value` and records `key` in options_set.
  // Raises an error mentioning `typeStr` if `key` is not in `options`.
  template <typename ValueType>
  void set(const std::string &key, ValueType value,
           std::map<std::string, std::function<void(ValueType)>> &options,
           const std::string &typeStr) {
    const auto it = options.find(key);
    ERROR_ON_MSG(it == options.end(),
                 "Unknown " << typeStr << " option " << key);

    const std::function<void(ValueType)> &setter = it->second;
    setter(value);
    // Only reached when the setter returned normally.
    options_set.insert(key);
  }
};

} // namespace detail
} // namespace popart_compiler
} // namespace poptorch


================================================
FILE: popart_compiler/types/include/popart_compiler/CompilerTypes.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#pragma once

#include <functional>
#include <utility>
#include <vector>

#include "popart_compiler/PopartEnums.hpp"

// This header should contain ABI agnostic data types which are
// used to share data with other PopTorch components.
// Types in this file must not depend on external symbols.
namespace poptorch {
namespace popart_compiler {

// PopTorch abstraction of popart::MutableVoidData to be used across the ABI
// boundary
//
// All members have a default value so that a default-constructed object is
// fully initialised: null pointers, an empty shape and num_bytes == -1.
struct TensorMetadata {
  const char *id = nullptr;
  std::vector<int64_t> shape;
  const char *dtype = nullptr;
  void *data = nullptr;
  int64_t num_bytes = -1;
};

/*
  We use this callback structure to capture data from the poptorch python
  frontend. We get the function to call as well as pointers to the output/input
  storage waiting on CPU. From this we derive more data, see
  CallbackInternalMetadata in CompilerImpl.hpp.
*/
struct CallbackMetadata {
  // The thing we are calling back.
  std::function<void()> the_callback;

  // Due to tracing complexities we have to register the buffers as a separate
  // step after the model has been traced.
  std::function<void()> buffer_registration_callback;

  // Pointers to the buffers we created on host.
  std::vector<void *> input_pointers;
  std::vector<void *> output_pointers;
};

// Integer handle used by PopTorch to identify a tensor.
using TensorId = std::size_t;

// Handle value 0.
// NOTE(review): presumably reserved to mean "no tensor" -- confirm.
static constexpr TensorId NoneTensor = 0; // NOLINT

// Kind of an output element: a tensor, a tuple or a list.
enum class OutputElemType { Tensor, Tuple, List };

// For testing only: throw an exception of the selected type.
// NOTE(review): each enumerator presumably names the component / category of
// the exception to raise -- confirm against the code consuming this enum.
enum class TestErrorType {
  Poptorch,
  Popart,
  PopartInternal,
  Poplibs,
  PoplarUnrecoverable,
  PoplarUnknown,
  PoplarRecoverableFullReset,
  PoplarLinkError
};

// Describes one output element: its kind and a number of elements.
// NOTE(review): num_elements is presumably only meaningful for Tuple / List
// -- confirm. `type` has no default value and must be set by the creator.
struct OutputTypeShape {
  OutputElemType type;
  int64_t num_elements{0};
};

// Four series of timestamps, each stored as a list of lists of doubles.
// NOTE(review): presumably one inner list per input / output tensor, holding
// the times at which its transfers started (`input` / `output`) and finished
// (`*_complete`) -- confirm against the producer of this structure.
struct Timestamps {
  std::vector<std::vector<double>> input;
  std::vector<std::vector<double>> input_complete;
  std::vector<std::vector<double>> output;
  std::vector<std::vector<double>> output_complete;
};

// ABI-agnostic description of an optimizer: its type, a few flags and a list
// of named parameters.
//
// Every member not set by a given constructor falls back to its default
// member initialiser, so an Optimizer never holds indeterminate values
// (which would otherwise be read when the object is copied).
struct Optimizer {
  // One named optimizer parameter.
  struct Parameter {
    char name[32];
    float value;
    bool is_const;
  };
  using ParamType = std::pair<float, bool>;

  // Accumulation types not provided, no max grad norm.
  explicit Optimizer(OptimizerType t, bool useTfVariant)
      : type(t), accum_types_provided(false), use_tf_variant(useTfVariant) {}
  // Accumulation types not provided.
  explicit Optimizer(OptimizerType t, bool useTfVariant, float maxGradNorm)
      : type(t), accum_types_provided(false), use_tf_variant(useTfVariant),
        max_grad_norm(maxGradNorm) {}
  // All accumulation types provided.
  Optimizer(OptimizerType t, bool accumType, bool firstOrderType,
            bool secondOrderType, bool useTfVariant, float maxGradNorm)
      : type(t), accum_types_provided(true), accum_type_is_half(accumType),
        first_order_momentum_accum_type_is_half(firstOrderType),
        second_order_momentum_accum_type_is_half(secondOrderType),
        use_tf_variant(useTfVariant), max_grad_norm(maxGradNorm) {}

  OptimizerType type;
  // True if the main, first and second order accum types have been set.
  bool accum_types_provided;
  // Special parameters for adam/lamb. If true accumulations will be half
  // otherwise will be float.
  // Default to false when accum_types_provided is false.
  bool accum_type_is_half{false};
  bool first_order_momentum_accum_type_is_half{false};
  bool second_order_momentum_accum_type_is_half{false};
  bool use_tf_variant;
  // NOTE(review): 0 is only a deterministic placeholder for the constructor
  // which takes no maxGradNorm; confirm no consumer relies on this value in
  // that case.
  float max_grad_norm{0.0F};

  std::vector<Parameter> parameters;
};

} // namespace popart_compiler
} // namespace poptorch


================================================
FILE: popart_compiler/types/include/popart_compiler/PopartEnums.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef POPART_COMPILER_POPART_ENUMS_HPP
#define POPART_COMPILER_POPART_ENUMS_HPP
#include <string>

#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace popart_compiler {

/*
 * We maintain an ABI boundary between PopART and Torch JIT. This avoids the
 * issue of torch being compiled with different CXX_ABI versions. However, it
 * means we must replicate the PopART enums here so they can be shared by both.
 */

// The training optimizer algorithm used.
// Stored as a std::uint8_t; the values are sequential, starting from
// SGD1 = 0.
enum class OptimizerType : std::uint8_t {
  SGD1 = 0,
  SGD2,
  ADAM,
  ADAMW,
  ADAMW_NO_BIAS,
  RMSPROP,
  RMSPROP_CENTERED,
  LAMB,
  LAMB_NO_BIAS,
  NONE
};

// X-macro: applies the macro `_` to the name of every fixed point type
// (BOOL is listed here too).
#define FOR_ALL_FIXED_POINT_TYPES(_)                                           \
  _(UINT8)                                                                     \
  _(INT8)                                                                      \
  _(UINT16)                                                                    \
  _(INT16)                                                                     \
  _(INT32)                                                                     \
  _(INT64)                                                                     \
  _(UINT32)                                                                    \
  _(UINT64)                                                                    \
  _(BOOL)

// X-macro: applies the macro `_` to the name of every floating point type
// (the complex types are listed here too).
#define FOR_ALL_FLOATING_POINT_TYPES(_)                                        \
  _(FLOAT)                                                                     \
  _(FLOAT16)                                                                   \
  _(BFLOAT16)                                                                  \
  _(FLOAT8_143)                                                                \
  _(FLOAT8_152)                                                                \
  _(DOUBLE)                                                                    \
  _(COMPLEX64)                                                                 \
  _(COMPLEX128)

// X-macro: applies the macro `_` to every type name: the fixed point ones,
// the floating point ones, then STRING and UNDEFINED.
#define FOR_ALL_POPART_TYPES(_)                                                \
  FOR_ALL_FIXED_POINT_TYPES(_)                                                 \
  FOR_ALL_FLOATING_POINT_TYPES(_)                                              \
  _(STRING)                                                                    \
  _(UNDEFINED)

// The types supported by popart.
// The enumerators are generated from FOR_ALL_POPART_TYPES, in the order the
// names are listed there; DEFINE_ENUM only exists for the duration of the
// expansion.
#define DEFINE_ENUM(value) value,
enum class PopartType { FOR_ALL_POPART_TYPES(DEFINE_ENUM) };
#undef DEFINE_ENUM

// Generates one `case` per type, returning the enumerator's name as text
// (via the preprocessor stringification of `value`).
#define DEFINE_CASE(value)                                                     \
  case PopartType::value: {                                                    \
    return #value;                                                             \
  }

// Returns the name of `type` as a string (for example "FLOAT16" for
// PopartType::FLOAT16). Values which do not match any enumerator are reported
// through ERROR.
inline std::string toPopartTypeStr(const PopartType &type) {
  switch (type) {
    FOR_ALL_POPART_TYPES(DEFINE_CASE)
  default:
    ERROR("Unsupported PopartType");
  }
}
#undef DEFINE_CASE

// See AnchorReturnTypeId in popart/dataflow.hpp for a full description of each.
// Must be kept in sync with OutputMode in python/enums.py
// Stored as a std::uint8_t, starting from Final = 0. N is the last enumerator
// and has no string form in outputModeFromString() / outputModeToString().
enum class PopartOutputMode : std::uint8_t { Final = 0, EveryN, All, Sum, N };

// Must be static so each library gets its own copy,  __attribute__((unused)) is
// to silence the warning if it is unused in any of them.
static PopartOutputMode outputModeFromString(const std::string &str)
    __attribute__((unused));
static const char *outputModeToString(PopartOutputMode type)
    __attribute__((unused));

// Converts the upper-case name of an output mode ("FINAL", "EVERYN", "ALL" or
// "SUM") into the matching PopartOutputMode. Any other string is reported as
// an internal error.
static PopartOutputMode outputModeFromString(const std::string &str) {
  struct NamedMode {
    const char *name;
    PopartOutputMode mode;
  };
  const NamedMode known_modes[] = {
      {"FINAL", PopartOutputMode::Final},
      {"EVERYN", PopartOutputMode::EveryN},
      {"ALL", PopartOutputMode::All},
      {"SUM", PopartOutputMode::Sum},
  };

  for (const NamedMode &candidate : known_modes) {
    if (str == candidate.name) {
      return candidate.mode;
    }
  }

  ERROR("Internal error: unsupported output mode :" << str);
}

// Popart only supports a string interface for them so we have to convert back.
//
// Returns the upper-case name of `type`: "FINAL", "EVERYN", "ALL" or "SUM".
// These are exactly the strings accepted by outputModeFromString(), so the
// two functions round-trip. Any other value (including PopartOutputMode::N)
// is reported through ERROR.
static const char *outputModeToString(PopartOutputMode type) {
  switch (type) {
  case PopartOutputMode::Final:
    return "FINAL";
  case PopartOutputMode::EveryN:
    return "EVERYN";
  case PopartOutputMode::All:
    return "ALL";
  case PopartOutputMode::Sum:
    // Upper-case like the other modes (this used to be "Sum", which
    // outputModeFromString() rejects).
    return "SUM";
  default:
    ERROR("UNREACHABLE: Converting output mode to string");
  }
}

} // namespace popart_compiler
} // namespace poptorch

#endif // POPART_COMPILER_POPART_ENUMS_HPP


================================================
FILE: poptorch/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(poptorch)

# Header-only (INTERFACE) target giving its consumers access to both the
# include and source/include directories of this project.
add_library(poptorch_internal_headers INTERFACE)
target_include_directories(poptorch_internal_headers INTERFACE include source/include)

# Ensure ABI matches that of PyTorch
add_definitions(${TORCH_CXX_FLAGS})

# The poptorch shared library. Sources are grouped as: top-level passes and
# utilities, the popart_canonicalization handlers, then the
# type_and_constant_canonicalization passes.
add_library(poptorch SHARED
  "source/AddDetachOperations.cpp"
  "source/AddSubgraphConnectionNodes.cpp"
  "source/AliasProcessing.cpp"
  "source/CPUOffloadingCleanUp.cpp"
  "source/ErrorOnUnsupportedAten.cpp"
  "source/FixupSetAvailableMemory.cpp"
  "source/GatherWithExpandedIndicesOptimization.cpp"
  "source/ImplicitCasting.cpp"
  "source/InplaceOps.cpp"
  "source/LowerToPopart.cpp"
  "source/LowerToPopartFactories.cpp"
  "source/OpBuilder.cpp"
  "source/OverlappedIO.cpp"
  "source/PopartCanonicalization.cpp"
  "source/PopartLateCanonicalization.cpp"
  "source/PoplarExecutable.cpp"
  "source/PoptorchSymbols.cpp"
  "source/RemoveSurplusIdentityLosses.cpp"
  "source/RequiresGrad.cpp"
  "source/GNNOptimizations.cpp"
  "source/SessionOptionsParser.cpp"
  "source/Utils.cpp"
  "source/popart_canonicalization/ActivationOps.cpp"
  "source/popart_canonicalization/ArithmeticOps.cpp"
  "source/popart_canonicalization/AtenHandlers.gen.cpp"
  "source/popart_canonicalization/BilinearOps.cpp"
  "source/popart_canonicalization/BitwiseOps.cpp"
  "source/popart_canonicalization/BlasOps.cpp"
  "source/popart_canonicalization/ConstantOps.cpp"
  "source/popart_canonicalization/ConvolutionOps.cpp"
  "source/popart_canonicalization/CustomOps.cpp"
  "source/popart_canonicalization/DistanceOps.cpp"
  "source/popart_canonicalization/DropoutOps.cpp"
  "source/popart_canonicalization/EinsumOp.cpp"
  "source/popart_canonicalization/EmbeddingOps.cpp"
  "source/popart_canonicalization/IndexOps.cpp"
  "source/popart_canonicalization/LossOps.cpp"
  "source/popart_canonicalization/NormalizationOps.cpp"
  "source/popart_canonicalization/OtherOps.cpp"
  "source/popart_canonicalization/PoolingOps.cpp"
  "source/popart_canonicalization/PopartCanonicalizationUtils.cpp"
  "source/popart_canonicalization/PoptorchHandlers.gen.cpp"
  "source/popart_canonicalization/PyGTorchScatterOps.cpp"
  "source/popart_canonicalization/PyGTorchSplineConvOps.cpp"
  "source/popart_canonicalization/RNNOps.cpp"
  "source/popart_canonicalization/RandomSamplingOps.cpp"
  "source/popart_canonicalization/ReduceOps.cpp"
  "source/popart_canonicalization/ReshapeOps.cpp"
  "source/popart_canonicalization/SliceOps.cpp"
  "source/popart_canonicalization/SoftmaxOps.cpp"
  "source/popart_canonicalization/ScatterReduction.cpp"
  "source/popart_canonicalization/TensorOps.cpp"
  "source/popart_canonicalization/pyg_torch_cluster/FpsOp.cpp"
  "source/popart_canonicalization/pyg_torch_cluster/GridOp.cpp"
  "source/popart_canonicalization/pyg_torch_cluster/NearestOp.cpp"

  "source/type_and_constant_canonicalization/AddListNumElements.cpp"
  "source/type_and_constant_canonicalization/CanonicaliseConstants.cpp"
  "source/type_and_constant_canonicalization/CastUnsupportedInputs.cpp"
  "source/type_and_constant_canonicalization/CheckAndChangeOutputTypes.cpp"
  "source/type_and_constant_canonicalization/EvaluateConstexprs.cpp"
  "source/type_and_constant_canonicalization/MakeConstantIntParams.cpp"
)

# Collect every file matching *.hpp* under include/ (recursively): they are
# declared as the public headers of the library, which is built as C++17.
file(GLOB_RECURSE poptorch_public_headers "${CMAKE_CURRENT_SOURCE_DIR}/include/*.hpp*")

set_target_properties(poptorch PROPERTIES
  CXX_STANDARD 17
  PUBLIC_HEADER "${poptorch_public_headers}")

# Only popart_compiler_types is propagated to the consumers of poptorch
# (PUBLIC); all the other dependencies are private to the library.
target_link_libraries(poptorch PUBLIC
                               popart_compiler_types
                               PRIVATE
                               dispatch_tracer
                               popart_compiler
                               poptorch_logging
                               torch
                               stdc++fs
                               )


# include/ is public (from the source tree when building, from the install
# prefix once installed); source/include is only visible to poptorch itself.
target_include_directories(poptorch
                           PUBLIC
                           $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
                           $<INSTALL_INTERFACE:include>
                           PRIVATE
                           source/include)


# Install the shared library into the library directory and its public
# headers into <includedir>/poptorch.
install(TARGETS poptorch
  LIBRARY
    DESTINATION ${CMAKE_INSTALL_LIBDIR}
  PUBLIC_HEADER
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/poptorch
  )


================================================
FILE: poptorch/include/poptorch/DispatchTracer.hpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_DISPATCH_TRACER_HPP_
#define INCLUDE_POPTORCH_DISPATCH_TRACER_HPP_

#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <vector>

namespace at {
class Tensor;
}

namespace torch {
namespace jit {
struct Graph;
struct Node;
struct Value;
} // namespace jit
} // namespace torch

namespace poptorch {

struct CompilerOptions;
struct InplaceGraphInfo;
struct PoptorchErrorInfo;

// Toggled by the user in python to choose which backend to target when tracing.
// NOTE(review): this comment used to mention CPU and SENTINEL modes toggled
// internally, but POPART is the only enumerator declared here.
enum TracingMode {
  // Compile normal JIT to run via PopART
  POPART,
};

// Per-replica settings of a parameter, filled in by getParameterPerReplica().
// comm_group_type, shards and variable_retrieval_mode mirror the arguments of
// setParameterPerReplica().
// NOTE(review): size0 is presumably the size of the first dimension of the
// per-replica tensor and host_buffer the host-side storage backing it --
// confirm against the implementation.
struct PerReplicaSettings {
  int comm_group_type;
  int shards;
  int variable_retrieval_mode;
  int64_t size0;
  std::shared_ptr<std::vector<char>> host_buffer;
};

// Create a new graph.
void createGraph(TracingMode mode, const std::vector<at::Tensor> &inputs,
                 const CompilerOptions &options);

// The current graph is complete: finalize it.
//
// Trying to add ops after this call is undefined behaviour.
void finalizeGraph();

InplaceGraphInfo getInplaceGraphInfo(size_t num_anchors,
                                     bool replicas_needing_broadcast);

// Get the captured JIT graph. In reality this just returns the
// torch::jit::Graph which has been built during the dispatch process.
std::shared_ptr<torch::jit::Graph> getTracedGraph();

// Get a pointer to the data source for an IPU input / parameter tensor.
// If the value is not a parameter or an input, return nullptr.
void *getDataSource(const at::Tensor &tensor);

// Associate a name with the given parameter tensor.
// (The name can be read back with getParameterName().)
void setParameterName(const at::Tensor &tensor, const std::string &name);

// Return the name of a parameter or an empty string if no name was set.
std::string getParameterName(torch::jit::Value *value);

// Record per-replica settings for the given parameter.
// The integer arguments correspond to the fields of the same name in
// PerReplicaSettings.
// NOTE(review): the meaning / valid range of the integer codes is not visible
// from this header - confirm against the caller.
void setParameterPerReplica(const std::string &param_name,
                            const at::Tensor &tensor, int comm_group_type,
                            int shards, int variable_retrieval_mode);

// Retrieve the per-replica settings of the parameter behind the given value.
// NOTE(review): presumably returns true and fills `settings` only if
// per-replica settings were registered for that parameter - confirm.
bool getParameterPerReplica(torch::jit::Value *value,
                            PerReplicaSettings &settings);

// Get a pointer to the data source for a given JIT value.
// The value must be an IPU value.
// If the value is not a parameter or an input, return nullptr.
void *getDataSourceForValue(torch::jit::Value *value);

// Return true if the given IPU tensor is a parameter.
bool isParameter(torch::jit::Value *value);

// Start capturing calls.
void startDispatch();

// Stop capturing calls.
// NOTE(review): error_occurred presumably tells the dispatcher that capture
// is being aborted because of an error - confirm against the implementation.
void endDispatch(bool error_occurred = false);

// Called before starting to move parameters between the CPU and the IPU.
// (This is used to differentiate inputs from parameters / buffers)
// We expect something like:
// >>> poptorch_core.startParametersMove()
// >>> my_model.to("ipu")
// >>> poptorch_core.endParametersMove()
void startParametersMove();
// Called once the parameters have been moved (see the example above).
void endParametersMove();

// Called before starting to move outputs from the IPU to the CPU.
// Allows us to error if an attempt is made to move outputs outside
// of IPUScope.outputs().
void startOutputsMove();
// Counterpart of startOutputsMove(): called once the outputs have been moved.
void endOutputsMove();

// Return true if we should be compiling with the dispatcher.
bool isCompilingWithDispatcher();

// Cleans up all objects associated with poptorch
void poptorchAtExit();

// Destroy the active dispatcher object.
void destroyDispatcher();

// NOTE(review): undocumented; presumably tells the active dispatcher that
// v_old has been replaced by v_new in the graph so that it can update its
// own bookkeeping - confirm against the implementation.
void replaceValueDispatcher(torch::jit::Value *v_old, torch::jit::Value *v_new);

// Return the identifier of the given IPU tensor.
// NOTE(review): behaviour for a non-IPU tensor is not visible from here.
std::uint64_t getIpuTensorId(const at::Tensor &tensor);

// Signature of the callback used to raise a PoptorchError: it receives the
// error description and returns void.
using PoptorchErrorThrower = std::function<void(const PoptorchErrorInfo &info)>;

// Set the function to use to throw python PoptorchError exceptions.
void setPoptorchErrorThrower(PoptorchErrorThrower thrower);

// Throw an exception using the poptorch error thrower.
// Note: used by RegisterAtenOverloads.cpp in a template, that's why it needs
// to be declared publicly.
void throwPoptorchError(const PoptorchErrorInfo &info);

} // namespace poptorch

#endif // INCLUDE_POPTORCH_DISPATCH_TRACER_HPP_


================================================
FILE: poptorch/include/poptorch/InplaceOps.hpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_INPLACE_OPS_H
#define INCLUDE_POPTORCH_INPLACE_OPS_H

#include <algorithm>
#include <limits>
#include <memory>
#include <unordered_map>
#include <vector>

namespace c10 {
struct Symbol;
} // namespace c10

namespace torch {
namespace jit {
struct Graph;
struct Node;
using NodeKind = c10::Symbol;
struct Value;
} // namespace jit
} // namespace torch

namespace poptorch {

// Describes how the graph inputs which are modified in place map onto the
// graph outputs.
struct InplaceGraphInfo {
  // Sentinel stored in input_output_mapping for a graph input which is not
  // modified in place.
  static constexpr size_t no_mapping = std::numeric_limits<size_t>::max();

  // How many graph outputs are "real" outputs, i.e. are not there only to
  // emulate inplace ops. A single output may be a tensor, a list or a tuple.
  size_t num_normal_outputs = 0;

  // How many tensors are returned by the graph, again ignoring those used to
  // emulate inplace ops. This is not the same as num_normal_outputs when the
  // graph returns one or more tuples / lists.
  size_t num_tensor_outputs = 0;

  // One entry per Graph input (inputs and parameters, although only inputs
  // can have a mapping):
  //
  //   input_output_mapping[graph_input_idx] = graph_output_idx
  //       if the input at graph_input_idx is modified in place (the output
  //       at graph_output_idx is used to update it)
  //   input_output_mapping[graph_input_idx] = InplaceGraphInfo::no_mapping
  //       otherwise.
  std::vector<size_t> input_output_mapping;
};

// Get the NodeKind corresponding to the outplace version of the given
// inplace op NodeKind
torch::jit::NodeKind outplaceKind(torch::jit::NodeKind kind);

// Keeps track of which JIT values currently alias a tracked input tensor.
// The only state is the alias -> aliased input map below.
class InplaceInputsTracker {
public:
  // Start tracking the given input.
  // NOTE(review): presumably registers the input as an alias of itself -
  // confirm against the implementation.
  void addTensor(torch::jit::Value *input);
  // Find if the given value is an alias for an input, if so remove the alias
  // from the tracker and return the input it was aliasing. If the given value
  // doesn't alias an input return nullptr.
  torch::jit::Value *eraseCurrentAlias(torch::jit::Value *alias);
  // Record that `alias` now aliases the input `aliased_input`.
  void registerAlias(torch::jit::Value *aliased_input,
                     torch::jit::Value *alias);
  // Produce the InplaceGraphInfo for the given graph.
  // NOTE(review): whether / how the graph is modified, and the exact role of
  // num_anchors and replicas_needing_broadcast, is not visible from this
  // header.
  InplaceGraphInfo finalizeGraph(torch::jit::Graph &graph, size_t num_anchors,
                                 bool replicas_needing_broadcast);

private:
  // alias -> aliased
  std::unordered_map<torch::jit::Value *, torch::jit::Value *> _aliases;
};

// NOTE(review): undocumented graph pass; going by the name it adjusts the
// inputs of for loops in the graph (the graph is taken by non-const
// reference) - confirm against the implementation.
void fixForLoopInputs(torch::jit::Graph &graph);

// NOTE(review): undocumented check; going by the name it validates the order
// of the if / else blocks in the graph without modifying it (the graph is
// taken by const reference) - confirm how failures are reported.
void verifyIfElseBlocksOrder(const torch::jit::Graph &graph);
} // namespace poptorch

#endif // INCLUDE_POPTORCH_INPLACE_OPS_H


================================================
FILE: poptorch/include/poptorch/LowerToPopart.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_LOWER_TO_POPART_H
#define INCLUDE_POPTORCH_LOWER_TO_POPART_H

#include <torch/csrc/jit/ir/ir.h>

#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "popart_compiler/Compiler.hpp"
#include "popart_compiler/PopartEnums.hpp"
#include "poptorch/PoplarExecutable.hpp"
#include "poptorch/SessionOptionsParser.hpp"

namespace poptorch {
namespace popart_compiler {
class SessionOptions;
}

namespace detail {
class LowerToPopartImpl;
} // namespace detail

// CallbackMetadata is used to pass information from python to the poplar custom
// op for CPU ops. The string is the ID given by the user to each op.
using CPUCallbackMap =
    std::unordered_map<std::string, popart_compiler::CallbackMetadata>;

// A single anchor: a name together with a mode and a period.
// NOTE(review): the encoding of `mode` and the unit of `period` are not
// visible from this header.
struct Anchor {
  // The name is taken by value and moved into place.
  Anchor(std::string anchor_name, std::uint8_t anchor_mode,
         size_t anchor_period)
      : name(std::move(anchor_name)), mode(anchor_mode),
        period(anchor_period) {}

  std::string name;
  std::uint8_t mode;
  size_t period;
};

// Ordered collection of anchors.
using AnchorList = std::vector<Anchor>;

/*
 * Take the transformed graph and create a poponnx graph from it.
 */

struct InplaceGraphInfo;

// Lowers a (transformed) torch::jit::Graph to PopART and produces a
// PoplarExecutable from it.
//
// The implementation is hidden behind detail::LowerToPopartImpl (pimpl), which
// is only forward declared in this header: that is why the move constructor
// and the destructor are declared here and defined out of line.
class LowerToPopart {
public:
  // graph:              the graph to lower (not owned: raw pointer).
  // inplace_info:       description of the inputs modified in place.
  // training:           whether the graph is lowered for training.
  // opt:                the optimizers to use.
  // options:            the PopART session options.
  // attribute_accessor: callback returning a python attribute by name.
  // callback:           metadata of the CPU callbacks, keyed by op ID.
  // anchors:            the anchors requested by the user.
  LowerToPopart(torch::jit::Graph *graph, InplaceGraphInfo &&inplace_info,
                bool training, std::vector<popart_compiler::Optimizer> &&opt,
                const popart_compiler::SessionOptions &options,
                const AttributeAccessor &attribute_accessor,
                CPUCallbackMap callback, AnchorList &&anchors);
  LowerToPopart(LowerToPopart &&lower);
  ~LowerToPopart();

  // Lower the graph.
  // NOTE(review): presumably must be called before compile() /
  // loadExecutableFromFile() - confirm against the implementation.
  void lower();
  // Compile the lowered graph and return the resulting executable.
  std::shared_ptr<poptorch::PoplarExecutable> compile();
  // Create the executable from a file instead of compiling it.
  std::shared_ptr<poptorch::PoplarExecutable>
  loadExecutableFromFile(const std::string &input_filename);

private:
  std::unique_ptr<detail::LowerToPopartImpl> _impl;
};

} // namespace poptorch

#endif // INCLUDE_POPTORCH_LOWER_TO_POPART_H


================================================
FILE: poptorch/include/poptorch/LowerToPopartFactories.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_LOWER_TO_POPART_FACTORIES_H
#define INCLUDE_POPTORCH_LOWER_TO_POPART_FACTORIES_H

#include <torch/csrc/jit/ir/ir.h>

#include <memory>
#include <string>
#include <vector>

#include "poptorch/LowerToPopart.hpp"
#include "poptorch/SessionOptionsParser.hpp"

namespace poptorch {

// Build a LowerToPopart object for the graph captured by the dispatcher.
//
// Most arguments match those of the LowerToPopart constructor (training,
// anchors, optimizers, attribute accessor, CPU callbacks); the session
// options are obtained from `parser`.
// NOTE(review): when initCallbackBuffers is invoked, and whether `callbacks`
// is modified (it is taken by non-const reference), is not visible from this
// header - confirm against the implementation.
poptorch::LowerToPopart lowerToPopartFromDispatch(
    SessionOptionsParser &parser, bool training, AnchorList &&anchors_list,
    const std::function<void()> &initCallbackBuffers,
    std::vector<popart_compiler::Optimizer> &&optimizers,
    const AttributeAccessor &attribute_accessor, CPUCallbackMap &callbacks);
} // namespace poptorch

#endif // INCLUDE_POPTORCH_LOWER_TO_POPART_FACTORIES_H


================================================
FILE: poptorch/include/poptorch/PoplarExecutable.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_POPLAR_EXECUTABLE_HPP
#define INCLUDE_POPTORCH_POPLAR_EXECUTABLE_HPP

#include <torch/csrc/jit/ir/ir.h>

#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "popart_compiler/Compiler.hpp"
#include "poptorch/InplaceOps.hpp"

namespace poptorch {

// Owns a popart_compiler::Compiler together with the description of the
// graph's inputs / outputs which is needed to run it.
class PoplarExecutable {
public:
  PoplarExecutable() = delete;

  // Take ownership of the compiler and of the description of the graph.
  //
  // c:               the compiler to run.
  // inputs:          PopART tensor IDs of the graph inputs.
  // outputs:         PopART tensor IDs of the graph outputs.
  // outputTypes:     scalar types of the outputs.
  // parameter_names: names of the parameters.
  // inplace_info:    description of the inputs modified in place.
  //
  // The rvalue reference arguments are explicitly moved: a named rvalue
  // reference is an lvalue, so without std::move the vectors would be copied.
  PoplarExecutable(popart_compiler::Compiler &&c,
                   std::vector<popart_compiler::TensorId> &&inputs,
                   std::vector<popart_compiler::TensorId> &&outputs,
                   std::vector<at::ScalarType> &&outputTypes,
                   std::vector<std::string> parameter_names,
                   InplaceGraphInfo &&inplace_info)
      : _compiler(std::move(c)), _popart_inputs(std::move(inputs)),
        // One default constructed tensor per input. Sized from the member
        // (which is initialised first, see the declaration order below)
        // because `inputs` has been moved from at this point.
        _converted_inputs(_popart_inputs.size()),
        _popart_outputs(std::move(outputs)),
        _popart_output_types(std::move(outputTypes)),
        _parameter_names(std::move(parameter_names)),
        _inplace_info(std::move(inplace_info)) {}

  void loadEngineAndConnectStreams();
  /*
   * Execute the compiled graph stored in field "compiler" with the given
   * |inTensors| and return to the user the resulting tensors if any.
   */
  std::vector<at::IValue> run(std::vector<at::Tensor> &inTensors);

  // Replace the optimizers used by the compiled graph.
  void
  updateOptimizers(const std::vector<popart_compiler::Optimizer> &optimizer);

  // Tell popart to copy weights off the IPU and write into host memory.
  void copyWeightsToHost(const std::map<std::string, void *> &buffers);

  // Tell popart to copy weights from host into IPU memory.
  void copyWeightsToDevice(const std::map<std::string, void *> &buffers);

  // Tell popart to copy named buffers from host into IPU memory.
  void copyNamedBuffersToDevice(const std::map<std::string, void *> &buffers);

  const std::vector<popart_compiler::OutputTypeShape> &outputTypes() const;

  // Get the IR from popart.
  std::string getPopartIR() const;

  // Get the tensor names that occur in the model graphs.
  std::set<std::string> getTensorNames() const;

  void detachFromDevice();
  void attachToDevice();
  bool isAttachedToDevice() const;

  const popart_compiler::Compiler &getCompiler() const { return _compiler; }
  popart_compiler::Compiler &getCompiler() { return _compiler; }

private:
  // Note: the constructor relies on the members being initialised in this
  // order (_popart_inputs before _converted_inputs).
  popart_compiler::Compiler _compiler;

  std::vector<popart_compiler::TensorId> _popart_inputs;

  // Used for types which need conversion to maintain the ref count
  std::vector<at::Tensor> _converted_inputs;

  std::vector<popart_compiler::TensorId> _popart_outputs;
  std::vector<at::ScalarType> _popart_output_types;
  const std::vector<std::string> _parameter_names;

  const InplaceGraphInfo _inplace_info;
};

} // namespace poptorch

#endif // INCLUDE_POPTORCH_POPLAR_EXECUTABLE_HPP


================================================
FILE: poptorch/include/poptorch/SessionOptionsParser.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_SESSION_OPTIONS_PARSER_HPP
#define INCLUDE_POPTORCH_SESSION_OPTIONS_PARSER_HPP

#include <torch/csrc/jit/ir/ir.h>

#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "popart_compiler/CompilerTypes.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace popart_compiler {
class SessionOptions;
} // namespace popart_compiler

// Interface to parse a python object without adding a dependency on pybind
//
// The interface is made of type predicates (isXxx) and of the matching
// conversions (toXxx), plus accessors for containers (lists, sets, tuples and
// dictionaries).
// NOTE(review): what a toXxx() conversion does when the value is not of the
// matching type is up to the implementation and not visible from here.
class IPyValue {
public:
  // Return the value as a C++ callable taking two ints.
  virtual std::function<void(int, int)> toFunction() const = 0;
  virtual bool isBoolean() const = 0;
  virtual bool toBoolean() const = 0;
  virtual bool isDouble() const = 0;
  virtual double toDouble() const = 0;
  virtual bool isInt() const = 0;
  virtual std::int64_t toInt64() const = 0;
  virtual std::uint64_t toUInt64() const = 0;
  virtual bool isString() const = 0;
  virtual std::string toString() const = 0;
  virtual bool isSetListOrTuple() const = 0;
  // Call the given function on each element of the container.
  virtual void forEachInList(std::function<void(const IPyValue &)>) const = 0;
  virtual bool isDict() const = 0;
  // Call the given function on each entry of the dictionary: going by the
  // name, with the key first and the value second.
  virtual void forEachInDict(
      std::function<void(const IPyValue &, const IPyValue &)>) const = 0;
  // Return nullptr if the key doesn't exist
  virtual std::unique_ptr<IPyValue>
  getFromDict(const std::string &key) const = 0;
  // Return nullptr if index is out of bounds
  virtual std::unique_ptr<IPyValue> getFromList(std::uint64_t index) const = 0;
  virtual std::uint64_t getListSize() const = 0;
  // Return the name of the type of the value as a string.
  virtual std::string type() const = 0;

  // Non-virtual helpers shared by every implementation of the interface (they
  // are defined outside of this header).
  // NOTE(review): going by the name, toFloatWithRangeCheck() fails if the
  // value doesn't fit in a float - confirm how the failure is reported.
  float toFloatWithRangeCheck() const;
  std::vector<std::string> toVectorString() const;
  virtual ~IPyValue() = default;
};

// Builds a popart_compiler::SessionOptions object from a python object and
// owns the result.
class SessionOptionsParser {
public:
  // Parse the options contained in the given python value.
  explicit SessionOptionsParser(const IPyValue &opts);
  // Return the session options owned by this parser.
  popart_compiler::SessionOptions &options();
  // Declared here and defined out of line: SessionOptions is only forward
  // declared in this header, so the unique_ptr below cannot be destroyed from
  // here.
  ~SessionOptionsParser();

private:
  std::unique_ptr<popart_compiler::SessionOptions> _opts;
};

// Callback taking a name and returning the matching value wrapped in an
// IPyValue.
using AttributeAccessor =
    std::function<std::unique_ptr<IPyValue>(const std::string &)>;

} // namespace poptorch

#endif // INCLUDE_POPTORCH_SESSION_OPTIONS_PARSER_HPP


================================================
FILE: poptorch/include/poptorch/Utils.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_UTILS_HPP
#define INCLUDE_POPTORCH_UTILS_HPP

#include <torch/csrc/jit/ir/ir.h>

#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

namespace poptorch {

// Return the earliest node using the given value.
// NOTE(review): the result when the value has no user is not visible from
// this header (presumably nullptr) - confirm.
torch::jit::Node *findEarliestUser(const torch::jit::Value *value);

// Return true if the given node is nondeterministic.
bool isNondeterministic(const torch::jit::Node &node);

// Return a string representation of the given node.
std::string nodeToString(const torch::jit::Node *node);

// Conversions between PyTorch scalar types and ONNX type names.
std::string scalarTypeToOnnxString(at::ScalarType type);

at::ScalarType onnxStrToScalarType(const char *type_str);

// Return the scalar type to use in place of the given one.
// NOTE(review): which types are considered supported is defined by the
// implementation.
at::ScalarType coerceToSupportedType(at::ScalarType type);

// Create a node casting `val` to the given scalar type and insert it in the
// graph.
torch::jit::Node *createAndInsertCastOp(torch::jit::Graph *graph,
                                        torch::jit::Value *val,
                                        at::ScalarType type);

// Returns a collapsed version of the graph input hierarchy into a list of
// tensor values by following any tuples/lists and their unpacking
// N.B. if a tuple is not used (unpacked), the resulting values will be null
// as a placeholder.
std::vector<torch::jit::Value *>
collapsedGraphInputHierachy(torch::jit::Graph *graph);

// Return the number of tensors for a given type: in the case of a tensor
// this is 1, but in case of nested tuples, this is the sum over all.
size_t numTensorsForType(const c10::TypePtr &type);

// Delete a node and also its inputs if they are also unused.
void searchAndPossiblyDestroy(
    const std::unordered_set<torch::jit::Node *> &to_test);

// Remove all the node's inputs and destroy them if they're not used
// anywhere else.
void removeAndPossiblyDestroyAllInputs(torch::jit::Node *node);

// Return the content of the given string in a heap allocated char array.
// NOTE(review): presumably null terminated - confirm.
std::unique_ptr<char[]> stringToUniquePtr(const std::string &str);

// Get the tensor shape as a vector of ints.
std::vector<std::int64_t> shapeFromTensor(const torch::jit::Value *value);

// Add casts as necessary such that weight and bias have the same scalar type
// as input.
// weight and bias are passed by reference to pointer so that they can be
// updated to point at the values to use after the call.
void castWeightAndBias(torch::jit::Graph *graph, torch::jit::Value *input,
                       torch::jit::Value *&weight, torch::jit::Value *&bias);

// List type which, unlike PyTorch's own ListType, also records how many
// elements the list contains.
class ListTypeWithNumElements
    : public c10::SingleElementType<torch::jit::TypeKind::ListType,
                                    ListTypeWithNumElements> {
public:
  ListTypeWithNumElements(c10::TypePtr elem_type, size_t num_elements)
      : SingleElementType(std::move(elem_type)), _num_elements(num_elements) {}

  // Two ListTypeWithNumElements are equal if they have the same number of
  // elements; any other type is different.
  // NOTE(review): the element type is not part of the comparison - confirm
  // this is intended.
  bool equals(const Type &rhs) const override {
    const auto other = rhs.cast<ListTypeWithNumElements>();
    if (!other) {
      return false;
    }
    return _num_elements == other->numElements();
  }

  // Number of elements in the list.
  size_t numElements() const { return _num_elements; }

  std::string str() const override;

  // The equivalent PyTorch list type (i.e. without the number of elements).
  c10::ListTypePtr getOriginalListType() const {
    const c10::TypePtr &elem_type = getElementType();
    return c10::ListType::create(elem_type);
  }

private:
  size_t _num_elements;

  // The annotation is the same as str(): the printer is not used.
  std::string annotation_str_impl(c10::TypePrinter /*printer*/) const override {
    return str();
  }
};
using ListTypeWithNumElementsPtr = std::shared_ptr<ListTypeWithNumElements>;

// Scalar type and dimensions of a tensor, extracted either from an actual
// tensor or from a JIT value.
struct JitTensorInfo {
  explicit JitTensorInfo(const at::Tensor &tensor);
  // NOTE(review): the value is presumably expected to be of tensor type -
  // confirm against the implementation.
  explicit JitTensorInfo(torch::jit::Value *value);
  // Return a string representation of the type and dimensions.
  std::string toString() const;
  at::ScalarType scalar_type;
  std::vector<int64_t> dims;
};

// Check the shape and type of the given JIT value against those of the given
// tensor.
// NOTE(review): how a mismatch is reported is not visible from this header
// (presumably an error is raised) - confirm.
void validateTensorShapeAndType(torch::jit::Value *value,
                                const at::Tensor &tensor);

// setNodeTensorAttrValue and getNodeTensorAttrValue must be used instead of
// node->t_(c10::attr::value, v) and node->t(c10::attr::value).
//
// When printing a torch::jit::Graph the graph will iterate over each node
// and print all its attributes.
// If an attribute is a tensor it will try to print the content of that
// tensor which in our case would trigger a copy from IPU to CPU. This copy
// not only will fail, it will also be interpreted as a request to add this
// tensor as a graph output which will corrupt the graph.
// However attributes which are arrays of tensors are not printed and therefore
// will not trigger a copy, so behind the scenes these functions will wrap
// and unwrap the tensor attribute in a size 1 vector.
void setNodeTensorAttrValue(torch::jit::Node *node,
                            torch::jit::TensorAttr::ConstructorType value);
const torch::jit::TensorAttr::ValueType &
getNodeTensorAttrValue(const torch::jit::Node *node);
} // namespace poptorch

#endif // INCLUDE_POPTORCH_UTILS_HPP


================================================
FILE: poptorch/source/AddDetachOperations.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/ir.h>

#include "PoptorchSymbols.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace {

// State of the pass: both containers are reset at the start of
// addDetachOperations().
//
// Original value -> output of the popart::detach node created for it.
std::map<torch::jit::Value *, torch::jit::Value *> detached_values;
// Nodes already processed by the traversal currently in progress.
std::set<torch::jit::Node *> visited_nodes;

// Return the value to use for `value` once detach ops have been added.
//
// The value itself is returned if it requires a gradient or if it is produced
// by a constant, a detach, a tuple construct or a list construct node.
// Otherwise the output of a popart::detach node is returned: the node is
// created right after the producer of the value the first time around, and
// remembered in detached_values for the following calls.
torch::jit::Value *possiblyDetachedValue(torch::jit::Graph *graph,
                                         torch::jit::Value *value) {
  torch::jit::Node *const source = value->node();
  const auto source_kind = source->kind();

  const bool keep_as_is =
      value->requires_grad() || source_kind == c10::prim::Constant ||
      source_kind == symbols::poptorch::tensor_constant ||
      source_kind == symbols::poptorch::host_side_tensor_constant ||
      source_kind == symbols::popart::detach ||
      source_kind == c10::prim::TupleConstruct ||
      source_kind == c10::prim::ListConstruct;
  if (keep_as_is) {
    return value;
  }

  // Already detached: reuse the existing detach op.
  const auto known = detached_values.find(value);
  if (known != detached_values.end()) {
    return known->second;
  }

  WithNodeMetadata meta(source);
  torch::jit::Node *detach_node = graph->create(symbols::popart::detach);
  detach_node->addInput(value);
  insertNodeAfterNode(detach_node, source);

  torch::jit::Value *detached = detach_node->output(0);
  detached->setType(value->type());
  detached_values.insert({value, detached});
  return detached;
}

// Walk the graph backwards from `node`, creating a detach op for every input
// which needs one (see possiblyDetachedValue).
//
// The traversal only carries on through the inputs which were left as they
// are: it stops at the values which got detached. Each node is processed at
// most once.
void maybeInsertDetachOp(torch::jit::Graph *graph, torch::jit::Node *node) {
  logging::LogContext ctx(
      "AddDetachOperations (maybeInsertDetachOp) processing " +
      nodeToString(node));

  // insert() tells us if the node was already in the set.
  const bool first_visit = visited_nodes.insert(node).second;
  if (!first_visit) {
    return;
  }

  for (torch::jit::Value *operand : node->inputs()) {
    if (possiblyDetachedValue(graph, operand) != operand) {
      continue;
    }
    maybeInsertDetachOp(graph, operand->node());
  }
}

// Walk the graph backwards from `node` and make every node use the detached
// version (if there is one in detached_values) of its inputs.
//
// Each node is processed at most once.
void replaceDetachedValues(torch::jit::Node *node) {
  logging::LogContext ctx(
      "AddDetachOperations (replaceDetachedValues) processing " +
      nodeToString(node));

  // insert() tells us if the node was already in the set.
  const bool first_visit = visited_nodes.insert(node).second;
  if (!first_visit) {
    return;
  }

  const bool is_detach = node->kind() == symbols::popart::detach;

  for (torch::jit::Value *operand : node->inputs()) {
    const auto replacement = detached_values.find(operand);
    if (replacement != detached_values.end()) {
      if (is_detach) {
        // This is the detach node which generated the detached value: only
        // the values which exist after it must be replaced (with their
        // detached counterparts), so stop here.
        return;
      }
      node->replaceInputWith(operand, replacement->second);
    }
    replaceDetachedValues(operand->node());
  }
}

} // namespace

// Add popart::detach ops to the graph.
//
// This is done in two traversals starting from the return node of the graph:
// the first one creates the detach ops, the second one makes the nodes use
// the detached values.
void addDetachOperations(torch::jit::Graph *graph) {
  detached_values.clear();

  // The special prim::Param nodes which correspond to the graph inputs should
  // not be detached: superficially mark them as visited before each
  // traversal.
  const auto restart_from_graph_inputs = [graph]() {
    visited_nodes.clear();
    for (torch::jit::Value *graph_input : graph->inputs()) {
      visited_nodes.insert(graph_input->node());
    }
  };

  // First traversal: process the graph recursively.
  restart_from_graph_inputs();
  maybeInsertDetachOp(graph, graph->return_node());

  // Second traversal: replace the values at the end.
  restart_from_graph_inputs();
  replaceDetachedValues(graph->return_node());
}

} // namespace poptorch


================================================
FILE: poptorch/source/AddSubgraphConnectionNodes.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/ir.h>

#include <functional>
#include <random>
#include <stack>

#include "PoptorchSymbols.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {

namespace {

// A small class to keep track of information regarding subgraphs.
struct Subgraph {
  // All the nodes in the subgraph.
  std::unordered_set<torch::jit::Node *> nodes;

  // Track the inputs already added so we don't double count them.
  // (These are values produced by nodes outside of the subgraph.)
  std::unordered_set<torch::jit::Value *> added_inputs;

  // Map of new inputs to old inputs.
  // (The new inputs are the outputs of the input tensor nodes created inside
  // the subgraph, the old ones are the values coming from the parent graph.)
  std::unordered_map<torch::jit::Value *, torch::jit::Value *> input_map;

  // Map of old inputs to the new ones.
  std::unordered_map<torch::jit::Value *, torch::jit::Value *>
      reverse_input_map;
};

// Return true if the node terminates a subgraph: only the end_for_loop nodes
// do.
bool isTerminator(const torch::jit::Node *node) {
  const torch::jit::Symbol node_kind = node->kind();
  return node_kind == symbols::poptorch::end_for_loop;
}

// Return true if at least one of the outputs of the node is used by a
// terminator node (see isTerminator).
bool isUsedInTerminator(const torch::jit::Node *node) {
  const auto produced = node->outputs();
  for (size_t out_idx = 0; out_idx < produced.size(); ++out_idx) {
    const auto &consumers = produced[out_idx]->uses();
    for (size_t use_idx = 0; use_idx < consumers.size(); ++use_idx) {
      if (isTerminator(consumers[use_idx].user)) {
        return true;
      }
    }
  }

  return false;
}

// Register the inputs of `node` which are produced outside of the subgraph as
// inputs of the subgraph.
//
// An input is only registered once (see Subgraph::added_inputs). If
// inputFromParent is false then an (untyped) input tensor node is also
// created for it and both input maps of the subgraph are updated.
//
// Return true if at least one input of the node is produced by a node of the
// subgraph or has been newly registered by this call.
bool markInputsAsComingFromParent(torch::jit::Graph *graph,
                                  torch::jit::Node *node, Subgraph *subgraph,
                                  const bool inputFromParent = true) {
  WithNodeMetadata meta(node);
  bool used_in_subgraph = false;

  for (torch::jit::Value *operand : node->inputs()) {
    // Produced inside the subgraph: nothing to register.
    if (subgraph->nodes.count(operand->node()) != 0) {
      used_in_subgraph = true;
      continue;
    }

    // Produced outside but already registered as an input.
    if (subgraph->added_inputs.count(operand) != 0) {
      continue;
    }

    if (!inputFromParent) {
      torch::jit::Node *input_node =
          createAddUntypedInputTensor(graph, operand);
      torch::jit::Value *new_input = input_node->output();
      subgraph->input_map.insert({new_input, operand});
      subgraph->reverse_input_map.insert({operand, new_input});
    }
    subgraph->added_inputs.insert(operand);
    used_in_subgraph = true;
  }

  return used_in_subgraph;
}

// Create an output tensor node, inserted before `insertion_point`, for each
// input of the `outputs` node.
void markOutputs(torch::jit::Graph *graph, torch::jit::Node *outputs,
                 torch::jit::Node *insertion_point, Subgraph *subgraph) {
  torch::jit::WithInsertPoint insert_point(outputs);

  // Sometimes the return might not be processed in this node.
  const bool used_in_subgraph =
      markInputsAsComingFromParent(graph, outputs, subgraph);

  for (torch::jit::Value *returned : outputs->inputs()) {
    WithNodeMetadata meta{returned->node()};
    torch::jit::Value *exported = returned;

    // If the op isn't used in the subgraph, go through an identity op to make
    // sure popart handles the alias correctly.
    if (!used_in_subgraph) {
      exported = createIdentity(graph, {returned})->output();
    }

    // PopART doesn't allow inputs to be outputs directly: use the input
    // created inside the subgraph instead.
    const auto remapped = subgraph->reverse_input_map.find(exported);
    if (remapped != subgraph->reverse_input_map.end()) {
      exported = remapped->second;
    }

    torch::jit::Node *output_node = createAddOutputTensor(graph, exported);
    insertNodeBeforeNode(output_node, insertion_point);
  }
}

// Information recorded by markCondOutputs() for one output of the `then`
// branch of an if / else, so that a reshape can be added later on if the
// matching output of the `else` branch turns out to have a different shape.
struct InsertionPointAndShape {
  // The output tensor node created for the output of the `then` branch.
  torch::jit::Node *insertion_point;
  // The shape of that output.
  std::vector<std::int64_t> shape;
};
// One entry per output of the `then` branch, in the order of the outputs.
using ReshapePutterHelper = std::vector<InsertionPointAndShape>;

// Create the output tensor nodes of one branch of an if / else.
//
// This must be called for the `then` branch first (processingElseOutputs is
// false) and then for the `else` branch (processingElseOutputs is true) with
// the same helper.
//
// graph:                 the graph being processed.
// outputs:               the node whose inputs are the outputs of the branch.
// insertion_point:       the output tensor nodes of the branch are inserted
//                        before this node.
// subgraph:              the subgraph of the branch.
// reshape_putter_helper: filled in while processing the `then` branch (one
//                        entry per output) and read while processing the
//                        `else` branch.
// processingElseOutputs: which branch is being processed.
//
// While processing the `else` branch, if an output doesn't have the same
// shape as the matching output of the `then` branch then a reshape (to the
// shape of the `else` output) is added to the `then` branch.
//
// An error is raised if an output of the `else` branch has an empty shape or
// has no counterpart in the `then` branch.
void markCondOutputs(torch::jit::Graph *graph, torch::jit::Node *outputs,
                     torch::jit::Node *insertion_point, Subgraph *subgraph,
                     ReshapePutterHelper &reshape_putter_helper,
                     bool processingElseOutputs = false) {
  torch::jit::WithInsertPoint insert_point(outputs);

  // Sometimes the return might not be processed in this node.
  const bool used_in_subgraph =
      markInputsAsComingFromParent(graph, outputs, subgraph);

  at::ArrayRef<torch::jit::Value *> inputs = outputs->inputs();
  for (size_t idx = 0; idx < inputs.size(); idx++) {
    torch::jit::Value *output = inputs[idx];

    WithNodeMetadata meta{output->node()};

    // Output tensor shape has to be read before adding IdentityOp as the shape
    // info does not propagate to the op output.
    const auto output_shape = shapeFromTensor(output);

    // Add an identity op in lieu if the op isn't used in the subgraph to make
    // sure popart handles the alias correctly.
    if (!used_in_subgraph) {
      torch::jit::Node *node = createIdentity(graph, {output});
      output = node->output();
    }

    // PopART doesn't allow inputs to be outputs directly.
    const auto mapped_input = subgraph->reverse_input_map.find(output);
    if (mapped_input != subgraph->reverse_input_map.end()) {
      output = mapped_input->second;
    }

    if (!processingElseOutputs) {
      // Create the output tensor of the `then` branch.
      // In case the output tensor turns out to be of a different shape than
      // the one of the `else` branch, it will be replaced with the reshaped
      // output tensor.
      torch::jit::Node *then_output_node = createAddOutputTensor(graph, output);
      insertNodeBeforeNode(then_output_node, insertion_point);

      // Record the shape read before the identity op / the input remapping:
      // this is the one which is reliable and it is also the one which the
      // `else` branch is going to be compared against.
      reshape_putter_helper.push_back({then_output_node, output_shape});
      continue;
    }

    // Processing the else branch of the cond op. Here we make sure the
    // outputs of the branches have the same shapes. If not, we add a reshape
    // in the `then` branch.
    if (idx >= reshape_putter_helper.size()) {
      ERROR("`else` branch has more outputs than the `then` branch, so "
            "matching the shapes of the outputs is not possible!");
    }
    InsertionPointAndShape &then_entry = reshape_putter_helper[idx];
    const auto &else_out_shape = output_shape;
    if (else_out_shape.empty()) {
      ERROR("`else` branch output has an empty shape, so adding a reshape "
            "op to the `then` branch to achieve shapes identity is not "
            "possible!");
    }

    if (then_entry.shape != else_out_shape) {
      torch::jit::Node *old_then_output_node = then_entry.insertion_point;

      // In case if branches output shapes differ, there is a reshape added:
      // 1. Create a reshape op
      torch::jit::Node *reshape_node = nullptr;
      {
        torch::jit::WithInsertPoint reshape_insert_point(old_then_output_node);
        auto *tensor_to_reshape = old_then_output_node->input();
        reshape_node = createReshape(graph, tensor_to_reshape, else_out_shape);
      }

      // 2. Create a new output tensor of the `then` branch (being the reshape
      // output) and insert it before the original output tensor op.
      torch::jit::Node *new_then_output_node =
          createAddOutputTensor(graph, reshape_node->output());
      insertNodeBeforeNode(new_then_output_node, old_then_output_node);

      // 3. Remove the original output tensor op returning the wrongly shaped
      // tensor.
      old_then_output_node->destroy();

      // Keep the helper in sync with the graph so that it doesn't hold a
      // pointer to the node which has just been destroyed.
      then_entry.insertion_point = new_then_output_node;
      then_entry.shape = else_out_shape;
    }

    // Create the output tensor of the `else` branch.
    torch::jit::Node *else_output_node = createAddOutputTensor(graph, output);
    insertNodeBeforeNode(else_output_node, insertion_point);
  }
}

// Create a set attribute node for the "__outline" attribute at the given
// insertion point.
//
// The key of the attribute is derived from the nesting level of the cond op
// and its value from a number drawn from random_gen (exactly one number is
// drawn per call).
void insertSetAttribute(torch::jit::Graph *graph, size_t cond_nest_lvl,
                        torch::jit::Node *insertion_point,
                        std::mt19937 &random_gen,
                        bool after_insert_pnt = false) {
  torch::jit::WithInsertPoint set_attr_insert_point(insertion_point);
  WithNodeMetadata meta{insertion_point};

  std::uniform_int_distribution<> id_distribution;
  const auto drawn_id = id_distribution(random_gen);

  const std::string outline_id = "cond_id_" + std::to_string(drawn_id);
  const std::string context_name =
      "cond_context_" + std::to_string(cond_nest_lvl);
  createSetAttribute(graph, "__outline", context_name, outline_id,
                     after_insert_pnt);
}

// Create a clear attribute node for the "__outline" attribute at the given
// insertion point.
//
// The key of the attribute is derived from the nesting level of the cond op,
// in the same way as in insertSetAttribute().
void insertClearAttribute(torch::jit::Graph *graph, size_t cond_nest_lvl,
                          torch::jit::Node *insertion_point,
                          bool after_insert_pnt = false) {
  torch::jit::WithInsertPoint clr_attr_insert_point(insertion_point);
  WithNodeMetadata meta{insertion_point};

  const std::string context_name{"cond_context_" +
                                 std::to_string(cond_nest_lvl)};
  createClearAttribute(graph, "__outline", context_name, after_insert_pnt);
}

// State during the dispatcher intercept calls.
// NOTE(review): going by the name, the stack of the start_for_loop nodes of
// the loops currently open (innermost on top) - confirm against the code
// using it.
std::stack<torch::jit::Node *> start_for_loop_nodes;

} // namespace

/*
 * Certain ops are essentially subgraphs within the main graph, for instance
 * for loops and if/else blocks. If they use a tensor which comes from the
 * enclosing (parent) subgraph we must add a specific input entry op to the
 * graph for that op.
 *
 * This pass walks the nodes from `start_node` to the end of `graph` and reacts
 * to the poptorch marker nodes (start_for_loop / end_for_loop,
 * start_if_block / start_else_block / end_if_block). Every other node seen
 * while a subgraph is open is recorded as belonging to the innermost open
 * subgraph. The first input of each marker is removed once it has been
 * processed, and the node that produced it is destroyed at the end if its
 * output has no remaining uses.
 */
void annotateSubgraphs(torch::jit::Graph *graph, torch::jit::Node *start_node) {
  logging::LogContext ctx_func("annotateSubgraphs Processing");
  // Stack of the currently open subgraphs; the top is the innermost one and
  // tracks the nodes contained directly within it.
  std::stack<Subgraph> subgraph_nodes;

  // Nodes to delete (if they are truly unused).
  std::unordered_set<torch::jit::Node *> to_delete;

  // Helper struct for processing if_else: one entry is pushed when a
  // start_else_block is reached and popped at the matching end_if_block.
  std::stack<ReshapePutterHelper> reshape_putter_helpers_stack;

  // Random generator - used to generate ids for cond operator entities.
  // It is seeded with a constant.
  std::mt19937 random_gen(0);

  // Look for any subgraphs. Subgraphs are currently:
  // * for loops.
  // * if/else blocks.
  for (auto iter = start_node->iterator(); iter != graph->nodes().end();
       ++iter) {
    torch::jit::Node *node = *iter;
    logging::LogContext ctx("Processing " + nodeToString(node));
    const torch::jit::Symbol kind = node->kind();

    if (kind == symbols::poptorch::start_for_loop) {
      // Start tracking the new subgraph.
      subgraph_nodes.push(Subgraph());

      // New nodes are inserted right after the loop start marker.
      torch::jit::WithInsertPoint insert_point(node->next());
      markInputsAsComingFromParent(graph, node->input()->node(),
                                   &subgraph_nodes.top(), false);

      // We no longer need these inputs.
      to_delete.insert(node->input(0)->node());
      node->removeInput(0);

    } else if (kind == symbols::poptorch::end_for_loop) {
      // Mark the loop outputs, then close the loop subgraph.
      markOutputs(graph, node->input(0)->node(), node, &subgraph_nodes.top());
      subgraph_nodes.pop();

      // We no longer need these inputs.
      to_delete.insert(node->input(0)->node());
      node->removeInput(0);
    } else if (kind == symbols::poptorch::start_if_block) {
      // Start tracking the new subgraph.
      subgraph_nodes.push(Subgraph());

      // if/else block branches code have to get their own context, so that
      // popart outlining does not tear them apart and break their logic.
      // The current stack depth is passed as the nesting level.
      insertSetAttribute(graph, subgraph_nodes.size(), node, random_gen);
      // Delete the input node (condition) as it is not needed anymore.
      to_delete.insert(node->input(0)->node());
      node->removeInput(0);
    } else if (kind == symbols::poptorch::start_else_block) {
      // Add the outputs of `then` branch just before starting the else block.
      reshape_putter_helpers_stack.emplace();
      markCondOutputs(graph, node->input(0)->node(), node,
                      &subgraph_nodes.top(), reshape_putter_helpers_stack.top(),
                      false /*processingElseOutputs*/);

      // Close the outline context of the `then` branch (same nesting level as
      // the one used when it was opened, since the subgraph is still pushed).
      insertClearAttribute(graph, subgraph_nodes.size(), node);

      // Remove the if subgraph.
      subgraph_nodes.pop();

      // Start tracking the new subgraph.
      subgraph_nodes.push(Subgraph());

      // Open a fresh outline context for the `else` branch.
      insertSetAttribute(graph, subgraph_nodes.size(), node, random_gen,
                         true /* after_insert_pnt */);

      // Delete the input node (then_branch output), as it is not needed
      // anymore.
      to_delete.insert(node->input(0)->node());
      node->removeInput(0);
    } else if (kind == symbols::poptorch::end_if_block) {
      // Mark the outputs of the `else` block.
      markCondOutputs(graph, node->input(0)->node(), node,
                      &subgraph_nodes.top(), reshape_putter_helpers_stack.top(),
                      true /*processingElseOutputs*/);
      reshape_putter_helpers_stack.pop();

      // Close the outline context of the `else` branch.
      insertClearAttribute(graph, subgraph_nodes.size(), node,
                           true /* after_insert_pnt */);

      // Remove the else subgraph.
      subgraph_nodes.pop();

      // Record the number of outputs: the input count of the node feeding
      // this marker's first input.
      const std::size_t num_outputs = node->input(0)->node()->inputs().size();
      node->i_(c10::Symbol::fromQualString("attr::num_outputs"), num_outputs);

      // Delete the 1st input node (else_branch output), as it is not needed
      // anymore.
      to_delete.insert(node->input(0)->node());
      node->removeInput(0);
    } else if (kind == symbols::poptorch::add_untyped_input_tensor) {
      // These nodes are never added to a subgraph.
      continue;
    } else if (!subgraph_nodes.empty()) {
      // Don't count the list construction nodes.
      if (isUsedInTerminator(node)) {
        continue;
      }

      // Add this node to the active subgraph.
      subgraph_nodes.top().nodes.insert(node);
      torch::jit::WithInsertPoint insert_point(node);
      markInputsAsComingFromParent(graph, node, &subgraph_nodes.top());

      // For every entry of the subgraph's input map, replace uses of the
      // mapped value (pair.second) on this node with the key (pair.first).
      for (const std::pair<torch::jit::Value *const, torch::jit::Value *>
               &pair : subgraph_nodes.top().input_map) {
        node->replaceInputWith(pair.second, pair.first);
      }
    }
  }

  // Only destroy the recorded producer nodes whose output ended up unused.
  for (torch::jit::Node *node : to_delete) {
    if (node->output()->uses().empty()) {
      node->destroy();
    }
  }
}

} // namespace poptorch


================================================
FILE: poptorch/source/AliasProcessing.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/ir.h>
#include <vector>

#include "poptorch/AliasProcessing.hpp"

namespace poptorch {

// Removes every aten::alias node from `graph`: users of the alias output are
// redirected to the alias input, then the alias node is destroyed.
void resolveAliases(torch::jit::Graph *graph) {
  // Destruction is deferred until the traversal is over so that the node list
  // is not modified while it is being iterated.
  std::vector<torch::jit::Node *> alias_nodes;

  for (torch::jit::Node *candidate : graph->nodes()) {
    if (candidate->kind() == c10::aten::alias) {
      // Route all consumers straight to the aliased value.
      candidate->output()->replaceAllUsesWith(candidate->input());
      alias_nodes.push_back(candidate);
    }
  }

  for (torch::jit::Node *alias_node : alias_nodes) {
    alias_node->destroy();
  }
}
} // namespace poptorch


================================================
FILE: poptorch/source/CPUOffloadingCleanUp.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/ir.h>

#include "PoptorchSymbols.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Logging.hpp"

/*
 * CPU operations come in from the host in the form:

    Tensor[] %outputs = poptorch::call_cpu_op(%inputs)

    ... # Any traced user operations (to keep the trace consistent/happy)

    = poptorch::end_cpu_op(%8)

  * We need to do two things. Firstly we need to cull all the ops in between
  * call_cpu_op and end_cpu_op. Secondly we need to map the users of
  * poptorch::end_cpu_op to the outputs of poptorch::call_cpu_op.
  *
  * To do this we simply traverse through the nodes and record when we enter and
  * exit CPU op scope, i.e. between poptorch::call_cpu_op and
  * poptorch::end_cpu_op.
*/

// extern c10::Symbol call_cpu_op;

namespace poptorch {

// Cleans up the traced form of CPU ops in `graph`.
//
// A CPU scope opens at a call_cpu_op / canonicalised_cpu_call node and closes
// at the next end_cpu_op. Nodes inside the scope (other than ListUnpack nodes)
// are stripped of their inputs and scheduled for removal. At end_cpu_op the
// outputs of the ListUnpack consuming it are re-created as outputs of the
// opening CPU op, and both the end_cpu_op and the unpack are scheduled for
// removal.
void cpuOffloadingCleanup(torch::jit::Graph *graph) {
  // Nodes handed to searchAndPossiblyDestroy once the traversal is done.
  std::unordered_set<torch::jit::Node *> dead_nodes;

  // Number of CPU scopes opened; for diagnostics only.
  std::size_t num_cpu_ops = 0;

  // The op which opened the current CPU scope, nullptr outside of any scope.
  torch::jit::Node *active_cpu_op = nullptr;

  for (torch::jit::Node *node : graph->nodes()) {
    const torch::jit::Symbol kind = node->kind();

    // Entering a CPU scope.
    const bool opens_scope = kind == symbols::poptorch::call_cpu_op ||
                             kind == symbols::poptorch::canonicalised_cpu_call;
    if (opens_scope) {
      ERROR_ON_MSG(
          active_cpu_op != nullptr,
          "Trying to enter CPU from another CPU op! CPU ops must not overlap.");
      ++num_cpu_ops;
      active_cpu_op = node;
      continue;
    }

    // Leaving a CPU scope.
    if (kind == symbols::poptorch::end_cpu_op) {
      dead_nodes.insert(node);

      // The single output of `end_cpu_op` (a tuple/list) is expected to feed
      // exactly one `ListUnpack` node which splits it into several values.
      // That unpack is eliminated by moving its outputs onto the CPU op.
      const std::vector<torch::jit::Use> end_op_uses = node->output()->uses();

      ERROR_ON_MSG(
          end_op_uses.empty(),
          "[Internal compiler error] CPU operation output has no uses.");
      ERROR_ON_MSG(
          end_op_uses.size() > 1,
          "[Internal compiler error] CPU operation output has multiple uses.");

      torch::jit::Node *list_unpack = end_op_uses[0].user;
      ERROR_ON_MSG(list_unpack->kind() != c10::prim::ListUnpack,
                   "[Internal compiler error] CPU operation output is not used "
                   "by a list unpack");

      // Detach the unpack from the end_cpu_op output.
      list_unpack->removeAllInputs();

      ERROR_ON_MSG(active_cpu_op == nullptr,
                   "[Internal compiler error] CPU operation is null");

      // Give the CPU op one new output per unpacked value and redirect the
      // consumers of the unpacked value to it.
      for (torch::jit::Value *unpacked : list_unpack->outputs()) {
        torch::jit::Value *replacement = active_cpu_op->addOutput();
        replacement->copyMetadata(unpacked);
        unpacked->replaceAllUsesWith(replacement);
      }

      dead_nodes.insert(list_unpack);
      active_cpu_op = nullptr;
      continue;
    }

    // Nodes outside of any CPU scope are left untouched.
    if (active_cpu_op == nullptr) {
      continue;
    }

    // Unfortunately the compiler can put some non-functional SSA unpack ops
    // in the CPU scope that do logically outlive it: keep those.
    if (kind != c10::prim::ListUnpack) {
      // Dropping the inputs lets us clean up without invalidating the IR.
      node->removeAllInputs();
      dead_nodes.insert(node);
    }
  }

  logging::trace("Found {} cpu ops. Removed {} nodes", num_cpu_ops,
                 dead_nodes.size());

  // Remove the dead nodes.
  searchAndPossiblyDestroy(dead_nodes);
}

} // namespace poptorch


================================================
FILE: poptorch/source/CompilerOps.cpp.inc
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
// Auto generated file, do not modify
// Run `python3 scripts/PopParse.py` to regenerate
// clang-format off

// Builds a popart::copyvarupdate node over `args` (ImplicitCast::None,
// OutputType::Unknown).
torch::jit::Node *createCopyvarupdate(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::copyvarupdate, args,
                             ImplicitCast::None, OutputType::Unknown);
}

// Builds a popart::bucketize node over `args` and stores `right` as the
// integer attribute "right".
torch::jit::Node *createBucketize(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    bool right) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::bucketize, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("right"), static_cast<std::int32_t>(right));
  return node;
}

// Builds a popart::batchnormalization node over `args` with
// `num_node_outputs` outputs, and records the "num_outputs", "epsilon" and
// "momentum" attributes.
torch::jit::Node *createBatchnormalization(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    unsigned int num_outputs, float epsilon, float momentum,
    unsigned int num_node_outputs) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::batchnormalization, args, ImplicitCast::None,
      OutputType::AsFirstInput, num_node_outputs);
  node->i_(c10::Symbol::attr("num_outputs"), num_outputs);
  node->f_(c10::Symbol::attr("epsilon"), epsilon);
  node->f_(c10::Symbol::attr("momentum"), momentum);
  return node;
}
// Builds a popart::groupnormalization node over `args` and records the
// "num_groups" and "epsilon" attributes.
torch::jit::Node *createGroupnormalization(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    int64_t num_groups, float epsilon) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::groupnormalization, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("num_groups"), num_groups);
  node->f_(c10::Symbol::attr("epsilon"), epsilon);
  return node;
}
// Builds a popart::subsample node over `args` and records the "strides"
// attribute.
torch::jit::Node *createSubsample(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    const std::vector<int64_t> &strides) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::subsample, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("strides"), strides);
  return node;
}
// Builds a popart::printtensor node over `args`. `title` is stored as a string
// attribute; every other option (including the three character options) is
// stored as an integer attribute under its parameter name.
torch::jit::Node *createPrinttensor(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    int64_t print_gradient, const std::string &title,
    const int summariseThreshold, const int edgeItems, const int maxLineWidth,
    const int digits, const int floatFormat, const char separator,
    const char openBracket, const char closeBracket) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::printtensor, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("print_gradient"), print_gradient);
  node->s_(c10::Symbol::attr("title"), title);
  node->i_(c10::Symbol::attr("summariseThreshold"), summariseThreshold);
  node->i_(c10::Symbol::attr("edgeItems"), edgeItems);
  node->i_(c10::Symbol::attr("maxLineWidth"), maxLineWidth);
  node->i_(c10::Symbol::attr("digits"), digits);
  node->i_(c10::Symbol::attr("floatFormat"), floatFormat);
  node->i_(c10::Symbol::attr("separator"), separator);
  node->i_(c10::Symbol::attr("openBracket"), openBracket);
  node->i_(c10::Symbol::attr("closeBracket"), closeBracket);
  return node;
}
// Builds a popart::nop node over `args` (ImplicitCast::None,
// OutputType::AsFirstInput).
torch::jit::Node *createNop(torch::jit::Graph *graph,
                            const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::nop, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::scale node over `args` and records the float attribute
// "scale".
torch::jit::Node *createScale(torch::jit::Graph *graph,
                              const std::vector<torch::jit::Value *> &args,
                              float scale) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::scale, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->f_(c10::Symbol::attr("scale"), scale);
  return node;
}
// Builds a popart::scaledadd node over `args` and records the float
// attributes "scale0" and "scale1".
torch::jit::Node *createScaledadd(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    float scale0, float scale1) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::scaledadd, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->f_(c10::Symbol::attr("scale0"), scale0);
  node->f_(c10::Symbol::attr("scale1"), scale1);
  return node;
}
// Builds a two-output popart::lstm node over `args` (ImplicitCast::All,
// OutputType::AsImplicitCastPromoted) and records "outputFullSequence".
torch::jit::Node *createLstm(torch::jit::Graph *graph,
                             const std::vector<torch::jit::Value *> &args,
                             int64_t outputFullSequence) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::lstm, args, ImplicitCast::All,
      OutputType::AsImplicitCastPromoted, 2);
  node->i_(c10::Symbol::attr("outputFullSequence"), outputFullSequence);
  return node;
}
// Builds a popart::gelu node over `args` (ImplicitCast::None,
// OutputType::AsFirstInput).
torch::jit::Node *createGelu(torch::jit::Graph *graph,
                             const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::gelu, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::geluerf node over `args` (ImplicitCast::None,
// OutputType::AsFirstInput).
torch::jit::Node *createGeluErf(torch::jit::Graph *graph,
                                const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::geluerf, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::detach node over `args` (ImplicitCast::None,
// OutputType::AsFirstInput).
torch::jit::Node *createDetach(torch::jit::Graph *graph,
                               const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::detach, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::depthtospace node over `args` and records the "blocksize"
// and "mode" attributes.
torch::jit::Node *createDepthtospace(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    int64_t blocksize, const std::string &mode) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::depthtospace, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("blocksize"), blocksize);
  node->s_(c10::Symbol::attr("mode"), mode);
  return node;
}
// Builds a popart::round node over `args` (ImplicitCast::None,
// OutputType::AsFirstInput).
torch::jit::Node *createRound(torch::jit::Graph *graph,
                              const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::round, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::nearbyint node over `args` (ImplicitCast::None,
// OutputType::AsFirstInput).
torch::jit::Node *createNearbyInt(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::nearbyint, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::dynamicslice node over `args` and records the "axes",
// "sizes" and "noOverlap" attributes.
torch::jit::Node *createDynamicslice(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    std::vector<int64_t> axes, std::vector<int64_t> sizes,
    std::int32_t noOverlap) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::dynamicslice, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("axes"), axes);
  node->is_(c10::Symbol::attr("sizes"), sizes);
  node->i_(c10::Symbol::attr("noOverlap"), noOverlap);
  return node;
}
// Builds a popart::dynamicupdate node over `args` and records the "axes",
// "sizes" and "noOverlap" attributes.
torch::jit::Node *createDynamicupdate(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    std::vector<int64_t> axes, std::vector<int64_t> sizes,
    std::int32_t noOverlap) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::dynamicupdate, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("axes"), axes);
  node->is_(c10::Symbol::attr("sizes"), sizes);
  node->i_(c10::Symbol::attr("noOverlap"), noOverlap);
  return node;
}
// Builds a popart::dynamiczero node over `args` (ImplicitCast::All,
// OutputType::AsImplicitCastPromoted) and records "axes" and "sizes".
torch::jit::Node *createDynamiczero(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    std::vector<int64_t> axes, std::vector<int64_t> sizes) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::dynamiczero, args, ImplicitCast::All,
      OutputType::AsImplicitCastPromoted);
  node->is_(c10::Symbol::attr("axes"), axes);
  node->is_(c10::Symbol::attr("sizes"), sizes);
  return node;
}
// Builds a popart::dynamicadd node over `args` (ImplicitCast::All,
// OutputType::AsImplicitCastPromoted) and records "axes" and "sizes".
torch::jit::Node *createDynamicadd(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    std::vector<int64_t> axes, std::vector<int64_t> sizes) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::dynamicadd, args, ImplicitCast::All,
      OutputType::AsImplicitCastPromoted);
  node->is_(c10::Symbol::attr("axes"), axes);
  node->is_(c10::Symbol::attr("sizes"), sizes);
  return node;
}
// Builds a popart::sequenceslice node over `args` (ImplicitCast::None,
// OutputType::Unknown) and records the "zeroUnused" attribute.
torch::jit::Node *createSequenceslice(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    std::int32_t zeroUnused) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::sequenceslice, args,
                          ImplicitCast::None, OutputType::Unknown);
  node->i_(c10::Symbol::attr("zeroUnused"), zeroUnused);
  return node;
}
// Builds a popart::l1loss node over `args` and records the "lambda" and
// "reduction" attributes.
torch::jit::Node *createL1loss(torch::jit::Graph *graph,
                               const std::vector<torch::jit::Value *> &args,
                               const float lambda, std::int32_t reduction) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::l1loss, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->f_(c10::Symbol::attr("lambda"), lambda);
  node->i_(c10::Symbol::attr("reduction"), reduction);
  return node;
}
// Builds a popart::nllloss node over `args` and records the "reduction",
// "ignoreIndex" and "inputIsLogProbability" attributes (all as integers).
torch::jit::Node *createNllloss(torch::jit::Graph *graph,
                                const std::vector<torch::jit::Value *> &args,
                                std::int32_t reduction,
                                std::int32_t ignoreIndex,
                                bool inputIsLogProbability) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::nllloss, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("reduction"), reduction);
  node->i_(c10::Symbol::attr("ignoreIndex"), ignoreIndex);
  node->i_(c10::Symbol::attr("inputIsLogProbability"), inputIsLogProbability);
  return node;
}
// Builds a popart::identityloss node over `args` and records the "reduction"
// attribute.
torch::jit::Node *createIdentityloss(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    std::int32_t reduction) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::identityloss, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("reduction"), reduction);
  return node;
}
// Builds a popart::_ctcloss node over `args` and records the "reduction",
// "blank", "outDataType" and "zeroInfinity" attributes.
torch::jit::Node *create_ctcloss(torch::jit::Graph *graph,
                                 const std::vector<torch::jit::Value *> &args,
                                 std::int32_t reduction,
                                 const unsigned int blank,
                                 const std::string &outDataType,
                                 const bool zeroInfinity) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::_ctcloss, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("reduction"), reduction);
  node->i_(c10::Symbol::attr("blank"), blank);
  node->s_(c10::Symbol::attr("outDataType"), outDataType);
  node->i_(c10::Symbol::attr("zeroInfinity"), zeroInfinity);
  return node;
}
// Builds a popart::ctcbeamsearchdecoder node over `args` (ImplicitCast::None,
// OutputType::Unknown) and records "blank", "beamWidth" and "topPaths".
torch::jit::Node *createCtcbeamsearchdecoder(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    unsigned int blank, unsigned int beamWidth, unsigned int topPaths) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::ctcbeamsearchdecoder, args,
                          ImplicitCast::None, OutputType::Unknown);
  node->i_(c10::Symbol::attr("blank"), blank);
  node->i_(c10::Symbol::attr("beamWidth"), beamWidth);
  node->i_(c10::Symbol::attr("topPaths"), topPaths);
  return node;
}
// Builds a popart::shapeddropout node over `args` and records the "shape" and
// "ratio" attributes.
torch::jit::Node *createShapeddropout(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    const std::vector<int64_t> &shape, float ratio) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::shapeddropout, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("shape"), shape);
  node->f_(c10::Symbol::attr("ratio"), ratio);
  return node;
}
// Builds a popart::atan2 node over `args` (ImplicitCast::All,
// OutputType::AsImplicitCastPromoted).
torch::jit::Node *createAtan2(torch::jit::Graph *graph,
                              const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::atan2, args,
                             ImplicitCast::All,
                             OutputType::AsImplicitCastPromoted);
}
// Builds a popart::expm1 node over `args` (ImplicitCast::None,
// OutputType::AsFirstInput).
torch::jit::Node *createExpm1(torch::jit::Graph *graph,
                              const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::expm1, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::log1p node over `args` (ImplicitCast::None,
// OutputType::AsFirstInput).
torch::jit::Node *createLog1p(torch::jit::Graph *graph,
                              const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::log1p, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::fmod node over `args` (ImplicitCast::All,
// OutputType::AsImplicitCastPromoted).
torch::jit::Node *createFmod(torch::jit::Graph *graph,
                             const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::fmod, args,
                             ImplicitCast::All,
                             OutputType::AsImplicitCastPromoted);
}
// Builds a popart::remainder node over `args` (ImplicitCast::All,
// OutputType::AsImplicitCastPromoted).
torch::jit::Node *createRemainder(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::remainder, args,
                             ImplicitCast::All,
                             OutputType::AsImplicitCastPromoted);
}
// Builds a popart::reverse node over `args` and records the "dimensions"
// attribute.
torch::jit::Node *createReverse(torch::jit::Graph *graph,
                                const std::vector<torch::jit::Value *> &args,
                                const std::vector<int64_t> &dimensions) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::reverse, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("dimensions"), dimensions);
  return node;
}
// Builds a popart::slice node over `args` and records the "ends", "starts"
// and "axes" attributes.
torch::jit::Node *createSlice(torch::jit::Graph *graph,
                              const std::vector<torch::jit::Value *> &args,
                              const std::vector<int64_t> &ends,
                              const std::vector<int64_t> &starts,
                              const std::vector<int64_t> &axes) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::slice, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("ends"), ends);
  node->is_(c10::Symbol::attr("starts"), starts);
  node->is_(c10::Symbol::attr("axes"), axes);
  return node;
}
// Builds a popart::bitwisenot node over `args` (ImplicitCast::None,
// OutputType::AsFirstInput).
torch::jit::Node *createBitwisenot(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::bitwisenot, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::bitwiseand node over `args` (ImplicitCast::None,
// OutputType::AsFirstInput).
torch::jit::Node *createBitwiseand(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::bitwiseand, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::bitwiseor node over `args` (ImplicitCast::None,
// OutputType::AsFirstInput).
torch::jit::Node *createBitwiseor(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::bitwiseor, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::bitwisexor node over `args` (ImplicitCast::None,
// OutputType::AsFirstInput).
torch::jit::Node *createBitwisexor(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::bitwisexor, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::bitwisexnor node over `args` (ImplicitCast::None,
// OutputType::AsFirstInput).
torch::jit::Node *createBitwisexnor(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::bitwisexnor, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a two-output popart::reducemedian node over `args`
// (OutputType::FirstAsFirstInputSecondAlwaysInt) and records "axes" and
// "keepdims".
torch::jit::Node *createReducemedian(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    std::vector<int64_t> axes, int64_t keepdims) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::reducemedian, args, ImplicitCast::None,
      OutputType::FirstAsFirstInputSecondAlwaysInt, 2);
  node->is_(c10::Symbol::attr("axes"), axes);
  node->i_(c10::Symbol::attr("keepdims"), keepdims);
  return node;
}
// Builds a popart::scatterreduce node over `args` and records the
// "axis_size", "axis", "reduction" and "enable_index_broadcast" attributes
// (the boolean is stored as an integer).
torch::jit::Node *createScatterreduce(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    std::int32_t axis_size, std::int32_t axis, bool enable_index_broadcast,
    std::int32_t reduction) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::scatterreduce, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis_size"), axis_size);
  node->i_(c10::Symbol::attr("axis"), axis);
  node->i_(c10::Symbol::attr("reduction"), reduction);
  node->i_(c10::Symbol::attr("enable_index_broadcast"),
           static_cast<std::int32_t>(enable_index_broadcast));
  return node;
}
// Builds a popart::groupedscatterreduce node over `args` and records the
// "axis_size", "axis", "reduction", "group_size" and
// "enable_index_broadcast" attributes (the boolean is stored as an integer).
torch::jit::Node *createGroupedscatterreduce(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    std::int32_t axis_size, std::int32_t axis, std::int32_t group_size,
    bool enable_index_broadcast, std::int32_t reduction) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::groupedscatterreduce, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis_size"), axis_size);
  node->i_(c10::Symbol::attr("axis"), axis);
  node->i_(c10::Symbol::attr("reduction"), reduction);
  node->i_(c10::Symbol::attr("group_size"), group_size);
  node->i_(c10::Symbol::attr("enable_index_broadcast"),
           static_cast<std::int32_t>(enable_index_broadcast));
  return node;
}
// Builds a popart::swish node over `args` (ImplicitCast::None,
// OutputType::AsFirstInput).
torch::jit::Node *createSwish(torch::jit::Graph *graph,
                              const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::swish, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::averagepool node over `args` and records the
// "kernel_shape", "ceil_mode", "count_include_pad", "pads" and "strides"
// attributes.
torch::jit::Node *createAveragepool(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    const std::vector<int64_t> &kernel_shape, int64_t ceil_mode,
    int64_t count_include_pad, const std::vector<int64_t> &pads,
    const std::vector<int64_t> &strides) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::averagepool, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("kernel_shape"), kernel_shape);
  node->i_(c10::Symbol::attr("ceil_mode"), ceil_mode);
  node->i_(c10::Symbol::attr("count_include_pad"), count_include_pad);
  node->is_(c10::Symbol::attr("pads"), pads);
  node->is_(c10::Symbol::attr("strides"), strides);
  return node;
}
// Builds a popart::convinteger node over `args` (OutputType::AlwaysInt) and
// records the "dilations", "group", "kernel_shape", "pads" and "strides"
// attributes.
torch::jit::Node *createConvinteger(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    const std::vector<int64_t> &dilations, int64_t group,
    const std::vector<int64_t> &kernel_shape, const std::vector<int64_t> &pads,
    const std::vector<int64_t> &strides) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::convinteger, args,
                          ImplicitCast::None, OutputType::AlwaysInt);
  node->is_(c10::Symbol::attr("dilations"), dilations);
  node->i_(c10::Symbol::attr("group"), group);
  node->is_(c10::Symbol::attr("kernel_shape"), kernel_shape);
  node->is_(c10::Symbol::attr("pads"), pads);
  node->is_(c10::Symbol::attr("strides"), strides);
  return node;
}
// Builds a popart::dequantizelinear node over `args`
// (ImplicitCast::ExceptSecond, OutputType::AlwaysFloat).
torch::jit::Node *createDequantizelinear(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::dequantizelinear, args,
                             ImplicitCast::ExceptSecond,
                             OutputType::AlwaysFloat);
}
// Builds a popart::dropout node over `args` and records the "num_outputs"
// and "ratio" attributes.
torch::jit::Node *createDropout(torch::jit::Graph *graph,
                                const std::vector<torch::jit::Value *> &args,
                                unsigned int num_outputs, float ratio) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::dropout, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("num_outputs"), num_outputs);
  node->f_(c10::Symbol::attr("ratio"), ratio);
  return node;
}
// Builds a popart::isinf node over `args` (OutputType::AlwaysBool) and
// records the "detect_negative" and "detect_positive" attributes.
torch::jit::Node *createIsinf(torch::jit::Graph *graph,
                              const std::vector<torch::jit::Value *> &args,
                              int64_t detect_negative,
                              int64_t detect_positive) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::isinf, args,
                          ImplicitCast::None, OutputType::AlwaysBool);
  node->i_(c10::Symbol::attr("detect_negative"), detect_negative);
  node->i_(c10::Symbol::attr("detect_positive"), detect_positive);
  return node;
}
// Builds a popart::matmulinteger node over `args` (ImplicitCast::None,
// OutputType::AlwaysInt).
torch::jit::Node *createMatmulinteger(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::matmulinteger, args,
                             ImplicitCast::None, OutputType::AlwaysInt);
}
// Builds a popart::maxpool node over `args` and records the "num_outputs",
// "kernel_shape", "ceil_mode", "dilations", "pads", "storage_order" and
// "strides" attributes.
torch::jit::Node *createMaxpool(torch::jit::Graph *graph,
                                const std::vector<torch::jit::Value *> &args,
                                unsigned int num_outputs,
                                const std::vector<int64_t> &kernel_shape,
                                int64_t ceil_mode,
                                const std::vector<int64_t> &dilations,
                                const std::vector<int64_t> &pads,
                                int64_t storage_order,
                                const std::vector<int64_t> &strides) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::maxpool, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("num_outputs"), num_outputs);
  node->is_(c10::Symbol::attr("kernel_shape"), kernel_shape);
  node->i_(c10::Symbol::attr("ceil_mode"), ceil_mode);
  node->is_(c10::Symbol::attr("dilations"), dilations);
  node->is_(c10::Symbol::attr("pads"), pads);
  node->i_(c10::Symbol::attr("storage_order"), storage_order);
  node->is_(c10::Symbol::attr("strides"), strides);
  return node;
}
// Builds a popart::mod node over `args` (ImplicitCast::All,
// OutputType::AsImplicitCastPromoted) and records the "fmod" attribute.
torch::jit::Node *createMod(torch::jit::Graph *graph,
                            const std::vector<torch::jit::Value *> &args,
                            int64_t fmod) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::mod, args, ImplicitCast::All,
      OutputType::AsImplicitCastPromoted);
  node->i_(c10::Symbol::attr("fmod"), fmod);
  return node;
}
// Builds a popart::nonmaxsuppression node over `args`
// (OutputType::AlwaysInt) and records the "center_point_box" attribute.
torch::jit::Node *createNonmaxsuppression(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    int64_t center_point_box) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::nonmaxsuppression, args,
                          ImplicitCast::None, OutputType::AlwaysInt);
  node->i_(c10::Symbol::attr("center_point_box"), center_point_box);
  return node;
}
// Builds a popart::qlinearconv node over `args` (OutputType::AlwaysUint8) and
// records the "dilations", "group", "kernel_shape", "pads" and "strides"
// attributes.
torch::jit::Node *createQlinearconv(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args,
    const std::vector<int64_t> &dilations, int64_t group,
    const std::vector<int64_t> &kernel_shape, const std::vector<int64_t> &pads,
    const std::vector<int64_t> &strides) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::qlinearconv, args,
                          ImplicitCast::None, OutputType::AlwaysUint8);
  node->is_(c10::Symbol::attr("dilations"), dilations);
  node->i_(c10::Symbol::attr("group"), group);
  node->is_(c10::Symbol::attr("kernel_shape"), kernel_shape);
  node->is_(c10::Symbol::attr("pads"), pads);
  node->is_(c10::Symbol::attr("strides"), strides);
  return node;
}
// Builds a popart::qlinearmatmul node over `args` (ImplicitCast::None,
// OutputType::AlwaysUint8).
torch::jit::Node *createQlinearmatmul(
    torch::jit::Graph *graph, const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::qlinearmatmul, args,
                             ImplicitCast::None, OutputType::AlwaysUint8);
}
// Returns the popart::quantizelinear node produced by createAndInsertNode for
// `args`, requested with ImplicitCast::None and OutputType::AlwaysUint8.
torch::jit::Node* createQuantizelinear(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::quantizelinear, args,
      ImplicitCast::None, OutputType::AlwaysUint8);
}
// Creates a popart::resize node via createAndInsertNode (ImplicitCast::None,
// OutputType::AsFirstInput) and stores the six resize options as string,
// float and int attributes under their parameter names.
torch::jit::Node* createResize(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    const std::string &coordinate_transformation_mode,
    float cubic_coeff_a,
    int64_t exclude_outside,
    float extrapolation_value,
    const std::string & mode,
    const std::string &nearest_mode) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::resize, args,
      ImplicitCast::None, OutputType::AsFirstInput);

  node->s_(c10::Symbol::attr("coordinate_transformation_mode"),
           coordinate_transformation_mode);
  node->f_(c10::Symbol::attr("cubic_coeff_a"), cubic_coeff_a);
  node->i_(c10::Symbol::attr("exclude_outside"), exclude_outside);
  node->f_(c10::Symbol::attr("extrapolation_value"), extrapolation_value);
  node->s_(c10::Symbol::attr("mode"), mode);
  node->s_(c10::Symbol::attr("nearest_mode"), nearest_mode);
  return node;
}
// Creates a popart::reversesequence node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsFirstInput) and stores `batch_axis` and
// `time_axis` as int attributes.
torch::jit::Node* createReversesequence(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t batch_axis,
    int64_t time_axis) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::reversesequence, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("batch_axis"), batch_axis);
  node->i_(c10::Symbol::attr("time_axis"), time_axis);
  return node;
}
// Creates a popart::roialign node via createAndInsertNode
// (ImplicitCast::ExceptThird, OutputType::AsImplicitCastPromoted) and stores
// mode, output_height, output_width, sampling_ratio and spatial_scale as
// attributes.
torch::jit::Node* createRoialign(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    const std::string & mode,
    int64_t output_height,
    int64_t output_width,
    int64_t sampling_ratio,
    float spatial_scale) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::roialign, args,
      ImplicitCast::ExceptThird, OutputType::AsImplicitCastPromoted);
  node->s_(c10::Symbol::attr("mode"), mode);
  node->i_(c10::Symbol::attr("output_height"), output_height);
  node->i_(c10::Symbol::attr("output_width"), output_width);
  node->i_(c10::Symbol::attr("sampling_ratio"), sampling_ratio);
  node->f_(c10::Symbol::attr("spatial_scale"), spatial_scale);
  return node;
}
// Creates a popart::thresholdedrelu node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsFirstInput) and stores `alpha` as a
// float attribute.
torch::jit::Node* createThresholdedrelu(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    float alpha) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::thresholdedrelu, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->f_(c10::Symbol::attr("alpha"), alpha);
  return node;
}
// Creates a popart::topk node via createAndInsertNode (ImplicitCast::None,
// OutputType::FirstAsFirstInputSecondAlwaysInt, trailing argument 2) and
// stores `axis`, `largest` and `sorted` as int attributes. The two booleans
// are converted with static_cast<int32_t> before being stored.
torch::jit::Node* createTopk(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t axis,
    bool largest,
    bool sorted) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::topk, args,
      ImplicitCast::None, OutputType::FirstAsFirstInputSecondAlwaysInt, 2);
  node->i_(c10::Symbol::attr("axis"), axis);
  node->i_(c10::Symbol::attr("largest"), static_cast<int32_t>(largest));
  node->i_(c10::Symbol::attr("sorted"), static_cast<int32_t>(sorted));
  return node;
}

// Creates a popart::sort node via createAndInsertNode (ImplicitCast::None,
// OutputType::FirstAsFirstInputSecondAlwaysInt, trailing argument 2) and
// stores `axis`, `descending` and `stable` as int attributes. The two
// booleans are converted with static_cast<int32_t> before being stored.
torch::jit::Node* createSort(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t axis,
    bool descending,
    bool stable) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::sort, args,
      ImplicitCast::None, OutputType::FirstAsFirstInputSecondAlwaysInt, 2);
  node->i_(c10::Symbol::attr("axis"), axis);
  node->i_(c10::Symbol::attr("descending"), static_cast<int32_t>(descending));
  node->i_(c10::Symbol::attr("stable"), static_cast<int32_t>(stable));
  return node;
}
// Creates a popart::upsample node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsFirstInput) and stores `mode` as a
// string attribute.
torch::jit::Node* createUpsample(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    const std::string & mode) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::upsample, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->s_(c10::Symbol::attr("mode"), mode);
  return node;
}
// Returns the popart::acosh node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createAcosh(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::acosh, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Returns the popart::asinh node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createAsinh(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::asinh, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Returns the popart::atanh node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createAtanh(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::atanh, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Creates a popart::cast node via createAndInsertNode (ImplicitCast::None,
// OutputType::AsDtype) and stores `to` as a string attribute.
torch::jit::Node* createCast(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    const std::string & to) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::cast, args,
      ImplicitCast::None, OutputType::AsDtype);
  node->s_(c10::Symbol::attr("to"), to);
  // Output types are set again now that the attribute exists on the node
  // (presumably OutputType::AsDtype reads it -- TODO confirm).
  setNodeOutputsTypes(node, ImplicitCast::None, OutputType::AsDtype);
  return node;
}
// Creates a popart::compress node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsFirstInput) and stores `axis` as an int
// attribute.
torch::jit::Node* createCompress(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    std::int32_t axis) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::compress, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis"), axis);
  return node;
}
// Returns the popart::cosh node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createCosh(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::cosh, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Returns the popart::erf node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createErf(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::erf, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Creates a popart::eyelike node via createAndInsertNode (ImplicitCast::None,
// OutputType::AsDtype) and stores `dtype` and `k` as int attributes.
torch::jit::Node* createEyelike(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    std::int32_t dtype,
    int64_t k) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::eyelike, args,
      ImplicitCast::None, OutputType::AsDtype);
  node->i_(c10::Symbol::attr("dtype"), dtype);
  node->i_(c10::Symbol::attr("k"), k);
  // Output types are set again now that the attributes exist on the node
  // (presumably OutputType::AsDtype reads `dtype` -- TODO confirm).
  setNodeOutputsTypes(node, ImplicitCast::None, OutputType::AsDtype);
  return node;
}
// Creates a popart::flatten node via createAndInsertNode (ImplicitCast::None,
// OutputType::AsFirstInput) and stores `axis` as an int attribute.
torch::jit::Node* createFlatten(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t axis) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::flatten, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis"), axis);
  return node;
}
// Creates a popart::gemm node via createAndInsertNode (ImplicitCast::All,
// OutputType::AsImplicitCastPromoted) and stores alpha/beta as float
// attributes and transA/transB as int attributes.
torch::jit::Node* createGemm(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    float alpha,
    float beta,
    int64_t transA,
    int64_t transB) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::gemm, args,
      ImplicitCast::All, OutputType::AsImplicitCastPromoted);
  node->f_(c10::Symbol::attr("alpha"), alpha);
  node->f_(c10::Symbol::attr("beta"), beta);
  node->i_(c10::Symbol::attr("transA"), transA);
  node->i_(c10::Symbol::attr("transB"), transB);
  return node;
}
// Returns the popart::greater node produced by createAndInsertNode for
// `args`, requested with ImplicitCast::All and OutputType::AlwaysBool.
torch::jit::Node* createGreater(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::greater, args,
      ImplicitCast::All, OutputType::AlwaysBool);
}
// Returns the popart::isnan node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AlwaysBool.
torch::jit::Node* createIsnan(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::isnan, args,
      ImplicitCast::None, OutputType::AlwaysBool);
}
// Returns the popart::less node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::All and OutputType::AlwaysBool.
torch::jit::Node* createLess(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::less, args,
      ImplicitCast::All, OutputType::AlwaysBool);
}
// Returns the popart::matmul node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::All and OutputType::AsImplicitCastPromoted.
torch::jit::Node* createMatmul(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::matmul, args,
      ImplicitCast::All, OutputType::AsImplicitCastPromoted);
}
// Creates a popart::maxunpool node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsFirstInput) and stores kernel_shape,
// pads and strides as int-list attributes.
torch::jit::Node* createMaxunpool(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    const std::vector<int64_t> & kernel_shape,
    const std::vector<int64_t> & pads,
    const std::vector<int64_t> & strides) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::maxunpool, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("kernel_shape"), kernel_shape);
  node->is_(c10::Symbol::attr("pads"), pads);
  node->is_(c10::Symbol::attr("strides"), strides);
  return node;
}
// Creates a popart::meanvariancenormalization node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsFirstInput) and stores `axes` as an
// int-list attribute.
torch::jit::Node* createMeanvariancenormalization(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    const std::vector<int64_t> & axes) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::meanvariancenormalization, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("axes"), axes);
  return node;
}
// Returns the popart::nonzero node produced by createAndInsertNode for
// `args`, requested with ImplicitCast::None and OutputType::AlwaysInt.
torch::jit::Node* createNonzero(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::nonzero, args,
      ImplicitCast::None, OutputType::AlwaysInt);
}
// Creates a popart::onehot node via createAndInsertNode (ImplicitCast::None,
// OutputType::AsFirstInput) and stores `axis` as an int attribute.
torch::jit::Node* createOnehot(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t axis) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::onehot, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis"), axis);
  return node;
}
// Creates a popart::scatter node via createAndInsertNode (ImplicitCast::None,
// OutputType::AsFirstInput) and stores `axis` as an int attribute.
torch::jit::Node* createScatter(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t axis) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::scatter, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis"), axis);
  return node;
}
// Creates a popart::scatterelements node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsFirstInput) and stores `axis` as an int
// attribute.
torch::jit::Node* createScatterElements(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t axis) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::scatterelements, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis"), axis);
  return node;
}
// Creates a popart::shrink node via createAndInsertNode (ImplicitCast::None,
// OutputType::AsFirstInput) and stores `bias` and `lambd` as float
// attributes.
torch::jit::Node* createShrink(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    float bias,
    float lambd) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::shrink, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->f_(c10::Symbol::attr("bias"), bias);
  node->f_(c10::Symbol::attr("lambd"), lambd);
  return node;
}
// Returns the popart::sign node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createSign(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::sign, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Returns the popart::sinh node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createSinh(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::sinh, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Creates a popart::tfidfvectorizer node via createAndInsertNode
// (ImplicitCast::None, OutputType::AlwaysFloat) and stores every option as a
// node attribute under its parameter name: ints via i_, the mode string via
// s_, int lists via is_, the string pool via ss_ and the weights via fs_.
torch::jit::Node* createTfidfvectorizer(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t max_gram_length,
    int64_t max_skip_count,
    int64_t min_gram_length,
    const std::string & mode,
    const std::vector<int64_t> & ngram_counts,
    const std::vector<int64_t> & ngram_indexes,
    const std::vector<int64_t> & pool_int64s,
    const std::vector<std::string> & pool_strings,
    std::vector<double> weights) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::tfidfvectorizer, args,
      ImplicitCast::None, OutputType::AlwaysFloat);
  node->i_(c10::Symbol::attr("max_gram_length"), max_gram_length);
  node->i_(c10::Symbol::attr("max_skip_count"), max_skip_count);
  node->i_(c10::Symbol::attr("min_gram_length"), min_gram_length);
  node->s_(c10::Symbol::attr("mode"), mode);
  node->is_(c10::Symbol::attr("ngram_counts"), ngram_counts);
  node->is_(c10::Symbol::attr("ngram_indexes"), ngram_indexes);
  node->is_(c10::Symbol::attr("pool_int64s"), pool_int64s);
  node->ss_(c10::Symbol::attr("pool_strings"), pool_strings);
  node->fs_(c10::Symbol::attr("weights"), weights);
  return node;
}
// Returns the popart::where node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::ExceptFirst and
// OutputType::AsImplicitCastPromoted.
torch::jit::Node* createWhere(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::where, args,
      ImplicitCast::ExceptFirst, OutputType::AsImplicitCastPromoted);
}
// Returns the popart::expand node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createExpand(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::expand, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Returns the popart::max node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::All and OutputType::AsImplicitCastPromoted.
torch::jit::Node* createMax(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::max, args,
      ImplicitCast::All, OutputType::AsImplicitCastPromoted);
}
// Returns the popart::mean node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::All and OutputType::AsImplicitCastPromoted.
torch::jit::Node* createMean(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::mean, args,
      ImplicitCast::All, OutputType::AsImplicitCastPromoted);
}
// Returns the popart::min node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::All and OutputType::AsImplicitCastPromoted.
torch::jit::Node* createMin(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::min, args,
      ImplicitCast::All, OutputType::AsImplicitCastPromoted);
}
// Returns the popart::sum node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::All and OutputType::AsImplicitCastPromoted.
torch::jit::Node* createSum(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::sum, args,
      ImplicitCast::All, OutputType::AsImplicitCastPromoted);
}
// Returns the popart::acos node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createAcos(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::acos, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Returns the popart::add node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::All and OutputType::AsImplicitCastPromoted.
torch::jit::Node* createAdd(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::add, args,
      ImplicitCast::All, OutputType::AsImplicitCastPromoted);
}
// Returns the popart::logical_and node produced by createAndInsertNode for
// `args`, requested with ImplicitCast::All and OutputType::AlwaysBool.
torch::jit::Node* createLogical_and(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::logical_and, args,
      ImplicitCast::All, OutputType::AlwaysBool);
}
// Returns the popart::asin node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createAsin(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::asin, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Returns the popart::atan node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createAtan(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::atan, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Returns the popart::cos node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createCos(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::cos, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Returns the popart::div node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::All and OutputType::AsImplicitCastPromoted.
torch::jit::Node* createDiv(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::div, args,
      ImplicitCast::All, OutputType::AsImplicitCastPromoted);
}
// Returns the popart::equal node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::All and OutputType::AlwaysBool.
torch::jit::Node* createEqual(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::equal, args,
      ImplicitCast::All, OutputType::AlwaysBool);
}
// Returns the popart::mul node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::All and OutputType::AsImplicitCastPromoted.
torch::jit::Node* createMul(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::mul, args,
      ImplicitCast::All, OutputType::AsImplicitCastPromoted);
}
// Creates a popart::multinomial node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsDtype) and stores `dtype` and
// `sample_size` as int attributes and `seed` as a float attribute.
torch::jit::Node* createMultinomial(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t dtype,
    int64_t sample_size,
    float seed) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::multinomial, args,
      ImplicitCast::None, OutputType::AsDtype);
  node->i_(c10::Symbol::attr("dtype"), dtype);
  node->i_(c10::Symbol::attr("sample_size"), sample_size);
  node->f_(c10::Symbol::attr("seed"), seed);
  // Output types are set again now that the attributes exist on the node
  // (presumably OutputType::AsDtype reads `dtype` -- TODO confirm).
  setNodeOutputsTypes(node, ImplicitCast::None, OutputType::AsDtype);
  return node;
}
// Returns the popart::logical_or node produced by createAndInsertNode for
// `args`, requested with ImplicitCast::All and OutputType::AlwaysBool.
torch::jit::Node* createLogical_or(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::logical_or, args,
      ImplicitCast::All, OutputType::AlwaysBool);
}
// Returns the popart::pow node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::All and OutputType::AsImplicitCastPromoted.
torch::jit::Node* createPow(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::pow, args,
      ImplicitCast::All, OutputType::AsImplicitCastPromoted);
}
// Returns the popart::sin node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createSin(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::sin, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Returns the popart::sub node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::All and OutputType::AsImplicitCastPromoted.
torch::jit::Node* createSub(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::sub, args,
      ImplicitCast::All, OutputType::AsImplicitCastPromoted);
}
// Returns the popart::tan node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createTan(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::tan, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Returns the popart::logical_xor node produced by createAndInsertNode for
// `args`, requested with ImplicitCast::All and OutputType::AlwaysBool.
torch::jit::Node* createLogical_xor(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::logical_xor, args,
      ImplicitCast::All, OutputType::AlwaysBool);
}
// Returns the popart::abs node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createAbs(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::abs, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Creates a popart::argmax node via createAndInsertNode (ImplicitCast::None,
// OutputType::AlwaysInt) and stores `axis` and `keepdims` as int attributes.
torch::jit::Node* createArgmax(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t axis,
    int64_t keepdims) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::argmax, args,
      ImplicitCast::None, OutputType::AlwaysInt);
  node->i_(c10::Symbol::attr("axis"), axis);
  node->i_(c10::Symbol::attr("keepdims"), keepdims);
  return node;
}
// Creates a popart::argmin node via createAndInsertNode (ImplicitCast::None,
// OutputType::AlwaysInt) and stores `axis` and `keepdims` as int attributes.
torch::jit::Node* createArgmin(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t axis,
    int64_t keepdims) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::argmin, args,
      ImplicitCast::None, OutputType::AlwaysInt);
  node->i_(c10::Symbol::attr("axis"), axis);
  node->i_(c10::Symbol::attr("keepdims"), keepdims);
  return node;
}
// Returns the popart::ceil node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createCeil(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::ceil, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Returns the popart::clip node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsImplicitCastPromoted.
torch::jit::Node* createClip(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::clip, args,
      ImplicitCast::None, OutputType::AsImplicitCastPromoted);
}
// Creates a popart::concat node via createAndInsertNode (ImplicitCast::All,
// OutputType::AsImplicitCastPromoted) and stores `axis` as an int attribute.
torch::jit::Node* createConcat(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t axis) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::concat, args,
      ImplicitCast::All, OutputType::AsImplicitCastPromoted);
  node->i_(c10::Symbol::attr("axis"), axis);
  return node;
}
// Creates a popart::conv node via createAndInsertNode (ImplicitCast::None,
// OutputType::AsFirstInput) and stores dilations, group, kernel_shape, pads
// and strides as attributes.
torch::jit::Node* createConv(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    const std::vector<int64_t> & dilations,
    int64_t group,
    const std::vector<int64_t> & kernel_shape,
    const std::vector<int64_t> & pads,
    const std::vector<int64_t> & strides) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::conv, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("dilations"), dilations);
  node->i_(c10::Symbol::attr("group"), group);
  node->is_(c10::Symbol::attr("kernel_shape"), kernel_shape);
  node->is_(c10::Symbol::attr("pads"), pads);
  node->is_(c10::Symbol::attr("strides"), strides);
  return node;
}
// Creates a popart::convtranspose node via createAndInsertNode
// (ImplicitCast::All, OutputType::AsImplicitCastPromoted) and stores
// dilations, group, kernel_shape, output_padding, output_shape, pads and
// strides as attributes.
torch::jit::Node* createConvtranspose(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    const std::vector<int64_t> & dilations,
    int64_t group,
    const std::vector<int64_t> & kernel_shape,
    const std::vector<int64_t> & output_padding,
    const std::vector<int64_t> & output_shape,
    const std::vector<int64_t> & pads,
    const std::vector<int64_t> & strides) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::convtranspose, args,
      ImplicitCast::All, OutputType::AsImplicitCastPromoted);
  node->is_(c10::Symbol::attr("dilations"), dilations);
  node->i_(c10::Symbol::attr("group"), group);
  node->is_(c10::Symbol::attr("kernel_shape"), kernel_shape);
  node->is_(c10::Symbol::attr("output_padding"), output_padding);
  node->is_(c10::Symbol::attr("output_shape"), output_shape);
  node->is_(c10::Symbol::attr("pads"), pads);
  node->is_(c10::Symbol::attr("strides"), strides);
  return node;
}
// Creates a popart::elu node via createAndInsertNode (ImplicitCast::None,
// OutputType::AsFirstInput) and stores `alpha` as a float attribute.
torch::jit::Node* createElu(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    float alpha) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::elu, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->f_(c10::Symbol::attr("alpha"), alpha);
  return node;
}
// Returns the popart::exp node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createExp(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::exp, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Returns the popart::floor node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createFloor(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::floor, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Creates a popart::gather node via createAndInsertNode (ImplicitCast::None,
// OutputType::AsFirstInput) and stores `axis` as an int attribute.
torch::jit::Node* createGather(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t axis) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::gather, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis"), axis);
  return node;
}
// Creates a popart::groupedgather node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsFirstInput) and stores `axis` and
// `group_size` as int attributes.
torch::jit::Node* createGroupedgather(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t axis,
    int64_t group_size) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::groupedgather, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis"), axis);
  node->i_(c10::Symbol::attr("group_size"), group_size);
  return node;
}
// Returns the popart::globalaveragepool node produced by createAndInsertNode
// for `args`, requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createGlobalaveragepool(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::globalaveragepool, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Creates a popart::globallppool node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsFirstInput) and stores `p` as an int
// attribute.
torch::jit::Node* createGloballppool(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t p) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::globallppool, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("p"), p);
  return node;
}
// Returns the popart::globalmaxpool node produced by createAndInsertNode for
// `args`, requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createGlobalmaxpool(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::globalmaxpool, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Creates a popart::hardsigmoid node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsFirstInput) and stores `alpha` and
// `beta` as float attributes.
torch::jit::Node* createHardsigmoid(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    float alpha,
    float beta) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::hardsigmoid, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->f_(c10::Symbol::attr("alpha"), alpha);
  node->f_(c10::Symbol::attr("beta"), beta);
  return node;
}
// Creates a popart::hardmax node via createAndInsertNode (ImplicitCast::None,
// OutputType::AsFirstInput) and stores `axis` as an int attribute.
torch::jit::Node* createHardmax(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t axis) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::hardmax, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis"), axis);
  return node;
}
// Returns the popart::identity node produced by createAndInsertNode for
// `args`, requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createIdentity(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::identity, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Creates a popart::instancenormalization node via createAndInsertNode
// (ImplicitCast::All, OutputType::AsImplicitCastPromoted) and stores
// `epsilon` as a float attribute.
torch::jit::Node* createInstancenormalization(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    float epsilon) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::instancenormalization, args,
      ImplicitCast::All, OutputType::AsImplicitCastPromoted);
  node->f_(c10::Symbol::attr("epsilon"), epsilon);
  return node;
}
// Creates a popart::lrn node via createAndInsertNode (ImplicitCast::None,
// OutputType::AsFirstInput) and stores `size` as an int attribute and
// alpha/beta/bias as float attributes.
torch::jit::Node* createLrn(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t size,
    float alpha,
    float beta,
    float bias) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::lrn, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("size"), size);
  node->f_(c10::Symbol::attr("alpha"), alpha);
  node->f_(c10::Symbol::attr("beta"), beta);
  node->f_(c10::Symbol::attr("bias"), bias);
  return node;
}
// Creates a popart::leakyrelu node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsFirstInput) and stores `alpha` as a
// float attribute.
torch::jit::Node* createLeakyrelu(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    float alpha) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::leakyrelu, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->f_(c10::Symbol::attr("alpha"), alpha);
  return node;
}
// Returns the popart::log node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createLog(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::log, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Creates a popart::logsoftmax node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsFirstInput) and stores `axis` as an int
// attribute.
torch::jit::Node* createLogsoftmax(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t axis) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::logsoftmax, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis"), axis);
  return node;
}
// Creates a popart::lpnormalization node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsFirstInput) and stores `axis` and `p`
// as int attributes.
torch::jit::Node* createLpnormalization(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    int64_t axis,
    int64_t p) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::lpnormalization, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis"), axis);
  node->i_(c10::Symbol::attr("p"), p);
  return node;
}
// Creates a popart::lppool node via createAndInsertNode (ImplicitCast::None,
// OutputType::AsFirstInput) and stores kernel_shape, p, pads and strides as
// attributes.
torch::jit::Node* createLppool(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    const std::vector<int64_t> & kernel_shape,
    int64_t p,
    const std::vector<int64_t> & pads,
    const std::vector<int64_t> & strides) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::lppool, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("kernel_shape"), kernel_shape);
  node->i_(c10::Symbol::attr("p"), p);
  node->is_(c10::Symbol::attr("pads"), pads);
  node->is_(c10::Symbol::attr("strides"), strides);
  return node;
}
// Creates a popart::maxroipool node via createAndInsertNode
// (ImplicitCast::All, OutputType::AsImplicitCastPromoted) and stores
// `pooled_shape` as an int-list attribute and `spatial_scale` as a float
// attribute.
torch::jit::Node* createMaxroipool(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    const std::vector<int64_t> & pooled_shape,
    float spatial_scale) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::maxroipool, args,
      ImplicitCast::All, OutputType::AsImplicitCastPromoted);
  node->is_(c10::Symbol::attr("pooled_shape"), pooled_shape);
  node->f_(c10::Symbol::attr("spatial_scale"), spatial_scale);
  return node;
}
// Returns the popart::neg node produced by createAndInsertNode for `args`,
// requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createNeg(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::neg, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Returns the popart::logical_not node produced by createAndInsertNode for
// `args`, requested with ImplicitCast::None and OutputType::AlwaysBool.
torch::jit::Node* createLogical_not(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::logical_not, args,
      ImplicitCast::None, OutputType::AlwaysBool);
}
// Creates a popart::pad node via createAndInsertNode (ImplicitCast::None,
// OutputType::AsFirstInput) and stores `mode` as a string attribute.
torch::jit::Node* createPad(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    const std::string & mode) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::pad, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->s_(c10::Symbol::attr("mode"), mode);
  return node;
}
// Creates a popart::randomnormallike node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsDtypeOrAsPromoted) and stores `dtype`
// as an int attribute and mean/scale/seed as float attributes.
torch::jit::Node* createRandomnormallike(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    std::int32_t dtype,
    float mean,
    float scale,
    float seed) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::randomnormallike, args,
      ImplicitCast::None, OutputType::AsDtypeOrAsPromoted);
  node->i_(c10::Symbol::attr("dtype"), dtype);
  node->f_(c10::Symbol::attr("mean"), mean);
  node->f_(c10::Symbol::attr("scale"), scale);
  node->f_(c10::Symbol::attr("seed"), seed);
  // Output types are set again after the attributes are in place.
  // NOTE(review): this call passes ImplicitCast::All whereas the node was
  // created with ImplicitCast::None -- confirm the difference is intended.
  setNodeOutputsTypes(node, ImplicitCast::All,
                      OutputType::AsDtypeOrAsPromoted);
  return node;
}
// Creates a popart::randomuniformlike node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsDtypeOrAsPromoted) and stores `dtype`
// as an int attribute and high/low/seed as float attributes.
torch::jit::Node* createRandomuniformlike(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    std::int32_t dtype,
    float high,
    float low,
    float seed) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::randomuniformlike, args,
      ImplicitCast::None, OutputType::AsDtypeOrAsPromoted);
  node->i_(c10::Symbol::attr("dtype"), dtype);
  node->f_(c10::Symbol::attr("high"), high);
  node->f_(c10::Symbol::attr("low"), low);
  node->f_(c10::Symbol::attr("seed"), seed);
  // Output types are set again after the attributes are in place.
  // NOTE(review): this call passes ImplicitCast::All whereas the node was
  // created with ImplicitCast::None -- confirm the difference is intended.
  setNodeOutputsTypes(node, ImplicitCast::All,
                      OutputType::AsDtypeOrAsPromoted);
  return node;
}
// Returns the popart::reciprocal node produced by createAndInsertNode for
// `args`, requested with ImplicitCast::None and OutputType::AsFirstInput.
torch::jit::Node* createReciprocal(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args) {
  return createAndInsertNode(
      graph, symbols::popart::reciprocal, args,
      ImplicitCast::None, OutputType::AsFirstInput);
}
// Creates a popart::reducel1 node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsFirstInput) and stores `axes` as an
// int-list attribute and `keepdims` as an int attribute.
torch::jit::Node* createReducel1(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    std::vector<int64_t> axes,
    int64_t keepdims) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::reducel1, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("axes"), axes);
  node->i_(c10::Symbol::attr("keepdims"), keepdims);
  return node;
}
// Creates a popart::reducel2 node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsFirstInput) and stores `axes` as an
// int-list attribute and `keepdims` as an int attribute.
torch::jit::Node* createReducel2(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    std::vector<int64_t> axes,
    int64_t keepdims) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::reducel2, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("axes"), axes);
  node->i_(c10::Symbol::attr("keepdims"), keepdims);
  return node;
}
// Creates a popart::reducelogsum node via createAndInsertNode
// (ImplicitCast::None, OutputType::AsFirstInput) and stores `axes` as an
// int-list attribute and `keepdims` as an int attribute.
torch::jit::Node* createReducelogsum(
    torch::jit::Graph *graph,
    const std::vector<torch::jit::Value *>& args,
    std::vector<int64_t> axes,
    int64_t keepdims) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::reducelogsum, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("axes"), axes);
  node->i_(c10::Symbol::attr("keepdims"), keepdims);
  return node;
}
// Builds a popart::reducelogsumexp node from `args`, inserts it in `graph`
// and records the `axes` and `keepdims` attributes.
torch::jit::Node *
createReducelogsumexp(torch::jit::Graph *graph,
                      const std::vector<torch::jit::Value *> &args,
                      std::vector<int64_t> axes, int64_t keepdims) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::reducelogsumexp, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("axes"), axes);
  node->i_(c10::Symbol::attr("keepdims"), keepdims);
  return node;
}
// Builds a popart::reducemax node from `args`, inserts it in `graph` and
// records the `axes` and `keepdims` attributes.
torch::jit::Node *createReducemax(torch::jit::Graph *graph,
                                  const std::vector<torch::jit::Value *> &args,
                                  std::vector<int64_t> axes,
                                  int64_t keepdims) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::reducemax, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("axes"), axes);
  node->i_(c10::Symbol::attr("keepdims"), keepdims);
  return node;
}
// Builds a popart::reducemean node from `args`, inserts it in `graph` and
// records the `axes` and `keepdims` attributes.
torch::jit::Node *
createReducemean(torch::jit::Graph *graph,
                 const std::vector<torch::jit::Value *> &args,
                 std::vector<int64_t> axes, int64_t keepdims) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::reducemean, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("axes"), axes);
  node->i_(c10::Symbol::attr("keepdims"), keepdims);
  return node;
}
// Builds a popart::reducemin node from `args`, inserts it in `graph` and
// records the `axes` and `keepdims` attributes.
torch::jit::Node *createReducemin(torch::jit::Graph *graph,
                                  const std::vector<torch::jit::Value *> &args,
                                  std::vector<int64_t> axes,
                                  int64_t keepdims) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::reducemin, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("axes"), axes);
  node->i_(c10::Symbol::attr("keepdims"), keepdims);
  return node;
}
// Builds a popart::reduceprod node from `args`, inserts it in `graph` and
// records the `axes` and `keepdims` attributes.
torch::jit::Node *
createReduceprod(torch::jit::Graph *graph,
                 const std::vector<torch::jit::Value *> &args,
                 std::vector<int64_t> axes, int64_t keepdims) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::reduceprod, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("axes"), axes);
  node->i_(c10::Symbol::attr("keepdims"), keepdims);
  return node;
}
// Builds a popart::reducesum node from `args`, inserts it in `graph` and
// records the `axes` and `keepdims` attributes.
torch::jit::Node *createReducesum(torch::jit::Graph *graph,
                                  const std::vector<torch::jit::Value *> &args,
                                  std::vector<int64_t> axes,
                                  int64_t keepdims) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::reducesum, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("axes"), axes);
  node->i_(c10::Symbol::attr("keepdims"), keepdims);
  return node;
}
// Builds a popart::reducesumsquare node from `args`, inserts it in `graph`
// and records the `axes` and `keepdims` attributes.
torch::jit::Node *
createReducesumsquare(torch::jit::Graph *graph,
                      const std::vector<torch::jit::Value *> &args,
                      std::vector<int64_t> axes, int64_t keepdims) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::reducesumsquare, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("axes"), axes);
  node->i_(c10::Symbol::attr("keepdims"), keepdims);
  return node;
}
// Builds a popart::relu node from `args` and inserts it in `graph`
// (ImplicitCast::None, OutputType::AsFirstInput).
torch::jit::Node *createRelu(torch::jit::Graph *graph,
                             const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::relu, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::selu node from `args`, inserts it in `graph` and records
// the `alpha` and `gamma` attributes.
torch::jit::Node *createSelu(torch::jit::Graph *graph,
                             const std::vector<torch::jit::Value *> &args,
                             float alpha, float gamma) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::selu, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->f_(c10::Symbol::attr("alpha"), alpha);
  node->f_(c10::Symbol::attr("gamma"), gamma);
  return node;
}
// Builds a popart::shape node from `args` and inserts it in `graph`
// (ImplicitCast::None, OutputType::AlwaysInt).
torch::jit::Node *createShape(torch::jit::Graph *graph,
                              const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::shape, args,
                             ImplicitCast::None, OutputType::AlwaysInt);
}
// Builds a popart::sigmoid node from `args` and inserts it in `graph`
// (ImplicitCast::None, OutputType::AsFirstInput).
torch::jit::Node *createSigmoid(torch::jit::Graph *graph,
                                const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::sigmoid, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::size node from `args` and inserts it in `graph`
// (ImplicitCast::None, OutputType::AlwaysInt).
torch::jit::Node *createSize(torch::jit::Graph *graph,
                             const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::size, args,
                             ImplicitCast::None, OutputType::AlwaysInt);
}
// Builds a popart::softmax node from `args`, inserts it in `graph` and
// records the `axis` attribute.
torch::jit::Node *createSoftmax(torch::jit::Graph *graph,
                                const std::vector<torch::jit::Value *> &args,
                                int64_t axis) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::softmax, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis"), axis);
  return node;
}
// Builds a popart::softplus node from `args` and inserts it in `graph`
// (ImplicitCast::None, OutputType::AsFirstInput).
torch::jit::Node *createSoftplus(torch::jit::Graph *graph,
                                 const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::softplus, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::softsign node from `args` and inserts it in `graph`
// (ImplicitCast::None, OutputType::AsFirstInput).
torch::jit::Node *createSoftsign(torch::jit::Graph *graph,
                                 const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::softsign, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::spacetodepth node from `args`, inserts it in `graph` and
// records the `blocksize` attribute.
torch::jit::Node *
createSpacetodepth(torch::jit::Graph *graph,
                   const std::vector<torch::jit::Value *> &args,
                   int64_t blocksize) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::spacetodepth, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("blocksize"), blocksize);
  return node;
}
// Builds a two-output popart::splinebasis node from `args`, inserts it in
// `graph` and records the `degree` attribute. Output typing is
// OutputType::FirstAsFirstInputSecondAlwaysInt.
torch::jit::Node *
createSplinebasis(torch::jit::Graph *graph,
                  const std::vector<torch::jit::Value *> &args,
                  std::int32_t degree) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::splinebasis, args, ImplicitCast::None,
      OutputType::FirstAsFirstInputSecondAlwaysInt, 2);
  node->i_(c10::Symbol::attr("degree"), degree);
  return node;
}
// Builds a popart::splineweighting node from `args` and inserts it in
// `graph` (ImplicitCast::None, OutputType::AsFirstInput).
torch::jit::Node *
createSplineweighting(torch::jit::Graph *graph,
                      const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::splineweighting, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::split node with `num_outputs` outputs from `args`,
// inserts it in `graph` and records the `num_outputs`, `axis` and `split`
// attributes.
torch::jit::Node *createSplit(torch::jit::Graph *graph,
                              const std::vector<torch::jit::Value *> &args,
                              unsigned int num_outputs, int64_t axis,
                              const std::vector<int64_t> &split) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::split, args, ImplicitCast::None,
      OutputType::AsFirstInput, num_outputs);
  node->i_(c10::Symbol::attr("num_outputs"), num_outputs);
  node->i_(c10::Symbol::attr("axis"), axis);
  node->is_(c10::Symbol::attr("split"), split);
  return node;
}
// Builds a popart::sqrt node from `args` and inserts it in `graph`
// (ImplicitCast::None, OutputType::AsFirstInput).
torch::jit::Node *createSqrt(torch::jit::Graph *graph,
                             const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::sqrt, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::squeeze node from `args`, inserts it in `graph` and
// records the `axes` attribute.
torch::jit::Node *createSqueeze(torch::jit::Graph *graph,
                                const std::vector<torch::jit::Value *> &args,
                                const std::vector<int64_t> &axes) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::squeeze, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("axes"), axes);
  return node;
}
// Builds a popart::tanh node from `args` and inserts it in `graph`
// (ImplicitCast::None, OutputType::AsFirstInput).
torch::jit::Node *createTanh(torch::jit::Graph *graph,
                             const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::tanh, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::tile node from `args` and inserts it in `graph`
// (ImplicitCast::None, OutputType::AsFirstInput).
torch::jit::Node *createTile(torch::jit::Graph *graph,
                             const std::vector<torch::jit::Value *> &args) {
  return createAndInsertNode(graph, symbols::popart::tile, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
// Builds a popart::transpose node from `args`, inserts it in `graph` and
// records the `perm` attribute.
torch::jit::Node *createTranspose(torch::jit::Graph *graph,
                                  const std::vector<torch::jit::Value *> &args,
                                  const std::vector<int64_t> &perm) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::transpose, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("perm"), perm);
  return node;
}
// Builds a popart::unsqueeze node from `args`, inserts it in `graph` and
// records the `axes` attribute.
torch::jit::Node *createUnsqueeze(torch::jit::Graph *graph,
                                  const std::vector<torch::jit::Value *> &args,
                                  const std::vector<int64_t> &axes) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::unsqueeze, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("axes"), axes);
  return node;
}


================================================
FILE: poptorch/source/ErrorOnUnsupportedAten.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/ir.h>

#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {

// Raises an error (through the ERROR macro) listing every distinct "aten::"
// op kind still present in the top-level nodes of `graph`. Does nothing when
// no such node is found.
void errorOnUnsupportedAten(torch::jit::Graph *graph) {
  // Collect each remaining aten kind once.
  std::unordered_set<torch::jit::Symbol> leftover_kinds;
  for (torch::jit::Node *candidate : graph->nodes()) {
    const auto kind = candidate->kind();
    if (kind.is_aten()) {
      leftover_kinds.insert(kind);
    }
  }

  if (leftover_kinds.empty()) {
    return;
  }

  // Build the comma separated list of qualified op names.
  std::stringstream names;
  const char *delimiter = "";
  for (const auto &kind : leftover_kinds) {
    names << delimiter << kind.toQualString();
    delimiter = ", ";
  }

  // Terminate compilation via error.
  ERROR("Unsupported ops found in compiled model: ["
        << names.str()
        << "]. Not all operations are supported yet by Graphcore's PyTorch "
           "compiler. If you believe any of these should be, please report "
           "this message to support@graphcore.ai.");
}

} // namespace poptorch


================================================
FILE: poptorch/source/FixupSetAvailableMemory.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/graph_node_list.h>
#include <torch/csrc/jit/ir/ir.h>

#include <algorithm>
#include <vector>

#include "PoptorchSymbols.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

using torch::jit::Node;
using torch::jit::Value;

namespace poptorch {

namespace {
// File-level list of nodes that a set_available_memory node could be attached
// to. Entries are appended by setAvailableMemoryAddPossibleInputOp() and
// removed by tryRemovePossibleInput() once they have been matched.
std::vector<Node *> amp_possible_input_nodes;

// Returns true when `node` is one of the popart op kinds accepted as an input
// of set_available_memory: gather, lstm, matmul, scatter or scatterreduce.
bool isValidInputOpForAMP(const Node *node) {
  namespace popart_syms = poptorch::symbols::popart;
  const auto node_kind = node->kind();

  if (node_kind == popart_syms::gather || node_kind == popart_syms::lstm) {
    return true;
  }
  if (node_kind == popart_syms::matmul || node_kind == popart_syms::scatter) {
    return true;
  }
  return node_kind == popart_syms::scatterreduce;
}

// Removes every occurrence of `input` from amp_possible_input_nodes, keeping
// the relative order of the remaining entries.
// Returns true if at least one entry was removed.
bool tryRemovePossibleInput(const Node *input) {
  const auto first_discarded =
      std::remove(amp_possible_input_nodes.begin(),
                  amp_possible_input_nodes.end(), input);
  const bool was_present = first_discarded != amp_possible_input_nodes.end();
  amp_possible_input_nodes.erase(first_discarded,
                                 amp_possible_input_nodes.end());
  return was_present;
}

// Depth-first search for an ancestor of `v` located exactly `depth_to_check`
// producer levels above it whose node is registered as a possible
// set_available_memory input.
//
// On success the matching node is removed from the list of possible inputs
// (side effect of tryRemovePossibleInput) and the corresponding value is
// returned; otherwise nullptr is returned.
//
// `depth` is the current recursion level; callers should leave it at 0.
torch::jit::Value *tryFindAncestor(torch::jit::Value *v, int depth_to_check,
                                   int depth = 0) {
  if (depth == depth_to_check) {
    // A match can only happen at exactly this depth, so there is no point in
    // descending any further: doing so would walk the whole ancestor graph
    // (potentially exponentially on DAGs) without ever being able to match.
    return tryRemovePossibleInput(v->node()) ? v : nullptr;
  }
  if (depth > depth_to_check) {
    // Only reachable when the requested depth is negative: nothing to find.
    return nullptr;
  }
  for (auto *inp : v->node()->inputs()) {
    if (torch::jit::Value *ancestor =
            tryFindAncestor(inp, depth_to_check, depth + 1)) {
      return ancestor;
    }
  }
  return nullptr;
}

} // namespace

// Registers `node` as a possible input for a later set_available_memory node.
// Nodes rejected by isValidInputOpForAMP() are ignored.
void setAvailableMemoryAddPossibleInputOp(torch::jit::Node *node) {
  if (isValidInputOpForAMP(node)) {
    logging::trace("Adding node {} as a possible input to set_available_memory",
                   nodeToString(node));
    amp_possible_input_nodes.push_back(node);
  }
}

// Makes sure the given set_available_memory node is attached to one of the
// ops previously registered through setAvailableMemoryAddPossibleInputOp().
//
// - If no possible input is registered, or the node's current input producer
//   is one of them, the graph is left untouched (in the latter case the
//   producer is removed from the list of possible inputs).
// - Otherwise the ancestors one level, then two levels, above the current
//   input are searched. If a registered op is found, set_available_memory is
//   moved right after it and rewired to consume its value.
//
// Raises an error (ERROR_ON) if the node is not a
// poptorch::set_available_memory node.
void moveSetAvailableMemoryIfRequired(
    torch::jit::Node *set_available_memory_node) {
  ERROR_ON(set_available_memory_node->kind() !=
           poptorch::symbols::poptorch::set_available_memory);
  if (amp_possible_input_nodes.empty()) {
    return;
  }

  // If the current input is already in the possible inputs list, remove it,
  // and return.
  Node *current_input_node = set_available_memory_node->input(0)->node();
  if (tryRemovePossibleInput(current_input_node)) {
    return;
  }
  logging::trace("Found set_available_memory node that might need fixup: {}",
                 nodeToString(set_available_memory_node));

  // The current input isn't among the possible inputs. Try to go through the
  // inputs of the input.
  //
  // If we don't find anything: try one more level. (In some cases there is
  // a reshape followed by an add).
  //
  // We don't want to do a full search in the graph as it
  // might lead to undesired results. The search will be limited to the
  // possible grandparent and great grandparent nodes that are made of
  // decomposed ops such as scatteradd and linear. These ops are composed
  // of multiple ops, and set_available_memory needs to find the suitable
  // op among them.
  torch::jit::Value *new_input =
      tryFindAncestor(set_available_memory_node->input(0), 1);
  if (new_input == nullptr) {
    new_input = tryFindAncestor(set_available_memory_node->input(0), 2);
  }
  if (new_input == nullptr) {
    logging::trace(
        "No matching ancestor found for set_available_memory node {}",
        nodeToString(set_available_memory_node));
    return;
  }

  auto *current_input = set_available_memory_node->input(0);
  logging::trace("Replacing set_available_memory input '%{}' with '%{}'",
                 current_input->debugName(), new_input->debugName());
  // Remove set_available_memory_node from its current position: its users
  // now consume its former input directly.
  set_available_memory_node->output()->replaceAllUsesWith(
      set_available_memory_node->input(0));

  // Replace all the uses of the new input with "set_available_memory".
  // This must happen before the node's own input is switched to new_input
  // below, otherwise that use would be redirected to the node's own output.
  new_input->replaceAllUsesWith(set_available_memory_node->output());
  // Update set_available_memory's input: place the node right after the
  // producer of new_input, then make it consume new_input.
  set_available_memory_node->moveAfter(new_input->node());
  set_available_memory_node->replaceInput(0, new_input);
}

} // namespace poptorch


================================================
FILE: poptorch/source/GNNOptimizations.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include <algorithm>
#include <array>
#include <map>
#include <queue>
#include <torch/csrc/jit/ir/ir.h>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "popart_canonicalization/PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "PoptorchSymbols.hpp"

namespace poptorch {

namespace {

// The (up to) three input values of a single scatter/gather node, as returned
// by getInputArgs(). Used as a map key to detect duplicated nodes.
using InputArgs =
    std::tuple<torch::jit::Value *, torch::jit::Value *, torch::jit::Value *>;
// Three vectors of input values gathered across a list of nodes.
// NOTE(review): presumably one vector per input position - confirm against
// groupInputs().
using GroupedInputArgs = std::array<std::vector<torch::jit::Value *>, 3>;
// Callback creating the grouped op: receives the graph, the nodes being
// merged and the already concatenated inputs, and returns the new node.
using GroupedOpFactory = std::function<torch::jit::Node *(
    torch::jit::Graph *, const torch::jit::node_list &,
    const std::vector<torch::jit::Value *> &)>;

void groupScatterReduceNodes(torch::jit::Graph *graph);
void groupGatherNodes(torch::jit::Graph *graph);
void initQueue(torch::jit::Graph *graph, std::queue<torch::jit::Node *> &queue,
               torch::jit::node_list &barriers);
std::size_t deduceOpStage(const torch::jit::Node *node,
                          const torch::jit::node_list &barriers);
std::vector<torch::jit::Value *>
concatGroupedInputs(torch::jit::Graph *graph, GroupedInputArgs &grouped_inputs,
                    bool with_update);
torch::jit::Node *
createGroupedScatterReduceNode(torch::jit::Graph *graph,
                               const torch::jit::node_list &scatter_nodes,
                               const std::vector<torch::jit::Value *> &inputs);
torch::jit::Node *
createGroupedGatherNode(torch::jit::Graph *graph,
                        const torch::jit::node_list &gather_nodes,
                        const std::vector<torch::jit::Value *> &inputs);
torch::jit::node_list dispatch(torch::jit::Graph *graph,
                               torch::jit::node_list &nodes,
                               const GroupedOpFactory &createGroupedOpFn,
                               bool with_update = false);
InputArgs getInputArgs(const torch::jit::Node *node, bool with_update);
GroupedInputArgs groupInputs(const torch::jit::node_list &nodes,
                             bool with_update);
torch::jit::Node *mergeNodes(torch::jit::Graph *graph,
                             const torch::jit::node_list &nodes,
                             const GroupedOpFactory &createGroupedOpFn,
                             bool with_update);
void moveOutputNodesAfterInsertionPoint(const torch::jit::node_list &nodes,
                                        torch::jit::Node *insertion_point_node);
torch::jit::node_list removeDuplicates(const torch::jit::node_list &nodes,
                                       bool with_update);
void sortInTopologicalOrder(torch::jit::node_list &nodes);
void unpackGroupedOutputs(torch::jit::Graph *graph,
                          torch::jit::Node *grouped_node,
                          const torch::jit::node_list &fused_nodes);

} // namespace

/*
 * Groups compatible popart::scatterreduce nodes, then compatible
 * popart::gather nodes. Each kind is handled by its own pass over the graph.
 *
 * Algorithm used by each pass:
 * 1. Walk the graph breadth-first, enqueueing a node only once all of its
 *    inputs have been visited, until the queue contains nothing but
 *    candidate nodes (scatters or gathers, depending on the pass).
 * 2. Merge the candidates.
 * 3. Remove the candidates from the queue and add the users of their
 *    outputs instead.
 * 4. If the queue is not empty go to step 1.
 */
void groupScatterReduceAndGatherNodes(torch::jit::Graph *graph) {
  groupScatterReduceNodes(graph);
  groupGatherNodes(graph);
}

// For every aten scatter_add / scatter_reduce node (in-place variants
// included) whose index is produced by an aten::expand or aten::expand_as
// node, feeds the un-expanded index to the scatter, sets its
// "enable_index_broadcast" attribute to 1 and destroys the expansion node.
//
// The rewrite is only applied when the expanded index has a single use and
// has the same shape as the scatter source.
void removeScatterAddIndexExpansion(torch::jit::Graph *graph) {
  const logging::LogContext ctx{"ScatterAddOptimization"};

  const auto is_scatter_op = [](const torch::jit::Node *n) {
    const auto kind = n->kind();
    return kind == c10::aten::scatter_add ||
           kind == c10::aten::scatter_add_ ||
           kind == c10::aten::scatter_reduce ||
           kind == c10::aten::scatter_reduce_;
  };
  const auto is_expand_op = [](const torch::jit::Node *n) {
    const auto kind = n->kind();
    return kind == c10::aten::expand || kind == c10::aten::expand_as;
  };

  // Expansion nodes are destroyed once the traversal is over so that the
  // node list is not modified while it is being iterated.
  std::vector<torch::jit::Node *> dead_expansions;

  for (auto *scatter : graph->nodes()) {
    if (!is_scatter_op(scatter)) {
      continue;
    }

    auto *expanded_index = scatter->input(2);
    auto *expansion = expanded_index->node();

    // Only remove index expansions.
    if (!is_expand_op(expansion)) {
      continue;
    }

    auto *src = scatter->input(3);
    auto *original_index = expansion->input(0);
    const auto expanded_index_shape = shapeFromTensor(expanded_index);

    // Make sure removal is valid
    const bool index_is_shared = expanded_index->uses().size() > 1;
    if (index_is_shared || shapeFromTensor(src) != expanded_index_shape) {
      continue;
    }

    logging::trace("Removing index expansion node: {}",
                   nodeToString(expansion));
    scatter->replaceInputWith(expanded_index, original_index);
    scatter->i_(c10::Symbol::attr("enable_index_broadcast"), 1);

    dead_expansions.push_back(expansion);
  }

  for (auto *expansion : dead_expansions) {
    expansion->destroy();
  }
}

namespace {
// Walks `graph` breadth-first and merges popart::scatterreduce nodes that
// become reachable in the same wave and share the same ScatterKind key
// (reduction, input scalar type, index broadcast flag, presence of an update
// input, stage, axis and axis size). The actual merging is delegated to
// dispatch() with createGroupedScatterReduceNode.
void groupScatterReduceNodes(torch::jit::Graph *graph) {
  logging::LogContext const ctx{"groupScatterReduceNodes"};

  // Queue contains fully reached nodes.
  std::queue<torch::jit::Node *> queue;
  torch::jit::node_list barriers;
  initQueue(graph, queue, barriers);

  // For each node: number of its input uses reached so far. A node is fully
  // reached once this equals its number of inputs.
  std::unordered_map<torch::jit::Node *, std::size_t> node_num_visited_inputs;

  // Position of the with_update flag inside ScatterKind.
  static constexpr auto with_update_idx = 3;

  using ScatterKind =
      std::tuple<std::int64_t /*reduction*/, at::ScalarType /*input_type*/,
                 bool /*index_broadcast_enabled*/, bool /*with_update*/,
                 std::size_t /*stage*/, std::int64_t /*axis*/,
                 std::int64_t /*axis_size*/>;
  // Fully reached scatters waiting to be merged, bucketed by kind.
  std::map<ScatterKind, torch::jit::node_list> scatters;

  // Number of scatter nodes currently sitting in the queue.
  std::size_t optimization_candidates = 0;

  // Lambda to add the children of the vertex: every user that becomes fully
  // reached is enqueued, and scatters are additionally registered as merge
  // candidates.
  const auto add_children_to_queue = [&](const torch::jit::Node *node) {
    for (const torch::jit::Value *output : node->outputs()) {
      for (const torch::jit::Use &use : output->uses()) {
        torch::jit::Node *user = use.user;
        const auto num_user_inputs = user->inputs().size();

        auto &num_user_visited_inputs = node_num_visited_inputs[user];
        ++num_user_visited_inputs;

        if (num_user_visited_inputs == num_user_inputs) {
          queue.push(user);
          if (user->kind() == symbols::popart::scatterreduce) {
            ++optimization_candidates;
            const std::int64_t reduction =
                user->i(c10::Symbol::attr("reduction"));
            const at::ScalarType input_type = *user->input(0)
                                                   ->type()
                                                   ->expect<c10::TensorType>()
                                                   ->scalarType();

            // A third input is treated as the tensor being updated.
            const bool with_update = num_user_inputs == 3;
            const bool index_broadcast_enabled =
                user->i(c10::Symbol::attr("enable_index_broadcast")) != 0;
            const std::size_t stage = deduceOpStage(user, barriers);
            const std::int64_t axis = user->i(c10::Symbol::attr("axis"));
            const std::int64_t axis_size =
                user->i(c10::Symbol::attr("axis_size"));
            const ScatterKind key{
                reduction,   input_type, index_broadcast_enabled,
                with_update, stage,      axis,
                axis_size};
            scatters[key].push_back(user);
          }
        }
      }
    }
  };

  const auto merge_scatters = [&]() {
    // add_children_to_queue() registers newly reached scatters in `scatters`.
    // Detach the current batch first: iterating `scatters` directly while it
    // is being extended, and clearing it afterwards, would either drop those
    // new candidates (their users would then never be explored) or merge
    // them while they are still referenced by the queue.
    std::map<ScatterKind, torch::jit::node_list> pending;
    pending.swap(scatters);

    for (auto &&[scatter_kind, scatter_vec] : pending) {
      if (scatter_vec.size() > 1) {
        const bool with_update = std::get<with_update_idx>(scatter_kind);
        const auto merged_scatters = dispatch(
            graph, scatter_vec, createGroupedScatterReduceNode, with_update);
        for (torch::jit::Node *scatter_node : merged_scatters) {
          add_children_to_queue(scatter_node);
        }
      } else {
        add_children_to_queue(scatter_vec.front());
      }
    }
  };

  while (!queue.empty()) {
    auto *node = queue.front();
    queue.pop();
    const torch::jit::Symbol kind = node->kind();

    // Scatters are kept in the queue until they are the only thing left.
    if (kind == symbols::popart::scatterreduce) {
      queue.push(node);
    } else {
      add_children_to_queue(node);
    }

    // If all elements of the queue are scatters.
    if (queue.size() == optimization_candidates) {
      // Clear queue: every queued node is also recorded in `scatters`.
      queue = std::queue<torch::jit::Node *>();
      optimization_candidates = 0;
      // Merge the scatters reached in this wave and enqueue their users.
      merge_scatters();
    }
  }
}

// Walks `graph` breadth-first and merges popart::gather nodes that become
// reachable in the same wave and share the same GatherKind key (input scalar
// type, axis and stage). The actual merging is delegated to dispatch() with
// createGroupedGatherNode.
void groupGatherNodes(torch::jit::Graph *graph) {
  logging::LogContext const ctx{"groupGatherNodes"};

  // Queue contains fully reached nodes.
  std::queue<torch::jit::Node *> queue;
  torch::jit::node_list barriers;
  initQueue(graph, queue, barriers);

  // For each node: number of its input uses reached so far. A node is fully
  // reached once this equals its number of inputs.
  std::unordered_map<torch::jit::Node *, std::size_t> node_num_visited_inputs;

  using GatherKind = std::tuple<at::ScalarType /*input_type*/,
                                std::int64_t /*axis*/, std::size_t /*stage*/>;
  // Fully reached gathers waiting to be merged, bucketed by kind.
  std::map<GatherKind, torch::jit::node_list> gathers;

  // Number of gather nodes currently sitting in the queue.
  std::size_t optimization_candidates = 0;

  // Lambda to add the children of the vertex: every user that becomes fully
  // reached is enqueued, and gathers are additionally registered as merge
  // candidates.
  const auto add_children_to_queue = [&](const torch::jit::Node *node) {
    for (const torch::jit::Value *output : node->outputs()) {
      for (const torch::jit::Use &use : output->uses()) {
        torch::jit::Node *user = use.user;
        const auto num_user_inputs = user->inputs().size();

        auto &num_user_visited_inputs = node_num_visited_inputs[user];
        ++num_user_visited_inputs;

        if (num_user_visited_inputs == num_user_inputs) {
          queue.push(user);
          if (user->kind() == symbols::popart::gather) {
            ++optimization_candidates;
            const at::ScalarType input_type = *user->input(0)
                                                   ->type()
                                                   ->expect<c10::TensorType>()
                                                   ->scalarType();

            const std::int64_t axis = user->i(c10::Symbol::attr("axis"));
            const std::size_t stage = deduceOpStage(user, barriers);
            const GatherKind key{input_type, axis, stage};
            gathers[key].push_back(user);
          }
        }
      }
    }
  };

  const auto merge_gathers = [&]() {
    // add_children_to_queue() registers newly reached gathers in `gathers`.
    // Detach the current batch first: iterating `gathers` directly while it
    // is being extended, and clearing it afterwards, would either drop those
    // new candidates (their users would then never be explored) or merge
    // them while they are still referenced by the queue.
    std::map<GatherKind, torch::jit::node_list> pending;
    pending.swap(gathers);

    for (auto &&[_, gather_vec] : pending) {
      UNUSED(_);
      if (gather_vec.size() > 1) {
        const auto merged_gathers =
            dispatch(graph, gather_vec, createGroupedGatherNode);
        for (torch::jit::Node *gather_node : merged_gathers) {
          add_children_to_queue(gather_node);
        }
      } else {
        add_children_to_queue(gather_vec.front());
      }
    }
  };

  while (!queue.empty()) {
    auto *node = queue.front();
    queue.pop();
    const torch::jit::Symbol kind = node->kind();

    // Gathers are kept in the queue until they are the only thing left.
    if (kind == symbols::popart::gather) {
      queue.push(node);
    } else {
      add_children_to_queue(node);
    }

    // If all elements of the queue are gathers.
    if (queue.size() == optimization_candidates) {
      // Clear queue: every queued node is also recorded in `gathers`.
      queue = std::queue<torch::jit::Node *>();
      optimization_candidates = 0;
      // Merge the gathers reached in this wave and enqueue their users.
      merge_gathers();
    }
  }
}

// Seeds `queue` with the roots of `graph` (each pushed once): every node
// without inputs, followed by the producer node of the graph inputs.
// Also appends every poptorch::begin_ipu_block node to `barriers`, in graph
// order.
void initQueue(torch::jit::Graph *graph, std::queue<torch::jit::Node *> &queue,
               torch::jit::node_list &barriers) {
  std::unordered_set<torch::jit::Node *> already_queued;
  const auto push_once = [&](torch::jit::Node *root) {
    // insert() reports whether the node was new.
    if (already_queued.insert(root).second) {
      queue.push(root);
    }
  };

  for (torch::jit::Node *candidate : graph->nodes()) {
    if (candidate->inputs().empty()) {
      push_once(candidate);
    }
    if (candidate->kind() == symbols::poptorch::begin_ipu_block) {
      barriers.push_back(candidate);
    }
  }

  for (torch::jit::Value *graph_input : graph->inputs()) {
    push_once(graph_input->node());
  }
}

// Find which phase the fused operation is in: returns the index of the first
// barrier that `node` is located before, or barriers.size() if there is none.
std::size_t deduceOpStage(const torch::jit::Node *node,
                          const torch::jit::node_list &barriers) {
  for (std::size_t idx = 0; idx < barriers.size(); ++idx) {
    if (node->isBefore(barriers[idx])) {
      return idx;
    }
  }
  return barriers.size();
}

// Buckets `nodes` by their "axis" attribute and by the shapes of their first
// two inputs (plus the third one when `with_update` is set). Buckets holding
// more than one node are merged through mergeNodes(); single nodes are kept
// as they are.
//
// Returns one node per bucket: either the merged node or the lone original.
torch::jit::node_list dispatch(torch::jit::Graph *graph,
                               torch::jit::node_list &nodes,
                               const GroupedOpFactory &createGroupedOpFn,
                               bool with_update) {
  using Dims = std::vector<std::int64_t>;
  using BucketKey = std::tuple<std::int64_t /*axis*/, Dims /*index*/,
                               Dims /*src*/, Dims /*self*/>;

  std::map<BucketKey, torch::jit::node_list> buckets;
  for (torch::jit::Node *candidate : nodes) {
    const std::int64_t axis = candidate->i(c10::Symbol::attr("axis"));
    const Dims src_dims = shapeFromTensor(candidate->input(0));
    const Dims index_dims = shapeFromTensor(candidate->input(1));
    Dims self_dims;
    if (with_update) {
      self_dims = shapeFromTensor(candidate->input(2));
    }

    buckets[BucketKey{axis, index_dims, src_dims, self_dims}].push_back(
        candidate);
  }

  torch::jit::node_list result;
  for (auto &bucket : buckets) {
    const torch::jit::node_list &members = bucket.second;
    torch::jit::Node *representative =
        members.size() > 1
            ? mergeNodes(graph, members, createGroupedOpFn, with_update)
            : members.front();
    result.push_back(representative);
  }
  return result;
}

// Replaces `nodes` with a single grouped node built by `createGroupedOpFn`.
//
// Steps: drop duplicated nodes, sort the remaining ones in graph order, move
// their users located before the last node after it, concatenate the inputs,
// create the grouped node at the position of the last node and finally
// unpack its outputs.
//
// Returns the grouped node.
torch::jit::Node *mergeNodes(torch::jit::Graph *graph,
                             const torch::jit::node_list &nodes,
                             const GroupedOpFactory &createGroupedOpFn,
                             bool with_update) {
  torch::jit::node_list distinct_nodes =
      poptorch::removeDuplicates(nodes, with_update);
  sortInTopologicalOrder(distinct_nodes);

  // The last node in graph order is where the grouped op gets inserted.
  torch::jit::Node *const anchor = distinct_nodes.back();
  moveOutputNodesAfterInsertionPoint(distinct_nodes, anchor);

  GroupedInputArgs inputs_per_position =
      groupInputs(distinct_nodes, with_update);

  // Scoped guards: both stay alive until the end of the function.
  const WithNodeMetadata meta{anchor};
  const torch::jit::WithInsertPoint insert_guard(anchor);

  const auto concatenated_args =
      concatGroupedInputs(graph, inputs_per_position, with_update);
  torch::jit::Node *const merged =
      createGroupedOpFn(graph, distinct_nodes, concatenated_args);

  unpackGroupedOutputs(graph, merged, distinct_nodes);
  return merged;
}

// Keeps a single node for each distinct set of input arguments (as returned
// by getInputArgs). The uses of a duplicate's output are redirected to the
// output of the first node seen with the same inputs, and the duplicate is
// handed over to searchAndPossiblyDestroy().
//
// Returns the surviving nodes, ordered by their input arguments key.
torch::jit::node_list removeDuplicates(const torch::jit::node_list &nodes,
                                       bool with_update) {
  std::map<InputArgs, torch::jit::Node *> first_node_for_inputs;
  std::unordered_set<torch::jit::Node *> duplicates;

  for (torch::jit::Node *candidate : nodes) {
    const auto [entry, is_first] = first_node_for_inputs.emplace(
        getInputArgs(candidate, with_update), candidate);
    if (!is_first) {
      torch::jit::Node *const kept = entry->second;
      replaceOutputUse(candidate->output(), kept->output());
      duplicates.insert(candidate);
    }
  }

  searchAndPossiblyDestroy(duplicates);

  torch::jit::node_list survivors;
  survivors.reserve(first_node_for_inputs.size());
  for (const auto &entry : first_node_for_inputs) {
    survivors.push_back(entry.second);
  }
  return survivors;
}

// Sorts `nodes` in place so that each node comes before the ones located
// after it in the graph, as reported by Node::isBefore().
void sortInTopologicalOrder(torch::jit::node_list &nodes) {
  const auto comes_first = [](const torch::jit::Node *a,
                              const torch::jit::Node *b) {
    return a->isBefore(b);
  };
  std::sort(nodes.begin(), nodes.end(), comes_first);
}

// Relocates every transitive user of `nodes` that currently sits before
// `insertion_point_node` so that it comes after it instead.
//
// Users are discovered breadth-first through the outputs of each node (the
// insertion point node itself is not used as a starting point). The collected
// users are then sorted with sortInTopologicalOrder and moved one after the
// other, each directly behind the previously moved one, so their relative
// order is kept.
void moveOutputNodesAfterInsertionPoint(
    const torch::jit::node_list &nodes,
    torch::jit::Node *insertion_point_node) {
  std::unordered_set<torch::jit::Node *> users_to_move;

  for (torch::jit::Node *root : nodes) {
    if (root == insertion_point_node) {
      continue;
    }

    std::queue<torch::jit::Node *> pending;
    const auto enqueue_users = [&pending](const torch::jit::Node *producer) {
      for (const auto *value : producer->outputs()) {
        for (const auto &use : value->uses()) {
          pending.push(use.user);
        }
      }
    };

    enqueue_users(root);
    while (!pending.empty()) {
      torch::jit::Node *user = pending.front();
      pending.pop();

      // Users already located after the insertion point need no move.
      if (!user->isBefore(insertion_point_node)) {
        continue;
      }
      // insert() reports whether the user was newly added; skip repeats.
      if (!users_to_move.insert(user).second) {
        continue;
      }
      enqueue_users(user);
    }
  }

  torch::jit::node_list ordered_users(users_to_move.begin(),
                                      users_to_move.end());
  sortInTopologicalOrder(ordered_users);

  torch::jit::Node *anchor = insertion_point_node;
  for (torch::jit::Node *user : ordered_users) {
    user->moveAfter(anchor);
    anchor = user;
  }
}

// Gathers the input arguments of all `nodes` into per-argument lists:
// entry [k][g] of the result is argument k of nodes[g], as returned by
// getInputArgs (the third argument is nullptr when `with_update` is false).
GroupedInputArgs groupInputs(const torch::jit::node_list &nodes,
                             bool with_update) {
  const int64_t num_groups = nodes.size();

  GroupedInputArgs grouped;
  for (auto &column : grouped) {
    column = std::vector<torch::jit::Value *>(num_groups, nullptr);
  }

  int64_t slot = 0;
  for (const torch::jit::Node *node : nodes) {
    auto &first = grouped[0][slot];
    auto &second = grouped[1][slot];
    auto &third = grouped[2][slot];
    std::tie(first, second, third) = getInputArgs(node, with_update);
    ++slot;
  }

  return grouped;
}

// Returns inputs 0 and 1 of `node`, plus input 2 when `with_update` is set
// (nullptr otherwise).
InputArgs getInputArgs(const torch::jit::Node *node, bool with_update) {
  auto *first = node->input(0);
  auto *second = node->input(1);
  auto *third = with_update ? node->input(2) : nullptr;
  return {first, second, third};
}

// Builds the operands of a grouped op from per-node operands.
//
// Each value in `grouped_inputs` is replaced in place by an unsqueezed copy
// (new axis 0), and then the values of each operand kind are concatenated
// along axis 0. The result holds one concatenated value per operand kind:
// two entries, or three when `with_update` is true.
std::vector<torch::jit::Value *>
concatGroupedInputs(torch::jit::Graph *graph, GroupedInputArgs &grouped_inputs,
                    bool with_update) {
  const std::size_t num_operands = with_update ? 3 : 2;
  const std::size_t num_groups = grouped_inputs[0].size();

  const auto add_leading_axis = [graph](torch::jit::Value *&value) {
    value = createUnsqueeze(graph, {value}, {0})->output();
  };

  // Unsqueeze group by group so the nodes are created in the same order as
  // the groups appear.
  for (std::size_t group = 0; group < num_groups; ++group) {
    for (std::size_t operand = 0; operand < num_operands; ++operand) {
      add_leading_axis(grouped_inputs[operand][group]);
    }
  }

  std::vector<torch::jit::Value *> grouped_args;
  grouped_args.reserve(3);
  for (std::size_t operand = 0; operand < num_operands; ++operand) {
    grouped_args.push_back(
        createConcat(graph, grouped_inputs[operand], 0)->output());
  }

  return grouped_args;
}

// Creates the grouped scatter-reduce node for `scatter_nodes`.
//
// The attributes (axis_size, axis, reduction, enable_index_broadcast) are read
// from the last node in `scatter_nodes`. The axis is shifted by one and the
// number of groups is the number of scatter nodes.
torch::jit::Node *
createGroupedScatterReduceNode(torch::jit::Graph *graph,
                               const torch::jit::node_list &scatter_nodes,
                               const std::vector<torch::jit::Value *> &inputs) {
  auto *const attr_source = scatter_nodes.back();
  const auto read_int = [attr_source](const char *name) {
    return attr_source->i(c10::Symbol::attr(name));
  };

  const auto axis_size = read_int("axis_size");
  const auto grouped_axis = read_int("axis") + 1;
  const auto reduction = read_int("reduction");
  const bool enable_index_broadcast = read_int("enable_index_broadcast") != 0;
  const int64_t num_groups = scatter_nodes.size();

  return createGroupedscatterreduce(graph, inputs, axis_size, grouped_axis,
                                    num_groups, enable_index_broadcast,
                                    reduction);
}

// Creates the grouped gather node for `gather_nodes`. The "axis" attribute is
// read from the last node in the list and shifted by one; the number of groups
// is the number of gather nodes.
torch::jit::Node *
createGroupedGatherNode(torch::jit::Graph *graph,
                        const torch::jit::node_list &gather_nodes,
                        const std::vector<torch::jit::Value *> &inputs) {
  const auto *attr_source = gather_nodes.back();
  const auto grouped_axis = attr_source->i(c10::Symbol::attr("axis")) + 1;
  const int64_t num_groups = gather_nodes.size();
  return createGroupedgather(graph, inputs, grouped_axis, num_groups);
}

// Splits the output of `grouped_node` back into one value per fused node.
//
// For the node at position g a slice [g, g + 1) along axis 0 is created and
// squeezed on axis 0; every output of the fused node is then replaced by the
// squeezed value. The fused nodes are finally passed to
// searchAndPossiblyDestroy.
void unpackGroupedOutputs(torch::jit::Graph *graph,
                          torch::jit::Node *grouped_node,
                          const torch::jit::node_list &fused_nodes) {
  std::unordered_set<torch::jit::Node *> replaced_nodes;

  int64_t group_id = 0;
  for (torch::jit::Node *original : fused_nodes) {
    torch::jit::Node *slice = createSlice(graph, {grouped_node->output()},
                                          {group_id + 1}, {group_id}, {0});
    torch::jit::Value *ungrouped =
        createSqueeze(graph, {slice->output()}, {0})->output();

    // Redirect users of the original node to the slice of the grouped result.
    for (torch::jit::Value *old_output : original->outputs()) {
      replaceOutputUse(old_output, ungrouped);
    }
    replaced_nodes.insert(original);
    ++group_id;
  }

  // The original nodes have been merged into the grouped node.
  searchAndPossiblyDestroy(replaced_nodes);
}

} // namespace

} // namespace poptorch


================================================
FILE: poptorch/source/GatherWithExpandedIndicesOptimization.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include <torch/csrc/jit/ir/ir.h>
#include <unordered_set>

#include "popart_canonicalization/PopartCanonicalizationUtils.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Logging.hpp"

#include "poptorch/OpBuilder.hpp"

namespace poptorch {

// Replaces the pattern
//     aten::expand / aten::expand_as (indices) -> aten::gather
// with
//     aten::squeeze (indices) -> aten::index_select
// when the expansion only broadcasts the indices along a single dimension.
//
// The rewrite is applied only if all of the following hold:
//   * the gather indices are produced by aten::expand or aten::expand_as;
//   * exactly one dimension is enlarged by the expansion;
//   * the source tensor has exactly two dimensions of length > 1, and these
//     are the gather dimension and the expanded dimension;
//   * the expanded indices cover the whole source tensor along the expanded
//     dimension (otherwise gather would return fewer elements than
//     index_select along that dimension).
//
// The replaced gather and expand nodes are handed to
// searchAndPossiblyDestroy at the end of the pass.
void simplifyGatherWithExpandedIndices(torch::jit::Graph *graph) {
  logging::LogContext const ctx{"GatherWithExpandedIndicesOptimisation"};

  std::unordered_set<torch::jit::Node *> to_delete;

  for (auto *node : graph->nodes()) {
    if (node->kind() != c10::aten::gather) {
      continue;
    }

    // aten::gather(Tensor self, int dim, Tensor index, *, bool
    //              sparse_grad=False) -> Tensor
    auto *input = node->input(0);
    const size_t gather_dim = handleDimensionParam(
        node->input(1), input->type()->expect<c10::TensorType>());
    auto *indices = node->input(2);
    auto *expand_node = indices->node();

    // Only remove index expansions.
    if (expand_node->kind() != c10::aten::expand &&
        expand_node->kind() != c10::aten::expand_as) {
      continue;
    }

    const WithNodeMetadata meta(node);
    // aten::expand(Tensor self, int[] size, *, bool implicit) -> Tensor
    // aten::expand_as(Tensor self, Tensor other) -> Tensor
    auto *original_indices = expand_node->input(0);
    auto original_indices_shape = shapeFromTensor(original_indices);

    // Get the (single) expanded dimension
    std::vector<int64_t> expand_shape{};
    if (expand_node->kind() == c10::aten::expand) {
      expand_shape = constantToLongVec(expand_node->input(1)->node());
    } else {
      expand_shape = shapeFromTensor(expand_node->input(1));
    }

    // An expansion can never reduce the rank; bail out on anything else
    // rather than index out of range below.
    if (original_indices_shape.size() > expand_shape.size()) {
      continue;
    }

    // Expanding may prepend new leading dimensions, so the original shape is
    // aligned with the trailing dimensions of the expanded shape. Leading
    // dimensions which do not exist in the original are treated as length 1.
    const size_t rank_offset =
        expand_shape.size() - original_indices_shape.size();

    std::vector<std::size_t> expand_dims{};
    for (size_t i = 0; i < expand_shape.size(); i++) {
      const int64_t original_size =
          i < rank_offset ? 1 : original_indices_shape[i - rank_offset];
      // A size of -1 ("keep this dimension") never compares greater.
      if (expand_shape[i] > original_size) {
        expand_dims.push_back(i);
      }
    }
    if (expand_dims.size() != 1) {
      continue;
    }
    const size_t expand_dim = expand_dims[0];

    // Only optimise if:
    // * source tensor's shape has 2 dimensions of length > 1
    // * dimension of gather, and dimension of expand are the 2 dimensions of
    //   length > 1
    const auto self_shape = shapeFromTensor(input);
    std::vector<std::size_t> non_singleton_dimensions{};
    for (size_t i = 0; i < self_shape.size(); i++) {
      if (self_shape[i] > 1) {
        non_singleton_dimensions.push_back(i);
      }
    }
    if (non_singleton_dimensions.size() != 2) {
      continue;
    }

    const auto ga_it = std::find(non_singleton_dimensions.begin(),
                                 non_singleton_dimensions.end(), gather_dim);
    const auto ex_it = std::find(non_singleton_dimensions.begin(),
                                 non_singleton_dimensions.end(), expand_dim);
    if (ga_it == ex_it || ga_it == non_singleton_dimensions.end() ||
        ex_it == non_singleton_dimensions.end()) {
      continue;
    }

    // index_select keeps the full extent of the source along every dimension
    // other than the selected one, whereas gather produces a tensor shaped
    // like the indices. The two only agree when the expanded indices span the
    // whole source along the expanded dimension. (expand_dim is a valid index
    // into self_shape because it was found in non_singleton_dimensions.)
    if (expand_shape[expand_dim] != self_shape[expand_dim]) {
      continue;
    }

    // Replace the aten::expand -> aten::gather with an
    // aten::squeeze -> aten::index_select
    logging::debug("Optimising gather: {}", nodeToString(node));
    std::vector<int64_t> squeezed_shape;
    std::copy_if(original_indices_shape.begin(), original_indices_shape.end(),
                 std::back_inserter(squeezed_shape),
                 [](auto dim) { return dim > 1; });

    torch::jit::WithInsertPoint const insert_point(node);

    torch::jit::Node *squeezed =
        createAndInsertNode(graph, c10::aten::squeeze, {original_indices},
                            ImplicitCast::None, OutputType::AsFirstInput);
    squeezed->output()->setType(
        original_indices->type()->expect<c10::TensorType>()->withSizes(
            squeezed_shape));
    torch::jit::Node *gathered =
        createAndInsertNode(graph, c10::aten::index_select,
                            {input, node->input(1), squeezed->output()},
                            ImplicitCast::None, OutputType::AsFirstInput);

    to_delete.insert(node);
    to_delete.insert(expand_node);

    if (node->hasUses()) {
      for (size_t i = 0; i < node->outputs().size(); ++i) {
        // As well as replacing the use, this will copy across shape/type
        // if not explicitly set.
        replaceOutputUse(node, gathered, i);
      }
    }
  }

  // Remove the dead nodes.
  searchAndPossiblyDestroy(to_delete);
}

} // namespace poptorch


================================================
FILE: poptorch/source/ImplicitCasting.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <ATen/ATen.h>
#include <ATen/native/TypeProperties.h>
#include <torch/csrc/jit/ir/ir.h>

#include <memory>
#include <vector>

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/ImplicitCasting.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"

#include "PoptorchSymbols.hpp"

namespace poptorch {

namespace {
// Tells whether the input at position `input_num` is exempt from implicit
// casting under the given policy. Raises an error for ImplicitCast::None.
bool skipInput(const ImplicitCast implicit_cast, const unsigned int input_num) {
  ERROR_ON(implicit_cast == ImplicitCast::None);

  const bool is_exempt_first =
      implicit_cast == ImplicitCast::ExceptFirst && input_num == 0;
  const bool is_exempt_second =
      implicit_cast == ImplicitCast::ExceptSecond && input_num == 1;
  const bool is_exempt_third =
      implicit_cast == ImplicitCast::ExceptThird && input_num == 2;
  const bool is_exempt_fourth_or_fifth =
      implicit_cast == ImplicitCast::ExceptFourthFifth &&
      (input_num == 3 || input_num == 4);

  return is_exempt_first || is_exempt_second || is_exempt_third ||
         is_exempt_fourth_or_fifth;
}

// Computes the scalar type the inputs should be promoted to, by feeding a
// placeholder tensor per input into PyTorch's result-type machinery.
//
// Inputs exempted by `implicit_cast` (see skipInput) and inputs of None type
// are ignored. Every remaining input must be a tensor with a known scalar
// type. When the concrete sizes of an input are not known the placeholder is
// created with an empty size list.
c10::ScalarType inferExpectedTypeDispatch(
    const torch::jit::ArrayRef<torch::jit::Value *> &inputs,
    const ImplicitCast implicit_cast) {
  at::native::ResultTypeState promotion_state = {};

  unsigned int next_position = 0;
  for (auto *value : inputs) {
    const unsigned int input_num = next_position++;
    logging::LogContext const ctx(std::string("processing input ") +
                                  std::to_string(input_num));

    if (skipInput(implicit_cast, input_num)) {
      continue;
    }
    if (value->type()->kind() == c10::TypeKind::NoneType) {
      continue;
    }

    auto tensor_type = value->type()->expect<c10::TensorType>();
    ERROR_ON(!tensor_type->scalarType());

    std::vector<int64_t> sizes;
    if (auto known_sizes = tensor_type->sizes().concrete_sizes()) {
      sizes = *known_sizes;
    }

    const auto placeholder = at::native::empty_cpu(
        c10::IntArrayRef(sizes.data(), sizes.size()),
        tensor_type->scalarType());
    promotion_state =
        at::native::update_result_type_state(placeholder, promotion_state);
  }

  return at::native::result_type(promotion_state);
}

// Returns true when `input` has a scalar type different from `expected_type`.
// None-typed inputs never need retyping. Raises an error if `input` is
// produced by a prim::Constant node.
bool needToRetype(const torch::jit::Value *input,
                  const c10::ScalarType expected_type) {
  const bool is_none = input->type()->kind() == c10::TypeKind::NoneType;
  if (is_none) {
    return false;
  }

  ERROR_ON(input->node()->kind() == at::prim::Constant);

  const auto tensor_type = input->type()->cast<c10::TensorType>();
  return tensor_type->scalarType() != expected_type;
}

// Creates a cast of `input` to `type` and returns the output of the cast. The
// output type is the type of `input` with its scalar type replaced by `type`.
torch::jit::Value *addCast(torch::jit::Value *input,
                           const c10::ScalarType type) {
  torch::jit::Node *producer = input->node();

  torch::jit::Value *cast_output =
      createCast(input->owningGraph(), input, type)->output();

  const auto source_type = input->type()->cast<c10::TensorType>();
  cast_output->setType(source_type->withScalarType(type));

  // NOTE(review): this replaces occurrences of `input` among the inputs of
  // the node which *produces* `input`, not among its consumers — confirm this
  // is intended.
  producer->replaceInputWith(input, cast_output);

  return cast_output;
}

} // namespace

// Returns the list of inputs to use after implicit type promotion: inputs
// whose scalar type differs from the promoted type are replaced by a cast to
// that type, all others (including those exempted by `implicit_cast`) are
// passed through unchanged.
std::vector<torch::jit::Value *>
implicitCastInputs(torch::jit::ArrayRef<torch::jit::Value *> *inputs,
                   const ImplicitCast implicit_cast) {
  // Mixed-precision type inference is delegated to PyTorch's own routines so
  // that the result always matches PyTorch's decisions.
  c10::ScalarType const expected_type =
      inferExpectedTypeDispatch(*inputs, implicit_cast);

  std::vector<torch::jit::Value *> new_inputs;
  new_inputs.reserve(inputs->size());

  unsigned int input_num = 0;
  for (auto *input : *inputs) {
    const bool cast_needed = !skipInput(implicit_cast, input_num) &&
                             needToRetype(input, expected_type);
    new_inputs.push_back(cast_needed ? addCast(input, expected_type) : input);
    ++input_num;
  }
  return new_inputs;
}

// Destroys popart::cast nodes whose output is unused, together with the
// tensor_constant feeding them when the cast was its only user.
//
// Such dead casts come from two situations:
//   - Torch dispatches a cast of a tensor: the tensor is expected to be used
//     elsewhere, so only the cast is deleted.
//   - Torch dispatches a cast of a wrapped number (a tensor_constant on our
//     side): both the cast and the constant are deleted.
void removeDeadImplicitCasts(torch::jit::Graph *graph) {
  std::vector<torch::jit::Node *> doomed;

  for (auto *node : graph->nodes()) {
    const bool is_dead_cast =
        node->kind() == symbols::popart::cast && !node->hasUses();
    if (!is_dead_cast) {
      continue;
    }
    doomed.push_back(node);

    auto *source = node->input();
    if (source->uses().size() != 1) {
      continue;
    }

    // The cast is the only user of its input, so the producer can go too if
    // it is a tensor_constant (i.e. a wrapped number).
    auto *producer = source->node();
    if (producer->kind() == symbols::poptorch::tensor_constant) {
      doomed.push_back(producer);
    }
  }

  // Casts are queued before the constants they read, so each constant has no
  // remaining user by the time it is destroyed.
  for (auto *node : doomed) {
    node->destroy();
  }
}

} // namespace poptorch


================================================
FILE: poptorch/source/InplaceOps.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/ir.h>

#include <algorithm>
#include <utility>

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/InplaceOps.hpp"
#include "poptorch/InplaceOpsPyTorch.hpp_nolint"
#include "poptorch/OpBuilder.hpp"

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "popart_canonicalization/PopartCanonicalizationUtils.hpp"

#include "poptorch/Utils.hpp"

#include "PoptorchSymbols.hpp"

namespace poptorch {

namespace {
namespace aten = c10::aten;
// Ops which only have an in-place version
const std::unordered_set<torch::jit::NodeKind> &onlyInplaceOps() {
  // static to make sure values are initialised
  static const std::unordered_set<torch::jit::NodeKind> only_inplace = {
      aten::copy_, aten::normal_, aten::uniform_, aten::random_,
      aten::exponential_};
  return only_inplace;
}

// Known view operations
const std::unordered_set<torch::jit::NodeKind> &viewOps() {
  // static to make sure values are initialised
  static const std::unordered_set<torch::jit::NodeKind> view_ops = {
      aten::chunk,    aten::detach,     aten::narrow,   aten::permute,
      aten::reshape,  aten::select,     aten::slice,    aten::split,
      aten::squeeze,  aten::transpose,  aten::unbind,   aten::unsqueeze,
      aten::view,     aten::as_strided, aten::diagonal, aten::movedim,
      aten::swapaxes, aten::swapdims,   aten::view_as,  aten::_unsafe_view};
  return view_ops;
}

// Counts the tensors returned by `graph`, using numTensorsForType on each
// output. An output produced by prim::ListConstruct is counted through the
// inputs of that node rather than through the list value itself.
size_t countNumTensorOutputs(torch::jit::Graph &graph) {
  size_t total = 0;

  for (const auto &output : graph.outputs()) {
    auto *producer = output->node();
    if (producer->kind() != c10::prim::ListConstruct) {
      total += numTensorsForType(output->type());
      continue;
    }

    for (const auto &element : producer->inputs()) {
      total += numTensorsForType(element->type());
    }
  }
  return total;
}

// When `node` is being replaced by `new_node` and `new_node` has fewer inputs
// than torch::jit::expectedInputCount lists for the kind of `node`, the
// missing inputs are filled in with None nodes inserted before `new_node`.
// Kinds not listed in expectedInputCount are left untouched.
// NOTE: Body mostly taken from torch (see torch::jit::RemoveInplaceOps), with
// the addition of metadata.
void addAdditionalInputsIfRequired(torch::jit::Graph *graph,
                                   const torch::jit::Node *node,
                                   torch::jit::Node *new_node) {
  int missing_inputs = 0;
  const auto expected = torch::jit::expectedInputCount.find(node->kind());
  if (expected != torch::jit::expectedInputCount.end()) {
    missing_inputs =
        expected->second - static_cast<int>(new_node->inputs().size());
  }

  const WithNodeMetadata meta(new_node);
  for (int added = 0; added < missing_inputs; ++added) {
    auto *padding = graph->createNone();
    // NOLINTNEXTLINE readability-suspicious-call-argument
    insertNodeBeforeNode(padding, new_node);
    new_node->addInput(padding->output());
  }
}

// Inserts, in front of `node`, a node of the kind returned by outplaceKind
// for `node`, with the same inputs (padded with None values where required).
// The new output takes over the type and all uses of the output of `node`.
// `node` itself is left in the graph for the caller to remove.
torch::jit::Node *outplaceOp(torch::jit::Graph &graph, torch::jit::Node *node) {
  torch::jit::WithInsertPoint const insert_point(node);
  const WithNodeMetadata meta(node);

  auto *replacement =
      createAndInsertNode(&graph, outplaceKind(node->kind()), node->inputs());
  addAdditionalInputsIfRequired(&graph, node, replacement);

  auto *old_output = node->output();
  auto *new_output = replacement->output();
  new_output->setType(old_output->type());
  old_output->replaceAllUsesWith(new_output);

  return replacement;
}

// Replaces every in-place op of `graph` by its out-of-place counterpart,
// except for the ops listed in onlyInplaceOps(), which are kept as they are.
void removeRemainingInplaceOps(torch::jit::Graph &graph) {
  std::vector<torch::jit::Node *> replaced;

  for (auto *node : graph.nodes()) {
    const bool can_be_outplaced = torch::jit::isInplaceOp(node) &&
                                  onlyInplaceOps().count(node->kind()) == 0;
    if (can_be_outplaced) {
      outplaceOp(graph, node);
      replaced.push_back(node);
    }
  }

  // Destroy the originals only once the iteration over the graph is finished.
  for (auto *node : replaced) {
    node->destroy();
  }
}

} // namespace

// Returns the out-of-place counterpart of `kind`:
//   - kinds listed in onlyInplaceOps() are returned unchanged;
//   - otherwise the mapping in torch::jit::inPlaceToOutOfPlace is used if it
//     has an entry for `kind`;
//   - otherwise a trailing '_' is stripped from the qualified name;
//   - kinds matching none of the above are returned unchanged.
torch::jit::NodeKind outplaceKind(torch::jit::NodeKind kind) {
  if (onlyInplaceOps().count(kind) != 0) {
    return kind;
  }

  const auto mapped = torch::jit::inPlaceToOutOfPlace.find(kind);
  if (mapped != torch::jit::inPlaceToOutOfPlace.end()) {
    return mapped->second;
  }

  std::string qualified_name = kind.toQualString();
  if (qualified_name.back() != '_') {
    return kind;
  }

  // Remove trailing '_' from the kind string
  qualified_name.pop_back();
  return c10::Symbol::fromQualString(qualified_name);
}

// Starts tracking `input`, initially registered as an alias of itself.
// Raises an error if the value is already tracked.
void InplaceInputsTracker::addTensor(torch::jit::Value *input) {
  logging::trace("Tracking tensor %{}", input->debugName());

  const auto insertion = _aliases.insert({input, input});
  ERROR_ON_MSG(!insertion.second, "Value already tracked");
}

// Follows `alias` back through view ops (taking the first input of each) and
// looks the resulting value up in the alias map. If it is found the entry is
// removed and the value it aliased is returned; otherwise nullptr is
// returned. Raises an error if `alias` is null.
torch::jit::Value *
InplaceInputsTracker::eraseCurrentAlias(torch::jit::Value *alias) {
  ERROR_ON(alias == nullptr);

  // Walk through the view ops until we reach a value not produced by one.
  while (true) {
    auto *producer = alias->node();
    if (viewOps().count(producer->kind()) == 0) {
      break;
    }
    alias = producer->input(0);
  }

  auto entry = _aliases.find(alias);
  if (entry == _aliases.end()) {
    return nullptr;
  }

  auto *real_input = entry->second;
  logging::trace("Deleted alias %{} for input %{}", entry->first->debugName(),
                 entry->second->debugName());
  // Remove current alias.
  _aliases.erase(entry);
  return real_input;
}

// Records `alias` as the current alias of `aliased_input`. Raises an error if
// `alias` is already present in the alias map.
void InplaceInputsTracker::registerAlias(torch::jit::Value *aliased_input,
                                         torch::jit::Value *alias) {
  logging::trace("Registering alias %{} for input %{}", alias->debugName(),
                 aliased_input->debugName());

  const bool inserted = _aliases.insert({alias, aliased_input}).second;
  ERROR_ON(!inserted);
}

// Finishes the handling of in-place modified graph inputs and returns the
// resulting InplaceGraphInfo.
//
// Steps performed, in order:
//   1. Each op recorded as an alias gets a "was_inplace_on_view" attribute.
//   2. For every collapsed graph input with an alias different from itself:
//        - parameters get a poptorch::update_param_inplace node placed after
//          the alias' node (an error is raised if
//          `replicas_needing_broadcast` is set);
//        - other inputs are mapped to the graph output returning the alias; a
//          new output is registered if the alias is not returned yet.
//   3. All remaining in-place ops which have an out-of-place version are
//      outplaced.
//
// `num_anchors` is added to both output counts stored in the returned info.
// Raises an error if a graph input has more than one alias, or if a collapsed
// graph input is not tracked.
InplaceGraphInfo
InplaceInputsTracker::finalizeGraph(torch::jit::Graph &graph,
                                    size_t num_anchors,
                                    bool replicas_needing_broadcast) {
  // For every alias (ie. target of an inplace op), look back and see if it's
  // applied through a bunch of views back to an input. if it is, mark it to be
  // handled later, at canonicalisation.
  for (const auto &[alias, aliased_input] : _aliases) {
    // Inputs that were never modified in place are their own alias.
    if (alias == aliased_input) {
      continue;
    }

    auto *inplace_op = alias->node();

    // Aliases are already traced back through views to graph inputs when
    // they're updated via `eraseCurrentAlias`, so can just check that the
    // ultimate input (`aliased_input`) is different to the inplace op's
    // immediate input.
    const bool was_inplace_on_view =
        !inplace_op->inputs().empty() && aliased_input != inplace_op->input(0);
    inplace_op->i_(c10::Symbol::attr("was_inplace_on_view"),
                   was_inplace_on_view ? 1 : 0);
  }

  // _aliases[alias] = graph_input -> we want the other way around.
  std::map<torch::jit::Value *, torch::jit::Value *> input_aliases;
  for (auto &p : _aliases) {
    ERROR_ON_MSG(!input_aliases.insert({p.second, p.first}).second,
                 "More than one alias for graph input %"
                     << p.second->debugName());
  }
  // Output counts are taken before any extra output is registered below.
  const size_t num_normal_tensor_outputs = countNumTensorOutputs(graph);
  InplaceGraphInfo out;
  out.num_normal_outputs = graph.outputs().size() + num_anchors;
  out.num_tensor_outputs = num_normal_tensor_outputs + num_anchors;

  const std::vector<torch::jit::Value *> collapsed_inputs =
      collapsedGraphInputHierachy(&graph);
  out.input_output_mapping.reserve(collapsed_inputs.size());
  for (const auto &graph_input : collapsed_inputs) {
    auto it = input_aliases.find(graph_input);
    ERROR_ON(it == input_aliases.end());
    size_t output_mapping = InplaceGraphInfo::no_mapping;
    if (it->first == it->second) {
      // no alias found
    } else {
      auto *alias = it->second;
      if (isParameter(graph_input)) {
        logging::trace("Alias for parameter %{} -> %{}", it->first->debugName(),
                       alias->debugName());
        // This is not supported with replicas needing broadcast
        ERROR_ON_MSG(
            replicas_needing_broadcast,
            "PopTorch does not support broadcasting buffers. If your "
            "model is able to tolerate buffers becoming out of sync "
            "between replicas, you can disable buffer broadcasting using "
            "poptorch.Options.broadcastBuffers(False).");

        // Write the alias back to the parameter right after the node which
        // produces the alias.
        const WithNodeMetadata meta(alias->node());
        auto *new_node =
            createAndInsertNode(&graph, symbols::poptorch::update_param_inplace,
                                {graph_input, alias});
        new_node->moveAfter(alias->node());
        new_node->output()->setType(alias->type());
      } else {
        logging::trace("Alias for input %{} -> %{}", it->first->debugName(),
                       alias->debugName());
        // Check if the alias is already being returned. The loop does not
        // stop at the first match, so the last matching output is used.
        for (size_t output = 0; output < graph.outputs().size(); output++) {
          if (graph.outputs()[output] == alias) {
            output_mapping = output;
          }
        }
        // If not, add a new output.
        if (output_mapping == InplaceGraphInfo::no_mapping) {
          output_mapping = graph.registerOutput(alias);

          // Ensure the overlap flag is set to no overlap (any models wanting
          // the additional efficiency of overlapped host IO should not use
          // inplace ops.)
          auto overlap_symbol =
              getOverlapSymbol("output", graph.outputs().size() - 1);
          graph.return_node()->s_(overlap_symbol, "no_overlap");
        }
      }
    }

    // The input/output mapping is only for 'true' inputs -- not parameters &
    // buffers (see its usage in PoplarExecutable::run).
    if (!isParameter(graph_input)) {
      out.input_output_mapping.push_back(output_mapping);
    }
  }

  // Outplace all the ops we can; the _aliases map no longer needs to be kept
  // up-to-date.
  removeRemainingInplaceOps(graph);

  return out;
}

// Rewires input 1 of every poptorch::end_for_loop node to the input of the
// poptorch::start_for_loop node which opened the loop.
//
// Raises an error if a loop is started while another one is still open, or if
// a loop end is encountered without a matching start.
void fixForLoopInputs(torch::jit::Graph &graph) {
  // Input of the currently open loop; null while no loop is open.
  torch::jit::Value *open_loop_input = nullptr;

  for (auto *node : graph.nodes()) {
    const auto kind = node->kind();

    if (kind == symbols::poptorch::start_for_loop) {
      ERROR_ON_MSG(open_loop_input != nullptr,
                   "[Internal] new poptorch::start_for_loop "
                   "encountered before previous poptorch::end_for_loop");
      open_loop_input = node->input();
      continue;
    }

    if (kind != symbols::poptorch::end_for_loop) {
      continue;
    }

    ERROR_ON_MSG(open_loop_input == nullptr,
                 "[Internal] poptorch::end_for_loop "
                 "encountered before poptorch::start_for_loop");
    node->replaceInput(1, open_loop_input);
    open_loop_input = nullptr;
  }
}

// Checks that the start_if_block / start_else_block / end_if_block markers of
// `graph` appear in a consistent order, and that there are as many of each by
// the end of the graph. An error with a descriptive message is raised
// otherwise.
void verifyIfElseBlocksOrder(const torch::jit::Graph &graph) {
  size_t num_if = 0;
  size_t num_else = 0;
  size_t num_end = 0;

  for (const auto *node : graph.nodes()) {
    const auto kind = node->kind();

    if (kind == symbols::poptorch::start_if_block) {
      ++num_if;
      continue;
    }

    if (kind == symbols::poptorch::start_else_block) {
      // An else block needs an if block which has no else yet.
      ERROR_ON_MSG(num_if <= num_else,
                   "[Internal] new poptorch::start_else_block "
                   "encountered before previous poptorch::start_if_block");
      ++num_else;
      continue;
    }

    if (kind == symbols::poptorch::end_if_block) {
      // An end marker needs an else block which has not been closed yet.
      ERROR_ON_MSG(num_if < num_else || num_else <= num_end,
                   "[Internal] poptorch::end_if_block "
                   "encountered before poptorch::start_if_block and "
                   "poptorch::start_else_block");
      ++num_end;
    }
  }

  ERROR_ON_MSG(num_if != num_else || num_else != num_end,
               "[Internal] no enclosing poptorch::end_if_block encountered");
}

} // namespace poptorch


================================================
FILE: poptorch/source/LowerToPopart.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "poptorch/LowerToPopart.hpp"

#include <experimental/filesystem>

#include <cstdlib>
#include <ctime>
#include <iostream>
#include <list>
#include <random>
#include <utility>

#include <spdlog/fmt/fmt.h>
#include <spdlog/fmt/ostr.h>

#include "PoptorchSymbols.hpp"
#include "popart_compiler/Compiler.hpp"
#include "popart_compiler/PopartEnums.hpp"

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/InplaceOps.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace fs = std::experimental::filesystem;

namespace poptorch {

namespace {

// Returns the file the model proto should be exported to, based on the
// POPTORCH_EXPORT_PROTO_FILE environment variable, or an empty string if the
// variable is not set.
//
// If the (absolute) path has an extension it is used as the file name;
// otherwise it is used as a directory and "/model.proto" is appended. The
// directory is created if needed.
std::string getModelProtoFilename() {
  const char *requested_path = std::getenv("POPTORCH_EXPORT_PROTO_FILE");
  if (requested_path == nullptr) {
    return "";
  }

  fs::path file = fs::absolute(requested_path);
  fs::path dir = file;
  if (dir.has_extension()) {
    dir.remove_filename();
  } else {
    file += "/model.proto";
  }
  fs::create_directories(dir);

  logging::info("POPTORCH_EXPORT_PROTO_FILE set: saving model prototype to {}",
                file);
  return file;
}

// Mapping between the SSA values of torch jit with the ssa values of popart.
// Each Value is either a single tensor, tuple or list (Note: nested tuples are
// stored flattened).
class ValueMap {
public:
  using TensorList = std::vector<popart_compiler::TensorId>;

  popart_compiler::TensorId tensor(torch::jit::Value *value) const;
  const TensorList &listTuple(torch::jit::Value *value) const;

  // Return the list of tensors without checking if it's a tuple, list or a
  // single tensor.
  const TensorList &tensors(torch::jit::Value *value) const;

  bool hasTensor(torch::jit::Value *value) const {
    return _map.count(value) == 1;
  }

  void setTensor(torch::jit::Value *value, popart_compiler::TensorId id);
  void setList(torch::jit::Value *value, const TensorList &tensors);
  void setTuple(torch::jit::Value *value, const TensorList &tensors);

private:
  struct Data {
    explicit Data(popart_compiler::TensorId id)
        : type(popart_compiler::OutputElemType::Tensor) {
      tensors.push_back(id);
    }

    Data(TensorList tuple, popart_compiler::OutputElemType type_)
        : type(type_), tensors(std::move(tuple)) {}
    popart_compiler::OutputElemType type;
    TensorList tensors;
  };
  std::unordered_map<torch::jit::Value *, Data> _map;
};

// Returns the tensor id registered for `value`. Raises an error if the value
// is unknown, was not registered as a single tensor, or does not hold exactly
// one tensor id.
popart_compiler::TensorId ValueMap::tensor(torch::jit::Value *value) const {
  const auto entry = _map.find(value);
  ERROR_ON_MSG(entry == _map.cend(), value->debugName()
                                         << " not found in ValueMap");

  const Data &data = entry->second;
  ERROR_ON_MSG(data.type != popart_compiler::OutputElemType::Tensor,
               value->debugName() << " is not a tensor");
  ERROR_ON(data.tensors.size() != 1);
  return data.tensors.front();
}

// Returns the tensor ids registered for `value`. Raises an error if the value
// is unknown or was registered as neither a tuple nor a list.
const ValueMap::TensorList &
ValueMap::listTuple(torch::jit::Value *value) const {
  const auto entry = _map.find(value);
  ERROR_ON_MSG(entry == _map.end(), value->debugName()
                                        << " not found in ValueMap");

  const Data &data = entry->second;
  const bool is_tuple = data.type == popart_compiler::OutputElemType::Tuple;
  const bool is_list = data.type == popart_compiler::OutputElemType::List;
  ERROR_ON_MSG(!(is_tuple || is_list),
               value->debugName() << " is not a tuple or list");
  return data.tensors;
}

// Returns the tensor ids registered for `value`, whatever kind of element it
// was registered as. Raises an error if the value is unknown.
const ValueMap::TensorList &ValueMap::tensors(torch::jit::Value *value) const {
  const auto entry = _map.find(value);
  const bool is_missing = entry == _map.end();
  ERROR_ON_MSG(is_missing, value->debugName() << " not found in ValueMap");
  return entry->second.tensors;
}

// Registers `id` as the single tensor for `value`. Raises an error if the
// value is already in the map.
void ValueMap::setTensor(torch::jit::Value *value,
                         popart_compiler::TensorId id) {
  const bool inserted = _map.emplace(value, Data(id)).second;
  ERROR_ON_MSG(!inserted,
               "Value " << value->debugName() << " already present in the map");
}

// Registers `tensors` as a list for `value`. Raises an error if the value is
// already in the map.
void ValueMap::setList(torch::jit::Value *value,
                       const ValueMap::TensorList &tensors) {
  Data entry(tensors, popart_compiler::OutputElemType::List);
  const bool inserted = _map.emplace(value, std::move(entry)).second;
  ERROR_ON_MSG(!inserted,
               "Value " << value->debugName() << " already present in the map");
}

// Registers `tensors` as a tuple for `value`. Raises an error if the value is
// already in the map.
void ValueMap::setTuple(torch::jit::Value *value,
                        const ValueMap::TensorList &tensors) {
  Data entry(tensors, popart_compiler::OutputElemType::Tuple);
  const bool inserted = _map.emplace(value, std::move(entry)).second;
  ERROR_ON_MSG(!inserted,
               "Value " << value->debugName() << " already present in the map");
}

/*
 * Static helper functions.
 */
// Map a PyTorch scalar type to the name of the PopART type used to represent
// it. Double shares "FLOAT" with Float and Long shares "INT32" with Int.
// Any other type is logged as an error and reported as "UNIMPLEMENTED".
const char *typeToPopartCStr(const at::ScalarType type) {
  switch (type) {
  case at::ScalarType::Byte:
    return "UINT8";
  case at::ScalarType::Char:
    return "INT8";
  case at::ScalarType::Short:
    return "INT16";
  case at::ScalarType::Int:
  case at::ScalarType::Long:
    return "INT32";
  case at::ScalarType::Bool:
    return "BOOL";
  case at::ScalarType::Half:
    return "FLOAT16";
  case at::ScalarType::Float:
  case at::ScalarType::Double:
    return "FLOAT";
  default:
    logging::err("Unimplemented type '{}'", type);
    break;
  }

  return "UNIMPLEMENTED";
}

// Copy the sizes of the tensor into a standalone vector.
std::vector<int64_t> getTensorDimensions(const at::Tensor &tensor) {
  std::vector<int64_t> dims;
  const auto &sizes = tensor.sizes();
  dims.assign(sizes.cbegin(), sizes.cend());
  return dims;
}

// Convert a PopART element type to the PyTorch scalar type used to hold it.
// The unsigned 16 and 32 bit types map to their signed PyTorch counterparts.
// float8 types and any type not listed below are reported as errors.
at::ScalarType fromPopartType(const popart_compiler::PopartType type) {
  using PT = popart_compiler::PopartType;

  switch (type) {
  case PT::UINT8:
    return at::ScalarType::Byte;
  case PT::INT8:
    return at::ScalarType::Char;
  case PT::INT16:
  case PT::UINT16:
    return at::ScalarType::Short;
  case PT::INT32:
  case PT::UINT32:
    return at::ScalarType::Int;
  case PT::INT64:
    return at::ScalarType::Long;
  case PT::BOOL:
    return at::ScalarType::Bool;
  case PT::FLOAT:
    return at::ScalarType::Float;
  case PT::FLOAT16:
    return at::ScalarType::Half;
  case PT::BFLOAT16:
    return at::ScalarType::BFloat16;
  case PT::DOUBLE:
    return at::ScalarType::Double;
  case PT::COMPLEX64:
    return at::ScalarType::ComplexFloat;
  case PT::COMPLEX128:
    return at::ScalarType::ComplexDouble;
  case PT::FLOAT8_143:
  case PT::FLOAT8_152:
    ERROR("Can't convert a float8 PopART type to a PyTorch scalar type");
  default:
    ERROR("Unsupported PopART data type " << toPopartTypeStr(type));
  }
}

// Convert a PyTorch scalar type to the PopART element type of the same width
// and kind. Scalar types without a case below are reported as errors.
popart_compiler::PopartType toPopartType(const at::ScalarType type) {
  using PT = popart_compiler::PopartType;

  switch (type) {
  case at::ScalarType::Byte:
    return PT::UINT8;
  case at::ScalarType::Char:
    return PT::INT8;
  case at::ScalarType::Short:
    return PT::INT16;
  case at::ScalarType::Int:
    return PT::INT32;
  case at::ScalarType::Long:
    return PT::INT64;
  case at::ScalarType::Bool:
    return PT::BOOL;
  case at::ScalarType::Float:
    return PT::FLOAT;
  case at::ScalarType::Half:
    return PT::FLOAT16;
  case at::ScalarType::BFloat16:
    return PT::BFLOAT16;
  case at::ScalarType::Double:
    return PT::DOUBLE;
  case at::ScalarType::ComplexFloat:
    return PT::COMPLEX64;
  case at::ScalarType::ComplexDouble:
    return PT::COMPLEX128;
  default:
    ERROR("Unsupported PyTorch scalar type " << toString(type));
  }
}

// Append the PopART element type and the static shape of a tensor-typed JIT
// value to `types` and `shapes` respectively.
//
// The value's type must be a TensorType with a known scalar type, a known rank
// and known sizes for every dimension: anything less is reported as an error
// instead of dereferencing an empty optional. All checks are done before the
// output vectors are touched so that they stay in sync if an error is raised.
void platformAgnosticTypeInfoFromIRType(
    torch::jit::Value *value, std::vector<popart_compiler::PopartType> *types,
    std::vector<std::vector<std::size_t>> *shapes) {
  const std::shared_ptr<c10::TensorType> tensor_type =
      value->type()->expect<c10::TensorType>();

  const auto scalar_type = tensor_type->scalarType();
  ERROR_ON_MSG(!scalar_type.has_value(),
               "No scalar type available for value %" << value->debugName());

  c10::VaryingShape const shape = tensor_type->sizes();
  const auto rank = shape.size();
  ERROR_ON_MSG(!rank.has_value(),
               "No rank available for value %" << value->debugName());

  std::vector<std::size_t> dims;
  dims.reserve(*rank);
  for (std::size_t i = 0; i < *rank; ++i) {
    const auto dim = shape[i];
    ERROR_ON_MSG(!dim.has_value(), "Dimension " << i << " of value %"
                                                << value->debugName()
                                                << " is not known");
    dims.push_back(*dim);
  }

  types->emplace_back(toPopartType(*scalar_type));
  shapes->emplace_back(std::move(dims));
}

} // namespace

namespace detail {
/*
 * Implementation of the lowering operation.
 */
class LowerToPopartImpl {
public:
  LowerToPopartImpl(torch::jit::Graph *g, InplaceGraphInfo &&inplace_info,
                    bool training,
                    std::vector<popart_compiler::Optimizer> &&opt,
                    const popart_compiler::SessionOptions &options,
                    const AttributeAccessor &attribute_accessor,
                    CPUCallbackMap &&callback, const AnchorList &&anchors);
  // Lower the parameters, the body and the outputs of the graph (in that
  // order) and mark the graph as lowered.
  void lower();

  // Initialise the session, compile and prepare the device, then wrap the
  // result in a PoplarExecutable. lower() must have been called first.
  std::shared_ptr<PoplarExecutable> compile();
  // Same as compile() but the executable is loaded from input_filename
  // instead of being compiled.
  std::shared_ptr<PoplarExecutable>
  loadExecutableFromFile(const std::string &input_filename);

private:
  // Log (at debug level) the PopART tensors the node was lowered to.
  void printWasLoweredDebug(const torch::jit::Node *node,
                            popart_compiler::TensorId first_output_tensor);
  // The JIT graph being lowered.
  torch::jit::Graph &_graph;

  // Set to true at the end of lower(); checked by compile().
  bool _lowered;

  std::vector<at::Tensor> _parameters;
  // Names of the parameters added by lowerParameters(); handed over to the
  // PoplarExecutable.
  std::vector<std::string> _parameter_names;
  InplaceGraphInfo _inplace_info;

  // PopART IDs handed over to the PoplarExecutable as its inputs.
  std::vector<popart_compiler::TensorId> _input_tensor_hooks;

  // PopART IDs of the graph outputs followed by the anchors, in the order they
  // were added by lowerReturn().
  std::vector<popart_compiler::TensorId> _output_tensor_hooks;

  // Mapping from JIT values to the PopART tensor(s) they were lowered to.
  ValueMap _value_map;

  // Optimizer from the user.
  const std::vector<popart_compiler::Optimizer> _optimizers;

  // Tensors to be anchored other than outputs
  // NOTE(review): this is a reference member while the constructor takes the
  // list as `const AnchorList &&` - the caller presumably has to keep the list
  // alive until lower() has run; confirm against the constructor and callers.
  const AnchorList &_anchors;

  // Handlers used by lowerBody(): each one takes the PopART IDs of the node's
  // inputs and the node itself and returns the ID of the first output tensor.
  using FunctionType = std::function<popart_compiler::TensorId(
      const std::vector<popart_compiler::TensorId> &inputs,
      torch::jit::Node *)>;
  std::unordered_map<c10::Symbol, FunctionType> _functionToImplementation;

  popart_compiler::Compiler _compiler;

  // Host callbacks, looked up by their string ID when lowering
  // poptorch::canonicalised_cpu_call nodes.
  CPUCallbackMap _callbacks;

  // The three stages of lower(), run in this order.
  void lowerParameters();

  void lowerBody();

  void lowerReturn();

  // Comma separated names of num_tensors tensors with contiguous IDs starting
  // at first_tensor / of the tensors in the list.
  std::string tensorNames(std::int64_t first_tensor, std::int64_t num_tensors);
  std::string tensorNames(const ValueMap::TensorList &tensors);

  // Comma separated "<dtype>(<dims>)" descriptions of num_tensors tensors with
  // contiguous IDs starting at first_tensor / of the tensors in the list.
  std::string tensorTypesAndShapes(std::int64_t first_tensor,
                                   std::int64_t num_tensors);
  std::string tensorTypesAndShapes(const ValueMap::TensorList &tensors);

  // Raise an error if the type (and the shape, when PopART could infer it) of
  // output_tensor doesn't match the node_output-th output of the node.
  void validateOutputShapeAndType(popart_compiler::TensorId output_tensor,
                                  torch::jit::Node *node,
                                  std::uint64_t node_output);
};

namespace {
// Compact vec in place, keeping the relative order of the surviving elements:
// - the first ignore_first elements are always kept,
// - any other element vec[i] is kept if and only if
//   mask[i - ignore_first] is true.
//
// Throws std::out_of_range (from mask.at()) if the mask is too short to cover
// every element after the first ignore_first ones.
//
// The element index is tracked explicitly rather than recovered from the
// address of the element handed to a std::remove_if predicate: this doesn't
// rely on the algorithm passing the in-place element and also works with
// containers returning proxy references such as std::vector<bool>.
template <typename T>
void maskVector(std::vector<T> *vec, const std::vector<bool> &mask,
                size_t ignore_first = 0) {
  const std::size_t count = vec->size();
  // Everything before `write` is kept.
  std::size_t write = std::min<std::size_t>(ignore_first, count);

  for (std::size_t read = write; read < count; ++read) {
    if (!mask.at(read - ignore_first)) {
      continue;
    }
    // Avoid self move-assignment when nothing has been removed yet.
    if (write != read) {
      (*vec)[write] = std::move((*vec)[read]);
    }
    ++write;
  }

  vec->erase(vec->begin() + static_cast<std::ptrdiff_t>(write), vec->end());
}
} // namespace

/*
 * Lower to popart impl.
 */
std::shared_ptr<PoplarExecutable> LowerToPopartImpl::compile() {
  ERROR_ON_MSG(!_lowered, "You need to lower() the graph first");

  logging::LogContext const ctx("LowerToPopart::compile");
  // Init the session, this also involves compiling to poplar.
  _compiler.initSession(_optimizers, getModelProtoFilename().c_str());

  _compiler.compileAndPrepareDevice();

  std::vector<at::ScalarType> data_types;
  data_types.reserve(_output_tensor_hooks.size());
  for (const auto id : _output_tensor_hooks) {
    data_types.emplace_back(fromPopartType(_compiler.getPopartType(id)));
  }

  return std::make_shared<PoplarExecutable>(
      std::move(_compiler), std::move(_input_tensor_hooks),
      std::move(_output_tensor_hooks), std::move(data_types), _parameter_names,
      std::move(_inplace_info));
}

std::shared_ptr<PoplarExecutable>
LowerToPopartImpl::loadExecutableFromFile(const std::string &input_filename) {
  logging::LogContext const ctx("LowerToPopart::loadExecutableFromFile");
  // Init the session, this also involves compiling to poplar.
  _compiler.initSession(_optimizers, getModelProtoFilename().c_str());
  _compiler.loadExecutableAndPrepareDevice(input_filename.c_str());

  std::vector<at::ScalarType> data_types;
  data_types.reserve(_output_tensor_hooks.size());
  for (const auto id : _output_tensor_hooks) {
    data_types.emplace_back(fromPopartType(_compiler.getPopartType(id)));
  }

  return std::make_shared<PoplarExecutable>(
      std::move(_compiler), std::move(_input_tensor_hooks),
      std::move(_output_tensor_hooks), std::move(data_types), _parameter_names,
      std::move(_inplace_info));
}

// Lower the whole JIT graph to PopART: parameters/inputs first, then the body
// and finally the outputs. The debug messages emitted by the three stages are
// wrapped between an opening and a closing brace in the log.
void LowerToPopartImpl::lower() {
  logging::debug("Graph lowered to PopART {");
  // Lower the tensor parameters of the _graph to OpInputs.
  lowerParameters();

  // Lower the body of the _graph.
  lowerBody();

  // Lower the outputs of the _graph (and the anchors).
  lowerReturn();

  logging::debug("}");
  // compile() refuses to run until this flag is set.
  _lowered = true;
}

// Log (at debug level) the names, types and shapes of the PopART tensors a
// node was lowered to, along with the compiler's current execution info.
// The node's outputs are assumed to have contiguous IDs starting at
// first_output_tensor.
void LowerToPopartImpl::printWasLoweredDebug(
    const torch::jit::Node *node,
    popart_compiler::TensorId first_output_tensor) {
  const auto num_outputs = node->outputs().size();
  const std::string names = tensorNames(first_output_tensor, num_outputs);
  const std::string types_and_shapes =
      tensorTypesAndShapes(first_output_tensor, num_outputs);
  auto execution_info = _compiler.getExecutionInfo();

  logging::debug("{} was lowered to {} [{},{}]", nodeToString(node), names,
                 types_and_shapes, execution_info.get());
}

// Lower the outputs of the graph and the user requested anchors.
//
// For every graph output the flattened structure of its type is registered
// with the compiler (addOutputType) and each of its tensors is added as a
// PopART output (addOutputTensor). The anchors are then added as extra tensor
// outputs using the output mode and return period requested for each of them.
// Every tensor added is also recorded, in order, in _output_tensor_hooks.
void LowerToPopartImpl::lowerReturn() {
  // Used to encode the number of (actual) outputs
  _compiler.addOutputType(
      {popart_compiler::OutputElemType::Tuple,
       static_cast<std::int64_t>(_inplace_info.num_normal_outputs)});

  // Recursively go through the output's type to flatten its structure and
  // add it to the compiler.
  // In this representation, (T0, T1, (T2, T3), T4) would be
  // [ Tuple3, Tensor, Tensor, Tuple2, Tensor, Tensor, Tensor]

  // Only lower the outputs not used for tensors modified inplace.
  std::function<void(c10::TypePtr)> process_type;
  process_type = [this, &process_type](const c10::TypePtr &type) {
    switch (type->kind()) {
    case c10::TypeKind::TensorType: {
      _compiler.addOutputType({popart_compiler::OutputElemType::Tensor});
      break;
    }
    case c10::TypeKind::TupleType: {
      const auto tuple_type = type->expect<c10::TupleType>();
      _compiler.addOutputType(
          {popart_compiler::OutputElemType::Tuple,
           static_cast<std::int64_t>(tuple_type->elements().size())});
      for (const auto &elt_type : tuple_type->elements()) {
        process_type(elt_type);
      }
      break;
    }
    case c10::TypeKind::ListType: {
      // Use our custom type to find the number of tensors (lists can only be
      // tensors as enforced by torch JIT)

      // type->expect is static and always succeeds
      const auto list_type = type->cast<ListTypeWithNumElements>();
      ERROR_ON(!list_type);

      _compiler.addOutputType(
          {popart_compiler::OutputElemType::List,
           static_cast<std::int64_t>(list_type->numElements())});

      for (size_t i = 0; i < list_type->numElements(); i++) {
        _compiler.addOutputType({popart_compiler::OutputElemType::Tensor});
      }
      break;
    }
    default:
      ERROR("Unsupported output type '" << c10::typeKindToString(type->kind()));
    }
  };
  logging::debug("  return (");
  for (torch::jit::Value *value : _graph.outputs()) {
    // The value map isn't modified in this loop so it's safe to keep a
    // reference rather than copy the list.
    const auto &tensors = _value_map.tensors(value);
    const auto msg = fmt::format("    output: %{} : {} ->", value->debugName(),
                                 *value->type());

    logging::debug("{} {} [{}]", msg, tensorNames(tensors),
                   tensorTypesAndShapes(tensors));
    if (value->type()->kind() == c10::TypeKind::ListType) {
      // Top level lists: the number of elements comes from the value map
      // rather than from the type.
      c10::TypeKind const elt_kind =
          value->type()->expect<c10::ListType>()->getElementType()->kind();
      ERROR_ON_MSG(elt_kind != c10::TypeKind::TensorType,
                   "Unsupported list type " << c10::typeKindToString(elt_kind));
      const std::int64_t num_tensors =
          static_cast<std::int64_t>(tensors.size());
      _compiler.addOutputType(
          {popart_compiler::OutputElemType::List, num_tensors});
      logging::trace("List with num tensors: {}", num_tensors);
      for (std::int64_t i = 0; i < num_tensors; ++i) {
        _compiler.addOutputType({popart_compiler::OutputElemType::Tensor});
      }
    } else {
      process_type(value->type());
    }

    // NOTE(review): the index used to look up the overlap attribute restarts
    // at 0 for every graph output - confirm this matches the way the
    // attributes are set on the return node.
    uint64_t output_num = 0;
    for (const auto id : tensors) {
      const auto overlap_symbol = getOverlapSymbol("output", output_num);
      ERROR_ON(!_graph.return_node()->hasAttribute(overlap_symbol));
      const auto overlap_str = _graph.return_node()->s(overlap_symbol);

      _compiler.addOutputTensor(id, popart_compiler::PopartOutputMode::N, 1,
                                overlap_str.c_str());
      _output_tensor_hooks.push_back(id);
      output_num++;
    }
  }
  logging::debug("  )");

  for (const auto &anchor : _anchors) {
    const char *name = anchor.name.c_str();
    popart_compiler::PopartOutputMode const output_mode =
        static_cast<popart_compiler::PopartOutputMode>(anchor.mode);
    const size_t return_period = anchor.period;

    logging::debug("  anchor ( {} {}/{} )", name,
                   outputModeToString(output_mode), return_period);

    const auto id = _compiler.createTensorId(name);
    _compiler.addOutputType({popart_compiler::OutputElemType::Tensor});
    // Forward the mode and period requested for the anchor: without them the
    // compiler would silently fall back to its default output mode / period.
    _compiler.addOutputTensor(id, output_mode, return_period);
    _output_tensor_hooks.push_back(id);
  }
}

std::string LowerToPopartImpl::tensorNames(std::int64_t first_tensor,
                                           std::int64_t num_tensors) {
  ValueMap::TensorList tensors;
  tensors.reserve(num_tensors);
  for (int i = 0; i < num_tensors; i++) {
    tensors.push_back(first_tensor + i);
  }
  return tensorNames(tensors);
}
// Comma separated names (as reported by the compiler) of the given tensors.
std::string
LowerToPopartImpl::tensorNames(const ValueMap::TensorList &tensors) {
  std::string joined;
  bool first = true;
  for (const auto tensor : tensors) {
    if (!first) {
      joined += ", ";
    }
    first = false;
    joined += _compiler.tensorName(tensor);
  }
  return joined;
}

std::string LowerToPopartImpl::tensorTypesAndShapes(std::int64_t first_tensor,
                                                    std::int64_t num_tensors) {
  ValueMap::TensorList tensors;
  tensors.reserve(num_tensors);
  for (int i = 0; i < num_tensors; i++) {
    tensors.push_back(first_tensor + i);
  }
  return tensorTypesAndShapes(tensors);
}

// Comma separated "<dtype>(<dim>, <dim>, ...)" descriptions of the given
// tensors. When the compiler reports an invalid size, or raises a
// logging::Error while being queried, "(shape inference failed)" is printed
// in place of the dimensions.
std::string
LowerToPopartImpl::tensorTypesAndShapes(const ValueMap::TensorList &tensors) {
  const char *shape_inf_failed = "(shape inference failed)";

  std::string joined;
  bool first_tensor = true;

  for (const auto tensor : tensors) {
    std::ostringstream description;

    try {
      const auto dims = _compiler.getSize(tensor);

      const auto dtype_chars = _compiler.getTensorDTypeString(tensor);
      description << dtype_chars.get();

      if (dims == popart_compiler::Compiler::invalid_size) {
        description << shape_inf_failed;
      } else {
        description << "(";
        bool first_dim = true;
        for (const auto dim : dims) {
          if (!first_dim) {
            description << ", ";
          }
          first_dim = false;
          description << dim;
        }
        description << ")";
      }
    } catch (const logging::Error &) {
      description << shape_inf_failed;
    }

    if (!first_tensor) {
      joined += ", ";
    }
    first_tensor = false;
    joined += description.str();
  }
  return joined;
}

// Check that the PopART tensor produced for the node_output-th output of the
// node has the scalar type expected by the JIT graph and, when PopART's shape
// inference produced a valid size, the expected dimensions too.
// A mismatch is reported as an error.
void LowerToPopartImpl::validateOutputShapeAndType(
    popart_compiler::TensorId output_tensor, torch::jit::Node *node,
    std::uint64_t node_output) {
  const JitTensorInfo expected(node->output(node_output));

  at::ScalarType const actual_type =
      fromPopartType(_compiler.getPopartType(output_tensor));
  const auto actual_dims = _compiler.getSize(output_tensor);

  const bool type_matches = (actual_type == expected.scalar_type);
  // Only validate shape if PopART's shape inference worked.
  const bool matches =
      type_matches &&
      (actual_dims == popart_compiler::Compiler::invalid_size ||
       actual_dims == expected.dims);

  ERROR_ON_MSG(!matches, "Output[" << node_output << "] mismatch: "
                                   << nodeToString(node) << " -> PopART "
                                   << tensorTypesAndShapes(output_tensor, 1));
}
// Lower the main body of the _graph.
//
// Nodes are visited in graph order. For each node the Python source location
// is forwarded to the compiler, then the node is dispatched on its kind:
// - kinds registered in _functionToImplementation go through their handler,
// - the poptorch:: / prim:: kinds listed below are handled inline,
// - anything else is reported as an error.
// Every JIT value produced by a node is registered in _value_map so that later
// nodes can look up the PopART tensor(s) backing their inputs.
void LowerToPopartImpl::lowerBody() {
  logging::LogContext const ctx_func("LowerToPopartImpl::lowerBody");

  for (torch::jit::Node *node : _graph.nodes()) {
    logging::LogContext const ctx("processing " + nodeToString(node));
    // Switch/lookup based on the actual int value.
    const c10::Symbol kind = node->kind();
    // When using the dispatcher metadata should always be set.
    const std::string meta =
        node->sourceRange().source()
            ? node->sourceRange().source()->text_str().str()
            : std::string{};

    ERROR_ON_MSG(meta.empty(),
                 "Source code location missing for node " + nodeToString(node));
    // Note: filename and line number might still not be available (For example
    // if the filter set by the user excludes the entire stack).
    const auto file_line_col = node->sourceRange().file_line_col();
    std::uint64_t line = 0;
    std::uint64_t col = 0;
    std::string filename;
    if (file_line_col) {
      std::tie(filename, line, col) = *file_line_col;
    }
    _compiler.setCurrentPythonCodeLocation(meta.c_str(), filename.c_str(), line,
                                           col);

    const auto itr = _functionToImplementation.find(kind);
    if (itr != _functionToImplementation.cend()) {
      // Get the torch jit SSA for the input/output values.
      std::vector<popart_compiler::TensorId> inputs;
      std::transform(node->inputs().begin(), node->inputs().end(),
                     std::back_inserter(inputs), [&](torch::jit::Value *val) {
                       // Tuples aren't supported here but it's ok because
                       // we don't support any operations which actually take in
                       // tuples.
                       return _value_map.tensor(val);
                     });

      // Call the callback
      popart_compiler::TensorId const first_output_tensor =
          itr->second(inputs, node);

      // The callback only returns the ID of the first tensor, but we know
      // the generated tensors have contiguous IDs, so we can infer the other
      // IDs.
      for (std::uint64_t i = 0; i < node->outputs().size(); ++i) {
        torch::jit::Value *output = node->output(i);
        popart_compiler::TensorId const output_tensor = first_output_tensor + i;
        ERROR_ON_MSG(!_compiler.tensorIdIsValid(output_tensor),
                     "Output " << i << " doesn't exist of Node " << *node);
        // TODO(T66614): JIT graph doesn't have any shape inference so we can't
        // validate the shapes. Revisit once we've migrated to MLIR.
        // validateOutputShapeAndType(output_tensor, node, i);
        _value_map.setTensor(output, output_tensor);
      }

      // Host side constants are not reported in the debug log.
      if (!_compiler.isHostSideConstant(first_output_tensor)) {
        printWasLoweredDebug(node, first_output_tensor);
      }
    } else if (kind == symbols::poptorch::end_ipu_block) {
      _compiler.clearActiveIpu();
    } else if (kind == symbols::poptorch::start_for_loop) {
      _compiler.startSubgraph();
      logging::debug("{} was lowered", nodeToString(node));
    } else if (kind == symbols::poptorch::end_for_loop) {
      const std::vector<popart_compiler::TensorId> inputs =
          _value_map.tensors(node->input(0));

      // Popart needs to know the number of outputs even though it's in the
      // graph.
      const std::size_t num_outputs = node->i(c10::Symbol::attr("num_outputs"));

      const std::int32_t trip_count =
          static_cast<std::int32_t>(node->i(c10::Symbol::attr("trip_count")));

      // Call the callback. This will pop the subgraphs from the stack.
      const popart_compiler::TensorId first_output_tensor =
          _compiler.endForLoop(trip_count, num_outputs, inputs);

      // The callback only returns the ID of the first tensor, but we know
      // the generated tensors have contiguous IDs, so we can infer the other
      // IDs.
      std::vector<popart_compiler::TensorId> outs(num_outputs);
      for (std::uint64_t i = 0; i < num_outputs; ++i) {
        outs[i] = first_output_tensor + i;
      }

      // The loop outputs are exposed as a single tuple value.
      _value_map.setTuple(node->output(), outs);
      printWasLoweredDebug(node, first_output_tensor);

    } else if (kind == symbols::poptorch::start_if_block) {
      // Starting the if block means changing the internal builder state to work
      // with a new subgraph.
      _compiler.startIfBlock();
      logging::debug("{} was lowered", nodeToString(node));
    } else if (kind == symbols::poptorch::start_else_block) {
      // Starting the else block means changing the internal builder state to
      // work with a new subgraph.
      _compiler.startElseBlock();
      logging::debug("{} was lowered", nodeToString(node));
    } else if (kind == symbols::poptorch::end_if_block) {
      // Process the if condition.
      const auto &inputs = _value_map.tensors(node->input(0));
      const auto &condition = inputs[0];
      // Popart needs to know the number of outputs even though it's in the
      // graph.
      const std::size_t num_outputs =
          node->i(c10::Symbol::fromQualString("attr::num_outputs"));

      // Call the callback. This will pop the subgraphs from the stack.
      const popart_compiler::TensorId first_output_tensor =
          _compiler.endIfBlock(condition, num_outputs);

      // The callback only returns the ID of the first tensor, but we know
      // the generated tensors have contiguous IDs, so we can infer the other
      // IDs.
      std::vector<popart_compiler::TensorId> outs(num_outputs);
      std::iota(outs.begin(), outs.end(), first_output_tensor);

      // The if/else outputs are exposed as a single tuple value.
      _value_map.setTuple(node->output(), outs);
      printWasLoweredDebug(node, first_output_tensor);
    } else if (kind == symbols::poptorch::add_untyped_input_tensor) {
      popart_compiler::TensorId const out = _compiler.addUntypedInputTensor();
      _value_map.setTensor(node->output(), out);
      printWasLoweredDebug(node, out);
    } else if (kind == symbols::poptorch::begin_ipu_block) {
      _compiler.setActiveIpu(node->i(c10::Symbol::attr("stage")),
                             node->i(c10::Symbol::attr("phase")),
                             node->i(c10::Symbol::attr("ipu")));
    } else if (kind == symbols::poptorch::push_name_scope) {
      _compiler.pushNameScope(node->s(c10::Symbol::attr("name")).c_str());
    } else if (kind == symbols::poptorch::pop_name_scope) {
      _compiler.popNameScope();
    } else if (kind == symbols::poptorch::set_matmul_serialization) {
      popart_compiler::TensorId const input = _value_map.tensor(node->input());
      _compiler.setMatMulSerialization(
          input, node->s(c10::Symbol::attr("mode")).c_str(),
          node->i(c10::Symbol::attr("factor")),
          node->i(c10::Symbol::attr("keep_precision")));
      // No new tensor is created: the output is an alias of the input.
      _value_map.setTensor(node->output(), input);
    } else if (kind == symbols::poptorch::optimizer_group) {
      std::vector<popart_compiler::TensorId> inputs;
      std::transform(node->inputs().begin(), node->inputs().end(),
                     std::back_inserter(inputs), [&](torch::jit::Value *val) {
                       return _value_map.tensor(val);
                     });

      std::uint64_t const group = node->i(c10::Symbol::attr("group"));
      _compiler.optimizerGroup(inputs, group);

    } else if (kind == symbols::poptorch::set_available_memory) {
      // Get the torch jit SSA for the input/output values.
      // For each input, collect the tensors of all the outputs of the node
      // which produced it.
      std::vector<std::set<popart_compiler::TensorId>> inputs;
      for (auto *input : node->inputs()) {
        inputs.emplace_back();
        auto outputs = input->node()->outputs();
        std::transform(
            std::begin(outputs), std::end(outputs),
            std::inserter(inputs.back(), std::begin(inputs.back())),
            [&](torch::jit::Value *val) { return _value_map.tensor(val); });
      }

      _compiler.setAvailableMemoryProportion(
          inputs, node->f(c10::Symbol::attr("availableMemoryProportion")));

      // No new tensors are created: each output is an alias of the matching
      // input.
      for (std::uint64_t i = 0; i < node->outputs().size(); ++i) {
        _value_map.setTensor(node->output(i),
                             _value_map.tensor(node->input(i)));
      }

    } else if (kind == c10::prim::Constant) {
      ERROR_ON_MSG(node->hasAttribute(c10::attr::value),
                   "Only None constants should be left in the graph after the "
                   "CanonicaliseConstants pass");
      _value_map.setTensor(node->output(), popart_compiler::NoneTensor);
    } else if (kind == c10::prim::TupleConstruct ||
               kind == c10::prim::ListConstruct) {
      // Get the torch jit SSA for the input/output values.
      torch::jit::Value *output = node->output();

      // Add the values to the value map.
      // The tensors of all the inputs are concatenated into one flat list.
      ValueMap::TensorList input_tensors;
      for (torch::jit::Value *ids : node->inputs()) {
        for (auto tensor : _value_map.tensors(ids)) {
          input_tensors.push_back(tensor);
        }
      }
      if (kind == c10::prim::TupleConstruct) {
        _value_map.setTuple(output, input_tensors);
      } else {
        _value_map.setList(output, input_tensors);
      }
      logging::debug("{} was lowered", nodeToString(node));
    } else if (kind == c10::prim::TupleUnpack ||
               kind == c10::prim::ListUnpack) {
      // Get the torch jit SSA for the input/output values.
      const auto &tensors(_value_map.listTuple(node->input()));
      // Cursor over the flat list of tensors: shared by the lambda and the
      // loop below, it advances by one for each tensor handed out.
      auto tensor_it = tensors.begin();
      // As tuples may be nested, walk recursively to flatten all tensors
      std::function<void(c10::TypePtr, ValueMap::TensorList &)> flattened_tuple;
      flattened_tuple = [&](const c10::TypePtr &type,
                            ValueMap::TensorList &tensorList) {
        switch (type->kind()) {
        case c10::TypeKind::TensorType: {
          ERROR_ON_MSG(tensor_it == tensors.end(),
                       "Not enough tensors to unpack");
          tensorList.push_back(*tensor_it);
          tensor_it++;
          break;
        }
        case c10::TypeKind::TupleType: {
          auto tuple = type->expect<c10::TupleType>();
          for (const auto &elt_type : tuple->elements()) {
            flattened_tuple(elt_type, tensorList);
          }
          break;
        }
        default:
          ERROR("Unsupported type '" << c10::typeKindToString(type->kind()));
        }
      };

      for (auto *output : node->outputs()) {
        switch (output->type()->kind()) {
        case c10::TypeKind::TensorType: {
          ERROR_ON(tensor_it == tensors.end());
          _value_map.setTensor(output, *tensor_it);
          tensor_it++;
          break;
        }
        // NOTE(review): flattened_tuple only accepts tensor and tuple types so
        // a ListType output looks like it would end up in its error branch -
        // confirm whether list outputs can actually reach this point.
        case c10::TypeKind::ListType: // (should only have TensorType)
        case c10::TypeKind::TupleType: {
          ValueMap::TensorList tensor_list;
          flattened_tuple(output->type(), tensor_list);
          _value_map.setTuple(output, tensor_list);
          break;
        }
        default:
          ERROR("Unsupported parameter type '"
                << c10::typeKindToString(output->type()->kind()));
        }
      }
      ERROR_ON_MSG(tensor_it != tensors.end(), "Didn't unpack all the tensors");
      logging::debug("{} was lowered", nodeToString(node));
    } else if (kind == symbols::poptorch::host_side_cast) {
      // Map to the input value since the type will be cast host side
      ERROR_ON_MSG(!_value_map.hasTensor(node->input()),
                   "Input to host side cast has not been registered");

      ERROR_ON_MSG(node->inputs().size() != 1,
                   "Host side cast should only have one input.");

      _value_map.setTensor(node->output(), _value_map.tensor(node->input()));

    } else if (kind == symbols::poptorch::multi_conv_part) {
      std::vector<popart_compiler::TensorId> inputs;
      std::transform(node->inputs().begin(), node->inputs().end(),
                     std::back_inserter(inputs), [&](torch::jit::Value *val) {
                       return _value_map.tensor(val);
                     });

      _compiler.addMultiConvPart(inputs,
                                 node->is(c10::Symbol::attr("dilations")),
                                 node->is(c10::Symbol::attr("kernel_shape")),
                                 node->is(c10::Symbol::attr("pads")),
                                 node->is(c10::Symbol::attr("strides")));

      logging::debug("{} was lowered as component of MultiConv",
                     nodeToString(node));

    } else if (kind == symbols::poptorch::end_multi_conv) {
      // Extract multiconv options that are set as attributes on the
      // end_multi_conv instruction
      // Each option is only forwarded to the compiler if the attribute is
      // present on the node.
      const auto amp = c10::Symbol::attr("available_memory_proportions");
      if (node->hasAttribute(amp)) {
        _compiler.setMultiConvAvailableMemoryProportions(node->fs(amp));
      }

      const auto partials_types = c10::Symbol::attr("partials_types");
      if (node->hasAttribute(partials_types)) {
        _compiler.setMultiConvPartialsTypes(node->is(partials_types));
      }

      const auto conv_ditherings = c10::Symbol::attr("enable_conv_dithering");
      if (node->hasAttribute(conv_ditherings)) {
        _compiler.setMultiConvEnableConvDithering(node->is(conv_ditherings));
      }

      const auto plan_type = c10::Symbol::attr("plan_type");
      if (node->hasAttribute(plan_type)) {
        _compiler.setMultiConvPlanType(node->i(plan_type));
      }

      const auto per_conv_reserved_tiles =
          c10::Symbol::attr("per_conv_reserved_tiles");
      if (node->hasAttribute(per_conv_reserved_tiles)) {
        _compiler.setMultiConvPerConvReservedTiles(
            node->i(per_conv_reserved_tiles));
      }

      const auto cycle_back_off = c10::Symbol::attr("cycle_back_off");
      if (node->hasAttribute(cycle_back_off)) {
        _compiler.setMultiConvCycleBackOff(node->f(cycle_back_off));
      }

      const torch::jit::ArrayRef<torch::jit::Value *> node_outputs =
          node->outputs();
      std::vector<popart_compiler::TensorId> outputs = _compiler.endMultiConv();
      ERROR_ON_MSG(outputs.size() != node_outputs.size(),
                   "Wrong number of outputs for MultiConv. Expected "
                       << node_outputs.size() << " outputs but only received "
                       << outputs.size() << " outputs.");

      for (size_t i = 0; i < outputs.size(); i++) {
        _value_map.setTensor(node_outputs[i], outputs[i]);
      }

      printWasLoweredDebug(node, outputs.front());
    } else if (kind == symbols::poptorch::canonicalised_cpu_call) {
      // CPU callbacks are referenced by an string identifier.
      std::string const id = node->s(c10::Symbol::attr("ID"));

      std::vector<popart_compiler::PopartType> input_types;
      std::vector<std::vector<std::size_t>> input_shapes;

      // Get the torch jit SSA for the input/output values.
      std::vector<popart_compiler::TensorId> inputs;
      std::transform(node->inputs().begin(), node->inputs().end(),
                     std::back_inserter(inputs), [&](torch::jit::Value *val) {
                       // Append type info from the inputs.
                       platformAgnosticTypeInfoFromIRType(val, &input_types,
                                                          &input_shapes);

                       return _value_map.tensor(val);
                     });

      std::vector<popart_compiler::PopartType> output_types;
      std::vector<std::vector<std::size_t>> output_shapes;

      for (torch::jit::Value *value : node->outputs()) {
        platformAgnosticTypeInfoFromIRType(value, &output_types,
                                           &output_shapes);
      }

      popart_compiler::TensorId const first_output_tensor =
          _compiler.addCPUCallback(inputs, _callbacks[id], input_types,
                                   input_shapes, output_types, output_shapes);

      // As for the registered handlers: only the first ID is returned and the
      // other outputs are assumed to have contiguous IDs.
      for (std::uint64_t i = 0; i < node->outputs().size(); ++i) {
        torch::jit::Value *output = node->output(i);
        popart_compiler::TensorId const output_tensor = first_output_tensor + i;
        ERROR_ON_MSG(!_compiler.tensorIdIsValid(output_tensor),
                     "Output " << i << " doesn't exist of Node " << *node);
        _value_map.setTensor(output, output_tensor);
      }
      printWasLoweredDebug(node, first_output_tensor);
    } else if (kind == symbols::poptorch::set_attribute) {
      const std::string &attribute = node->s(c10::Symbol::attr("attribute"));
      const std::string &key = node->s(c10::Symbol::attr("key"));
      const std::string &value = node->s(c10::Symbol::attr("value"));
      _compiler.setAttribute(attribute.c_str(), key.c_str(), value.c_str());
    } else if (kind == symbols::poptorch::clear_attribute) {
      const std::string &attribute = node->s(c10::Symbol::attr("attribute"));
      const std::string &key = node->s(c10::Symbol::attr("key"));
      _compiler.clearAttribute(attribute.c_str(), key.c_str());
    } else {
      ERROR("Couldn't find a registered operation for node " << *node);
    }
  }
}

// Registers every graph input and every used parameter with the PopART
// compiler and records the resulting PopART tensor IDs in _value_map.
//
// - Parameters that have no uses are dropped from the collapsed input list.
// - Each remaining parameter is added as an initialised input tensor (with an
//   extra leading dimension when per-replica settings exist for it), is
//   registered as an updatable named buffer and has its name appended to
//   _parameter_names.
// - Each remaining non-parameter tensor is added as a plain input tensor and
//   its ID is appended to _input_tensor_hooks.
//
// Raises an error if a parameter has no name, or if the collapsed tensors do
// not line up with the types of _graph.inputs().
void LowerToPopartImpl::lowerParameters() {
  // The "true" inputs are a mixture of tuples (which may be nested) and tensors
  // The parameters are all tensors. "_graph.inputs()." contains the inputs
  // first followed by the parameters at the end.

  // This will provide a view of all the tensors in _graph.inputs(), i.e.
  // by collapsing tuples.
  auto graph_t_inputs = collapsedGraphInputHierachy(&_graph);

  // Step 0, remove unused parameters
  // graph_t_inputs is updated but _graph.inputs() will retain unused
  // parameters
  std::vector<bool> parameter_used(graph_t_inputs.size(), true);
  for (size_t i = 0; i < graph_t_inputs.size(); ++i) {
    auto *value = graph_t_inputs[i];
    // Only parameters are ever masked out: an unused *input* is kept.
    if (value->uses().empty() && isParameter(value)) {
      parameter_used.at(i) = false;
      logging::trace("Skipping unused parameter: %{}", value->debugName());
    }
  }
  maskVector(&graph_t_inputs, parameter_used);

  // Step 1, add tensor inputs for all tensors in the hierarchy and obtain
  // the resulting popart IDs. This can be done with collapsed hierarchy.
  ValueMap::TensorList parameter_popart_ids;
  std::vector<torch::jit::Value *> parameter_values;
  size_t input_index = 0;
  size_t param_index = 0;
  for (auto *value : graph_t_inputs) {
    JitTensorInfo info = JitTensorInfo(value);
    const char *popart_type = typeToPopartCStr(info.scalar_type);
    if (isParameter(value)) {
      void *data_ptr = getDataSourceForValue(value);
      // Sanity check: Step 0 should already have filtered these out.
      ERROR_ON_MSG(value->uses().empty(),
                   "Parameter %"
                       << value->debugName()
                       << " isn't used and therefore should have been removed");
      // NOTE(review): if _parameter_names is empty on entry, param_index
      // equals its size at this point on every iteration, so this check can
      // never fire -- confirm the intended invariant.
      ERROR_ON(param_index > _parameter_names.size());
      const std::string name = getParameterName(value);
      ERROR_ON_MSG(name.empty(), "No parameter name available for value %"
                                     << value->debugName());
      _parameter_names.push_back(name);

      popart_compiler::TensorId id;
      PerReplicaSettings pr_settings;
      if (getParameterPerReplica(value, pr_settings)) {
        // Per-replica parameter: the PopART shape is the tensor's own shape
        // prefixed with pr_settings.size0, and the data comes from the
        // per-replica host buffer rather than from data_ptr.
        std::vector<std::int64_t> dims(info.dims.size() + 1);
        dims[0] = pr_settings.size0;
        // NOTE(review): when info.dims is empty, &dims[1] indexes one past
        // the end of a single-element vector (zero bytes are copied) --
        // confirm rank-0 per-replica parameters cannot reach this point.
        memcpy(&dims[1], info.dims.data(),
               info.dims.size() * sizeof(std::int64_t));
        id = _compiler.addInitializedInputTensor(
            name.c_str(), popart_type, dims, pr_settings.host_buffer->data(),
            pr_settings.comm_group_type, pr_settings.shards,
            pr_settings.variable_retrieval_mode);
      } else {
        id = _compiler.addInitializedInputTensor(name.c_str(), popart_type,
                                                 info.dims, data_ptr);
      }
      // Compiler knows which buffers are updatable
      _compiler.registerUpdatableNamedBuffer(id);
      parameter_values.push_back(value);
      parameter_popart_ids.push_back(id);
      param_index++;
    } else {
      // Plain input: use the overlap mode stored as an attribute on the
      // graph's param node for this input index, or "no_overlap" when the
      // attribute is absent.
      auto overlap_symbol = getOverlapSymbol("input", input_index);
      std::string overlap_str("no_overlap");
      if (_graph.param_node()->hasAttribute(overlap_symbol)) {
        overlap_str = _graph.param_node()->s(overlap_symbol);
      }

      const auto id =
          _compiler.addInputTensor(popart_type, info.dims, overlap_str.c_str());
      _input_tensor_hooks.push_back(id);
      input_index++;
    }
  }

  // Step 2, map the PopART tensor IDs to the JIT Value of the (not collapsed)
  // graph inputs
  logging::debug("graph(");
  auto input_tensor_it = _input_tensor_hooks.begin();
  // The value `index` accumulates in this loop is never read: Step 3 resets
  // it to 0 and reuses it as its loop counter.
  size_t index = 0;
  for (torch::jit::Value *value : _graph.inputs()) {
    if (isParameter(value)) {
      // Only process inputs
      continue;
    }
    ERROR_ON(value->node()->kind() != c10::prim::Param);
    // A (possibly nested) tuple input consumes one PopART tensor per tensor
    // it contains; a tensor input consumes exactly one.
    const size_t num_tensors = numTensorsForType(value->type());

    ValueMap::TensorList tensors;
    tensors.reserve(num_tensors);

    for (size_t i = 0; i < num_tensors; i++) {
      ERROR_ON(input_tensor_it == _input_tensor_hooks.end());
      tensors.push_back(*input_tensor_it);
      input_tensor_it++;
    }

    if (value->type()->kind() == c10::TypeKind::TensorType) {
      ERROR_ON(tensors.size() != 1);
      _value_map.setTensor(value, tensors.front());
    } else {
      ERROR_ON(value->type()->kind() != c10::TypeKind::TupleType);
      _value_map.setTuple(value, tensors);
    }

    const auto msg = fmt::format("      input: %{} : {} ->", value->debugName(),
                                 *value->type());
    logging::debug("{} {} [{}]", msg, tensorNames(tensors),
                   tensorTypesAndShapes(tensors));

    index++;
  }

  // Step 3, map the PopART tensor IDs to the JIT Value of the parameters
  for (index = 0; index < parameter_popart_ids.size(); index++) {
    auto *value = parameter_values.at(index);
    auto &tensor(parameter_popart_ids.at(index));

    const auto msg = fmt::format("      param: %{} : {} ->", value->debugName(),
                                 *value->type());
    logging::debug("{} {} [{}]", msg, tensorNames(tensor, 1),
                   tensorTypesAndShapes(tensor, 1));
    _value_map.setTensor(value, tensor);
  }
  logging::debug("  ):");
}

namespace {
// Helper to let us filter string arguments into const char*s. This is to catch
// the std::string produced by some attributes before they cross the ABI
// boundary.

// Fallback overload: an argument that needs no ABI-safe conversion is handed
// back as-is.
template <typename T> T convertType(T &&value) { return value; }

// std::string overload: expose the string's NUL-terminated buffer. The
// pointer aliases `s`, so it is only usable while `s` is alive and unmodified.
const char *convertType(const std::string &s) {
  const char *raw = s.c_str(); // NOLINT
  return raw;
}

// std::vector<std::string> overload: build a parallel vector of C-string
// pointers, in the same order. Every pointer aliases the matching element of
// `s`, so the result must not outlive `s`.
std::vector<const char *> convertType(const std::vector<std::string> &s) {
  std::vector<const char *> c_strings;
  for (const std::string &entry : s) {
    c_strings.push_back(entry.c_str()); // NOLINT
  }
  return c_strings;
}

// std::vector<double> overload: return a std::vector<float> holding every
// element narrowed to single precision, in the same order.
std::vector<float> convertType(const std::vector<double> &v) {
  std::vector<float> narrowed;
  for (const double element : v) {
    narrowed.push_back(static_cast<float>(element));
  }
  return narrowed;
}

// Builds a PopartConstant from the tensor attached to a
// poptorch::tensor_constant node.
//
// The constant is built from the tensor's PopART type, its data pointer
// (tensor.data_ptr()) and its dimensions.
//
// Raises an error if:
//  - the node is not a poptorch::tensor_constant,
//  - the scalar type of the node's output is not known,
//  - the output scalar type differs from the attached tensor's scalar type,
//  - the attached tensor is not contiguous.
popart_compiler::PopartConstant
convertTensorConstantNode(const torch::jit::Node *node) {
  logging::LogContext const ctx("convertTensorConstantNode: processing " +
                                nodeToString(node));

  ERROR_ON_MSG(
      node->kind() != symbols::poptorch::tensor_constant,
      "Only a poptorch::tensor_constant can be converted into a popart "
      "constant");

  // scalarType() is optional: check it before dereferencing instead of
  // relying on the output type always being fully specified.
  const auto maybe_output_type =
      node->output()->type()->expect<c10::TensorType>()->scalarType();
  ERROR_ON_MSG(!maybe_output_type.has_value(),
               "Output of the tensor constant has no scalar type");
  const auto output_type = *maybe_output_type;

  // Fetch the attached tensor once and reuse it for every check below.
  auto tensor = getNodeTensorAttrValue(node);
  const auto tensor_type = tensor.scalar_type();

  ERROR_ON_MSG(output_type != tensor_type, "Output type is "
                                               << c10::toString(output_type)
                                               << " but tensor type is "
                                               << c10::toString(tensor_type));

  ERROR_ON(!tensor.is_contiguous());

  return {toPopartType(tensor_type), tensor.data_ptr(),
          getTensorDimensions(tensor)};
}

// Builds a HostSideConstant from the tensor attached to a
// poptorch::host_side_tensor_constant node.
//
// The constant is built from the tensor's PopART type, its data pointer
// (tensor.data_ptr()), its size in bytes and its dimensions.
//
// Raises an error if the node is of any other kind or if the attached tensor
// is not contiguous.
popart_compiler::HostSideConstant
convertHostSideTensorConstantNode(const torch::jit::Node *node) {
  logging::LogContext const ctx(
      "convertHostSideTensorConstantNode: processing " + nodeToString(node));
  ERROR_ON_MSG(node->kind() != symbols::poptorch::host_side_tensor_constant,
               "Only a poptorch::host_side_tensor_constant can be converted "
               "into a host side constant");

  auto tensor = getNodeTensorAttrValue(node);
  ERROR_ON(!tensor.is_contiguous());

  return {toPopartType(tensor.scalar_type()), tensor.data_ptr(),
          tensor.nbytes(), getTensorDimensions(tensor)};
}

void processListAttribute(
    const char *name,
    const std::shared_ptr<std::vector<popart_compiler::PopartAttribute>>
        &attributes,
    const IPyValue &elements) {
  const auto first_element = elements.getFromList(0);

  if (first_element->isInt()) {
    std::vector<int64_t> ints;
    ints.reserve(elements.getListSize());
    elements.forEachInList([&ints](const IPyValue &int_obj) {
      ints.push_back(int_obj.toInt64());
    });
    attributes->emplace_back(name, ints);
    return;
  }

  if (first_element->isDouble()) {
    std::vector<float> floats;
    floats.reserve(elements.getListSize());
    elements.forEachInList([&floats](const IPyValue &float_obj) {
      floats.push_back(float_obj.toFloatWithRangeCheck());
    });
    attributes->emplace_back(name, floats);
    return;
  }

  if (first_element->isString()) {
    std::vector<std::unique_ptr<char[]>> strs;
    strs.reserve(elements.getListSize());
    elements.forEachInList([&strs](const IPyValue &str) {
      strs.emplace_back(stringToUniquePtr(str.toString()));
    });
    attributes->emplace_back(name, strs);
    return;
  }

  ERROR("Invalid type for Popart attribute.");
}

// Collects the custom-op attributes of `node` into a list of
// PopartAttribute.
//
// The node's "attributes_id" string attribute is passed to
// `attribute_accessor`, which yields a dictionary. Each entry becomes one
// attribute named after its key: int, double and string values are stored
// directly, while sets/lists/tuples go through processListAttribute(). Any
// other value type raises an error.
std::shared_ptr<std::vector<popart_compiler::PopartAttribute>>
convertCustomOpAttributes(const torch::jit::Node *node,
                          const AttributeAccessor &attribute_accessor) {
  logging::LogContext const ctx("convertCustomOpAttributes: processing " +
                                nodeToString(node));
  std::string const attributes_id_str(
      node->s(c10::Symbol::attr("attributes_id")));

  auto py_dict = attribute_accessor(attributes_id_str);
  auto collected =
      std::make_shared<std::vector<popart_compiler::PopartAttribute>>();

  py_dict->forEachInDict([&collected](const IPyValue &key,
                                      const IPyValue &attribute) {
    auto attr_name = key.toString();

    if (attribute.isInt()) {
      collected->emplace_back(attr_name.c_str(), attribute.toInt64());
      return;
    }
    if (attribute.isDouble()) {
      collected->emplace_back(attr_name.c_str(),
                              attribute.toFloatWithRangeCheck());
      return;
    }
    if (attribute.isString()) {
      collected->emplace_back(attr_name.c_str(),
                              stringToUniquePtr(attribute.toString()));
      return;
    }
    if (attribute.isSetListOrTuple()) {
      processListAttribute(attr_name.c_str(), collected, attribute);
      return;
    }
    ERROR("Invalid attribute type");
  });

  return collected;
}
} // namespace

// Constructs the lowering implementation for graph `g`.
//
// Takes ownership of the in-place information, the optimizers and the CPU
// callbacks (all three are moved into the corresponding members), copies the
// anchors (a const rvalue reference cannot be moved from) and creates the
// PopART compiler from `training` and `options`.
//
// It then fills _functionToImplementation, the table mapping each supported
// IR symbol to a lambda which forwards the node's inputs and attributes to
// the matching _compiler method. The table entries are generated by
// expanding "popart_compiler/SupportedOperations.inc.hpp" with the macros
// defined below.
//
// NOTE: the generated lambdas capture by reference ([&]), so they refer to
// this object and to the caller's `attribute_accessor`; both must still be
// alive whenever an entry of the table is invoked.
LowerToPopartImpl::LowerToPopartImpl(
    torch::jit::Graph *g, InplaceGraphInfo &&inplace_info, bool training,
    std::vector<popart_compiler::Optimizer> &&opt,
    const popart_compiler::SessionOptions &options,
    const AttributeAccessor &attribute_accessor, CPUCallbackMap &&callback,
    const AnchorList &&anchors)
    : _graph(*g), _lowered(false), _inplace_info(std::move(inplace_info)),
      // A named rvalue reference is an lvalue: without std::move these two
      // members would be copy-constructed.
      _optimizers(std::move(opt)), _anchors(anchors),
      _compiler(training, options), _callbacks(std::move(callback)) {
  // Init the function implementation map. This map will be populated by
  // elements which look something like:
  /* {"popart::Foo", [&](const std::vector<popart_compiler::TensorId> &inputs,
     torch::jit::Node *node) { return _compiler.foo(inputs,
          node->i("attr::SomeIntegerAttr"),
    node->i("attr::SomeOtherIntegerAttr"), node->is("attr::AnIntArrayAttr"),
    node->f("attr::AFloatAttr"));
      }
    },
  */
  // Essentially this is just a map from the string IR symbol to a function to
  // be called that implements it. Those functions are also autogenerated by the
  // same macros in _compiler.hpp and _compiler.cpp.
  _functionToImplementation = {
// Torch JIT api defines the attribute accessor as the following function names.
#define INT_VEC is
#define FLOAT_VEC fs
#define FLOAT f
#define INT i
#define CHAR i
#define BOOL i
#define STRING s
#define STRING_VEC ss

// Useful NOP macro
#define NONE

// The arguments are processed by extracting the given type using the above
// accessors, the name is converted into "attr::NAME" which is what pytorch JIT
// expects for attribute accessing.
#define ARG(Type, Name) , convertType(node->Type(c10::Symbol::attr(#Name)))

#define POPART_CONST_ARG(unused) , convertTensorConstantNode(node)
#define HOST_SIDE_CONST_ARG(unused)                                            \
  , std::move(convertHostSideTensorConstantNode(node))

#define POPART_ATTRIB_VEC_ARG(unused)                                          \
  , convertCustomOpAttributes(node, attribute_accessor)

#define BODY_ARG(Name) NONE

// Create a function decl with the given call and arguments.
#define OP_DECL(ns, symbolName, function, unused, Args, unused2)               \
  {symbols::ns::symbolName,                                                    \
   [&](const std::vector<popart_compiler::TensorId> &inputs,                   \
       torch::jit::Node *node) {                                               \
     (void)(node);                                                             \
     return _compiler.function(inputs Args);                                   \
   }},

// Same as OP_DECL but for compiler functions which return nothing: the node
// must have no outputs and a default-constructed TensorId is returned.
#define OP_DECL_NO_RETURN(ns, symbolName, function, unused, Args, unused2)     \
  {symbols::ns::symbolName,                                                    \
   [&](const std::vector<popart_compiler::TensorId> &inputs,                   \
       torch::jit::Node *node) {                                               \
     _compiler.function(inputs Args);                                          \
     ERROR_ON_MSG(node->outputs().size() != 0,                                 \
                  "Void return function called on torch::jit::Node which has " \
                  "outputs");                                                  \
     return popart_compiler::TensorId{};                                       \
   }},

#include "popart_compiler/SupportedOperations.inc.hpp"

#undef BODY_STR_ARG
#undef STR_ARG
#undef BODY_ARG
#undef POPART_ATTRIB_VEC_ARG
#undef HOST_SIDE_CONST_ARG
#undef POPART_CONST_ARG
#undef OP_DECL
#undef OP_DECL_NO_RETURN
#undef ARG
#undef NONE
#undef BOOL
#undef CHAR
#undef STRING
#undef STRING_VEC
#undef INT
#undef FLOAT
#undef FLOAT_VEC
#undef INT_VEC
  }; // End map initializer.
}
} // namespace detail

// Creates the implementation object which performs the actual lowering.
//
// `inplace_info`, `opt`, `callbacks` and `anchors` are moved into the
// implementation; `options` and `attribute_accessor` are passed on by const
// reference.
LowerToPopart::LowerToPopart(torch::jit::Graph *graph,
                             InplaceGraphInfo &&inplace_info, bool training,
                             std::vector<popart_compiler::Optimizer> &&opt,
                             const popart_compiler::SessionOptions &options,
                             const AttributeAccessor &attribute_accessor,
                             CPUCallbackMap callbacks, AnchorList &&anchors) {
  // `options` is a const reference: std::move() on it could never move
  // anything, so it is forwarded as-is.
  _impl = std::make_unique<detail::LowerToPopartImpl>(
      graph, std::move(inplace_info), training, std::move(opt), options,
      attribute_accessor, std::move(callbacks), std::move(anchors));
}
// Runs the lowering by delegating to the implementation object.
void LowerToPopart::lower() {
  _impl->lower();
}

// Compiles through the implementation object and returns the executable.
// When logging::outputPopartIR() is true, the executable's PopART IR is
// also written to the debug log.
std::shared_ptr<PoplarExecutable> LowerToPopart::compile() {
  auto compiled = _impl->compile();
  const bool dump_ir = logging::outputPopartIR();
  if (dump_ir) {
    logging::debug("Popart IR: {}", compiled->getPopartIR());
  }
  return compiled;
}

// Forwards to the implementation object, which loads the executable from
// `input_filename`.
std::shared_ptr<PoplarExecutable>
LowerToPopart::loadExecutableFromFile(const std::string &input_filename) {
  auto loaded = _impl->loadExecutableFromFile(input_filename);
  return loaded;
}

// Defaulted destructor, defined out of line in this source file.
// NOTE(review): presumably placed here so that detail::LowerToPopartImpl,
// which _impl points to, is a complete type where it gets destroyed --
// confirm against the class declaration.
LowerToPopart::~LowerToPopart() = default;

// Move constructor: takes over the implementation object of `lower`, which
// is left without one.
LowerToPopart::LowerToPopart(LowerToPopart &&lower)
    : _impl(std::move(lower._impl)) {}

} // namespace poptorch


================================================
FILE: poptorch/source/LowerToPopartFactories.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include "poptorch/LowerToPopartFactories.hpp"

#include <torch/csrc/jit/passes/constant_propagation.h>
#include <torch/csrc/jit/passes/dead_code_elimination.h>
#include <torch/csrc/jit/passes/lower_graph.h>
#include <torch/csrc/jit/passes/lower_tuples.h>
#include <torch/csrc/jit/passes/peephole.h>
#include <torch/script.h>

#include "popart_compiler/Compiler.hpp"
#include "poptorch_logging/Logging.hpp"
#include "poptorch_logging/Tracepoint.hpp"

#include "poptorch/AliasProcessing.hpp"
#include "poptorch/DispatchTracer.hpp"
#include "poptorch/ImplicitCasting.hpp"
#include "poptorch/InplaceOps.hpp"
#include "poptorch/OverlappedIO.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/RequiresGrad.hpp"
#include "poptorch/TypeAndConstantCanonicalization.hpp"
#include "poptorch/Utils.hpp"

namespace poptorch {

// Takes the graph recorded by the dispatcher (getTracedGraph()), runs the
// PopTorch transformation passes over it in a fixed order, and returns a
// LowerToPopart object on which lower() has already been called.
//
// - parser: provides the parsed session options.
// - training: when true, detach operations are added and surplus identity
//   losses are removed before lowering.
// - anchors_list: moved into the returned object; its size is also passed to
//   getInplaceGraphInfo().
// - initCallbackBuffers: invoked once, right before lowering, to allocate the
//   CPU tensors used by CPU op callbacks.
// - optimizers: moved into the returned object.
// - attribute_accessor: forwarded to the LowerToPopart constructor.
// - callbacks: passed as an lvalue to the LowerToPopart constructor, which
//   takes its CPUCallbackMap parameter by value (so the map is copied).
//
// The passes mutate the graph in place and most of them log the resulting
// graph at trace level.
poptorch::LowerToPopart lowerToPopartFromDispatch(
    SessionOptionsParser &parser, bool training, AnchorList &&anchors_list,
    const std::function<void()> &initCallbackBuffers,
    std::vector<popart_compiler::Optimizer> &&optimizers,
    const AttributeAccessor &attribute_accessor, CPUCallbackMap &callbacks) {
  auto &parsed_options = parser.options();
  const std::shared_ptr<torch::jit::Graph> graph = getTracedGraph();
  logging::trace("Initial dispatched graph:\n{}", *graph);

  // Generic clean-up before any PopTorch specific pass runs.
  fixRequiresGradFromDispatch(graph.get());
  torch::jit::EliminateDeadCode(graph);
  torch::jit::PeepholeOptimize(graph);
  logging::trace("Optimised graph:\n{}", *graph);

  // The second argument is true only when the options report both input
  // replication and buffer broadcasting.
  InplaceGraphInfo inplace_info = getInplaceGraphInfo(
      anchors_list.size(), parsed_options.hasInputReplication() &&
                               parsed_options.broadcastBuffers());
  logging::trace("Graph after handling inplace ops:\n{}", *graph);

  poptorch::attributiseOverlappedIO(graph.get());
  logging::trace("Graph after attributising IO overlap specifiers:\n{}",
                 *graph);

  fixForLoopInputs(*graph);

  verifyIfElseBlocksOrder(*graph);

  poptorch::type_and_constant_canonicalization::evaluateConstexprs(graph.get());
  logging::trace("Graph after evaluating constant expressions:\n{}", *graph);

  poptorch::type_and_constant_canonicalization::canonicaliseConstants(
      graph.get());
  logging::trace("Graph after constant canonicalisation:\n{}", *graph);

  poptorch::removeScatterAddIndexExpansion(graph.get());

  poptorch::simplifyGatherWithExpandedIndices(graph.get());

  logging::trace("Graph before PopART canonicalisation:\n{}", *graph);
  poptorch::canonicalize(graph.get());

  logging::trace("Graph before PopART grouping gathers and scatters:\n{}",
                 *graph);

  poptorch::groupScatterReduceAndGatherNodes(graph.get());

  poptorch::annotateSubgraphs(graph.get(), graph->nodes().front());

  // Collapse any `begin_cpu ... end_cpu` sequences into a single node, with the
  // correct inputs & outputs.
  poptorch::cpuOffloadingCleanup(graph.get());

  // With no graph outputs, every node is destroyed. The nodes are visited in
  // reverse order and removed one at a time through the iterator.
  if (graph->outputs().empty()) {
    logging::trace("No outputs, so all nodes cleared");
    for (auto it = graph->nodes().rbegin(); it != graph->nodes().rend(); it++) {
      it.destroyCurrent();
    }
  }

  // TODO(T67295): remove after we use our own dispatch key.
  removeDeadImplicitCasts(graph.get());

  canonicalizeLate(graph.get());
  logging::trace("Graph after PopART canonicalisation:\n{}", *graph);

  if (training) {
    poptorch::addDetachOperations(graph.get());
    poptorch::removeSurplusIdentityLosses(graph.get());
    logging::trace("Graph after adding detach operations:\n{}", *graph);
  }

  // Error the user if any operations couldn't be canonicalised.
  poptorch::errorOnUnsupportedAten(graph.get());

  // Prepare CPU op callbacks, by allocating the CPU tensors where the
  // inputs/outputs will be stored. We have to do this at the last possible
  // moment due to tracing.
  initCallbackBuffers();

  logging::trace("Graph before lowering to PopART:\n{}", *graph);
  poptorch::LowerToPopart lower(
      graph.get(), std::move(inplace_info), training, std::move(optimizers),
      parsed_options, attribute_accessor, callbacks, std::move(anchors_list));

  lower.lower();

  return lower;
}

} // namespace poptorch


================================================
FILE: poptorch/source/OpBuilder.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

#include "popart_compiler/PopartEnums.hpp"
#include "popart_compiler/Utils.hpp"

#include "poptorch_logging/Logging.hpp"

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"

#include "poptorch/Utils.hpp"

#include "PoptorchSymbols.hpp"

#include <ATen/ATen.h>

namespace poptorch {
namespace {

// Returns the scalar type of input number `num` of `node`.
// Raises an error if the node has no such input. The input is expected to be
// a tensor whose scalar type is known.
at::ScalarType scalarTypeFromInput(const torch::jit::Node *node, size_t num) {
  const bool input_exists = num < node->inputs().size();
  ERROR_ON_MSG(!input_exists,
               "Cannot get scalar type from input " << num
                                                    << " as it does not exist");
  const auto tensor_type = node->input(num)->type()->expect<c10::TensorType>();
  return *tensor_type->scalarType();
}

// Holds the source location (file name, line and metadata string) to attach
// to newly created nodes, and lazily turns it into a torch::jit::SourceRange.
//
// The range is rebuilt the first time sourceRange() is called after either
// setter has been used; otherwise the cached range is returned.
class SourceLocation {
public:
  // SourceLocation is considered enabled if a location or metadata
  // has been explicitly set.
  bool isEnabled() const { return _enabled; }

  // Records the file name and line, and marks the cached range as stale.
  void setLocation(const std::string &filename, std::uint64_t line) {
    _enabled = true;
    _dirty = true;
    _filename = filename;
    _line = line;
  }

  // Records the metadata string, and marks the cached range as stale.
  void setMetadata(const std::string &metadata) {
    _enabled = true;
    _dirty = true;
    _metadata = metadata;
  }

  // Returns the source range for the current location, rebuilding it first if
  // a setter was called since the last successful rebuild.
  // Raises an error if a rebuild is needed but no metadata has been set.
  const torch::jit::SourceRange &sourceRange() {
    if (_dirty) {
      ERROR_ON_MSG(
          _metadata.empty(),
          "[Internal] Metadata missing (setCurrentMetadata() missing)");
      // An empty file name is passed on as "no file name".
      c10::optional<std::string> filename;
      if (!_filename.empty()) {
        filename = _filename;
      }
      auto source =
          std::make_shared<torch::jit::Source>(_metadata, filename, _line);
      _source_range = torch::jit::SourceRange(source, 0, 1);
      // Only clear the flag once the range has really been rebuilt: if the
      // check above fails, the next call must report the problem again
      // instead of silently returning the previous (stale) range.
      _dirty = false;
    }
    return _source_range;
  }

private:
  bool _enabled{false};
  bool _dirty{false};
  torch::jit::SourceRange _source_range;
  std::string _metadata;
  std::string _filename;
  // Explicitly initialised so the value never depends on how the object was
  // created (setMetadata() alone does not set it).
  std::uint64_t _line{0};
} current_source_location = {};

} // namespace

// Replaces the current source location with a freshly constructed one, which
// is not enabled and holds no file name or metadata.
void resetCurrentSourceLocation() {
  current_source_location = SourceLocation{};
}

// Sets the file name and line of the current source location from
// `source_location`. When the range carries no file/line/column information,
// an empty file name and line 0 are recorded. The column is not used.
void setCurrentPythonCodeLocation(
    const torch::jit::SourceRange &source_location) {
  const auto location = source_location.file_line_col();

  std::string filename;
  std::uint64_t line = 0;
  if (location) {
    filename = std::get<0>(*location);
    line = std::get<1>(*location);
  }
  current_source_location.setLocation(filename, line);
}

// Stores `metadata` in the current source location (which also marks the
// location as enabled).
void setCurrentMetadata(const std::string &metadata) {
  auto &location = current_source_location;
  location.setMetadata(metadata);
}

// Makes the source range of `node` the current source location: the current
// file/line come from the node's range and the current metadata from the text
// of the range's source (empty when the range has no source).
// Does nothing when no source location has been set yet, since in that case
// the node won't contain any location information.
WithNodeMetadata::WithNodeMetadata(torch::jit::Node *node) {
  if (!current_source_location.isEnabled()) {
    return;
  }

  auto range = node->sourceRange();
  std::string metadata;
  if (range.source()) {
    metadata = range.source()->text_str().str();
  }
  setCurrentPythonCodeLocation(range);
  setCurrentMetadata(metadata);
}

// Clears the current location and metadata again (default source range and
// empty metadata string). Does nothing if the source location is not enabled.
WithNodeMetadata::~WithNodeMetadata() {
  if (!current_source_location.isEnabled()) {
    return;
  }
  setCurrentPythonCodeLocation({});
  setCurrentMetadata("");
}

// Creates (without inserting it in the graph) a node of kind `kind` with
// `num_outputs` outputs.
//
// When an implicit cast mode is requested and there is at least one input,
// the inputs go through implicitCastInputs() and the returned values are used
// as the node's inputs; otherwise `inputs` are used unchanged. If `dtype` is
// set to anything other than Undefined it is stored as the node's "dtype"
// string attribute. Finally the output types are set by
// setNodeOutputsTypes().
torch::jit::Node *createNode(torch::jit::Graph *graph,
                             torch::jit::NodeKind kind,
                             torch::jit::ArrayRef<torch::jit::Value *> inputs,
                             const ImplicitCast implicit_cast,
                             OutputType output_type, size_t num_outputs,
                             c10::optional<at::ScalarType> dtype) {
  torch::jit::Node *created = nullptr;

  const bool cast_inputs =
      implicit_cast != ImplicitCast::None && !inputs.empty();
  if (!cast_inputs) {
    created = graph->create(kind, inputs, num_outputs);
  } else {
    // The log context only covers the casting itself.
    logging::LogContext ctx(std::string("implicitly casting inputs of ") +
                            kind.toQualString());
    auto final_inputs = implicitCastInputs(&inputs, implicit_cast);
    ctx.clear();

    created = graph->create(kind, num_outputs);
    for (auto *value : final_inputs) {
      created->addInput(value);
    }
  }

  if (dtype && *dtype != at::ScalarType::Undefined) {
    created->s_(c10::attr::dtype, scalarTypeToOnnxString(*dtype));
  }

  setNodeOutputsTypes(created, implicit_cast, output_type);
  return created;
}

// Creates a node with createNode() and inserts it in `graph` with
// insertNodeInGraph(). Returns the new node.
torch::jit::Node *
createAndInsertNode(torch::jit::Graph *graph, torch::jit::NodeKind kind,
                    torch::jit::ArrayRef<torch::jit::Value *> inputs,
                    const ImplicitCast implicit_cast, OutputType output_type,
                    size_t num_outputs, c10::optional<at::ScalarType> dtype) {
  torch::jit::Node *inserted = createNode(
      graph, kind, inputs, implicit_cast, output_type, num_outputs, dtype);

  insertNodeInGraph(graph, inserted);
  return inserted;
}

// Inserts `val` as a constant in `graph`, tagged with the current source
// range, and returns the resulting value.
torch::jit::Value *insertConstant(torch::jit::Graph *graph,
                                  const torch::jit::IValue &val) {
  const torch::jit::SourceRange &range = current_source_location.sourceRange();
  return graph->insertConstant(val, range);
}

// Sets the source range of `node` to the current source location's range.
void setSourceRangeToCurrentLocation(torch::jit::Node *node) {
  const torch::jit::SourceRange &range = current_source_location.sourceRange();
  node->setSourceRange(range);
}

// Inserts `new_node` at the graph's current insertion point. The node is
// first tagged with the current source range and afterwards passed to
// setAvailableMemoryAddPossibleInputOp().
void insertNodeInGraph(torch::jit::Graph *graph, torch::jit::Node *new_node) {
  // 1. Tag with the current source location.
  setSourceRangeToCurrentLocation(new_node);
  // 2. Insert.
  graph->insertNode(new_node);
  // 3. Post-insertion bookkeeping.
  setAvailableMemoryAddPossibleInputOp(new_node);
}

// Inserts `new_node` immediately before `insert_point`. The node is first
// tagged with the current source range and afterwards passed to
// setAvailableMemoryAddPossibleInputOp().
void insertNodeBeforeNode(torch::jit::Node *new_node,
                          torch::jit::Node *insert_point) {
  // 1. Tag with the current source location.
  setSourceRangeToCurrentLocation(new_node);
  // 2. Insert.
  new_node->insertBefore(insert_point);
  // 3. Post-insertion bookkeeping.
  setAvailableMemoryAddPossibleInputOp(new_node);
}

// Inserts `new_node` immediately after `insert_point`. The node is first
// tagged with the current source range and afterwards passed to
// setAvailableMemoryAddPossibleInputOp().
void insertNodeAfterNode(torch::jit::Node *new_node,
                         torch::jit::Node *insert_point) {
  // 1. Tag with the current source location.
  setSourceRangeToCurrentLocation(new_node);
  // 2. Insert.
  new_node->insertAfter(insert_point);
  // 3. Post-insertion bookkeeping.
  setAvailableMemoryAddPossibleInputOp(new_node);
}

// Sets the type of the outputs of `node` according to `output_type`.
//
// Except where noted, every output gets a tensor type carrying only the
// resolved scalar type (no device, dimension or requires_grad information):
//  - Unknown: the outputs are left untouched.
//  - AsFirstInput / AsThirdInput: scalar type of input 0 / input 2.
//  - FirstAsFirstInputSecondAlwaysInt: output 0 takes the scalar type of
//    input 0 and output 1 is Int; other outputs are left untouched.
//  - AsImplicitCastPromoted: scalar type of input 1 if `implicit_cast` is
//    ExceptFirst, of input 0 otherwise.
//  - AsDtype / AsDtypeOrAsPromoted: nothing is done for popart::cast nodes;
//    otherwise the type comes from the node's "dtype" attribute (integer or
//    string form) or, when the attribute is missing, from input 0, in which
//    case the attribute is also added to the node.
//  - AlwaysBool / AlwaysFloat / AlwaysInt / AlwaysUint8: Bool / Float / Int /
//    Byte.
// Any other value raises an error.
void setNodeOutputsTypes(torch::jit::Node *node,
                         const ImplicitCast implicit_cast,
                         const OutputType output_type) {
  // Tensor type which only specifies the scalar type.
  const auto scalar_only_type = [](at::ScalarType scalar) {
    return c10::TensorType::create(scalar, c10::nullopt, c10::nullopt,
                                   c10::nullopt);
  };

  at::ScalarType resolved;

  switch (output_type) {
  case OutputType::Unknown:
    return;

  case OutputType::FirstAsFirstInputSecondAlwaysInt:
    node->output(0)->setType(scalar_only_type(scalarTypeFromInput(node, 0)));
    node->output(1)->setType(scalar_only_type(at::ScalarType::Int));
    return;

  case OutputType::AsFirstInput:
    resolved = scalarTypeFromInput(node, 0);
    break;

  case OutputType::AsThirdInput:
    resolved = scalarTypeFromInput(node, 2);
    break;

  case OutputType::AsImplicitCastPromoted:
    if (implicit_cast == ImplicitCast::ExceptFirst) {
      resolved = scalarTypeFromInput(node, 1);
    } else {
      resolved = scalarTypeFromInput(node, 0);
    }
    break;

  case OutputType::AsDtype:
  case OutputType::AsDtypeOrAsPromoted:
    // Cast uses "to" not "dtype" and a string
    if (node->kind() == symbols::popart::cast) {
      // Type is handled in OpBuilder.cpp
      return;
    }

    if (!node->hasAttribute(c10::attr::dtype)) {
      // Without dtype, the input will be the correct type
      resolved = scalarTypeFromInput(node, 0);
      // This may be needed in the lower to popart stage
      node->s_(c10::attr::dtype, scalarTypeToOnnxString(resolved));
    } else if (node->kindOf(c10::attr::dtype) ==
               torch::jit::AttributeKind::i) {
      const auto dtype_as_int = node->i(c10::attr::dtype);
      resolved = onnxStrToScalarType(
          popart_compiler::onnxStrFromDtypeInt(dtype_as_int));
    } else {
      const auto &dtype_as_str = node->s(c10::attr::dtype);
      resolved = onnxStrToScalarType(dtype_as_str.c_str());
    }
    break;

  case OutputType::AlwaysBool:
    resolved = at::ScalarType::Bool;
    break;

  case OutputType::AlwaysFloat:
    resolved = at::ScalarType::Float;
    break;

  case OutputType::AlwaysInt:
    resolved = at::ScalarType::Int;
    break;

  case OutputType::AlwaysUint8:
    resolved = at::ScalarType::Byte;
    break;

  default:
    ERROR("Unsupported output_type in setNodeOutputsTypes");
  }

  for (auto *out : node->outputs()) {
    out->setType(scalar_only_type(resolved));
  }
}

// Creates and inserts a constant node holding tensor `t`.
// The kind of node depends on `constant_use`:
//  - HostSideOnly      -> poptorch::host_side_tensor_constant
//  - PopARTOnly        -> poptorch::tensor_constant
//  - HostSideAndPopART -> poptorch::host_and_ipu_side_tensor_constant
// The output type of the node is inferred from `t`, and `t` is attached to
// the node as its tensor attribute.
torch::jit::Node *tensorToConstant(torch::jit::Graph *graph,
                                   const at::Tensor &t,
                                   UseOfNode constant_use) {
  c10::Symbol constant_kind;
  if (constant_use == UseOfNode::HostSideOnly) {
    constant_kind = symbols::poptorch::host_side_tensor_constant;
  } else if (constant_use == UseOfNode::PopARTOnly) {
    constant_kind = symbols::poptorch::tensor_constant;
  } else if (constant_use == UseOfNode::HostSideAndPopART) {
    constant_kind = symbols::poptorch::host_and_ipu_side_tensor_constant;
  }

  torch::jit::Node *constant_node = createAndInsertNode(graph, constant_kind);
  constant_node->output()->inferTypeFrom(t);
  setNodeTensorAttrValue(constant_node, t);

  return constant_node;
}

/*
 * Manually added operation.
 *
 * Creates and inserts a popart::reshape_static_shape node which reshapes A
 * to new_shape. The shape is stored in the node's "shape" attribute and the
 * output type is the type of A with its sizes replaced by new_shape.
 */
torch::jit::Node *createReshape(torch::jit::Graph *graph, torch::jit::Value *A,
                                const std::vector<int64_t> &new_shape) {
  torch::jit::Node *reshape =
      createAndInsertNode(graph, symbols::popart::reshape_static_shape, {A});
  reshape->is_(c10::attr::shape, new_shape);

  const auto input_type = A->type()->expect<c10::TensorType>();
  reshape->output()->setType(input_type->withSizes(new_shape));
  return reshape;
}

// Creates and inserts a constant node holding a contiguous tensor of shape
// `new_shape` and scalar type `scalar_type`, whose elements are the values of
// `data` converted to T.
//
// `data` must either contain exactly one value, which is then used for every
// element of the tensor, or exactly as many values as the tensor has
// elements; any other size raises an error.
template <typename T, typename U>
torch::jit::Node *createConstant(torch::jit::Graph *graph,
                                 const std::vector<U> &data,
                                 const std::vector<int64_t> &new_shape,
                                 at::ScalarType scalar_type) {
  // std::accumulate accumulates in the type of its initial value: it has to
  // be a 64-bit integer (and not the literal 1, which is an int) for the
  // product of the dimensions not to be truncated.
  const auto total_size = static_cast<std::size_t>(
      std::accumulate(new_shape.cbegin(), new_shape.cend(), std::int64_t{1},
                      std::multiplies<std::int64_t>()));

  // A stride of 0 makes every element read data[0] (single value case).
  size_t stride = 0;
  if (data.size() != 1) {
    ERROR_ON(total_size != data.size());
    stride = 1;
  }
  auto t = at::empty(
      {new_shape},
      at::dtype(scalar_type).memory_format(c10::MemoryFormat::Contiguous));

  auto *t_data = t.data_ptr<T>();
  for (size_t i = 0; i < total_size; i++) {
    t_data[i] = static_cast<T>(data[i * stride]); // NOLINT
  }

  return tensorToConstant(graph, t);
}

// Creates a constant node of scalar type Int (32-bit elements) of shape
// `new_shape` from the 64-bit values in `data`.
torch::jit::Node *
createConstantInt(torch::jit::Graph *graph,
                  const std::vector<std::int64_t> &data,
                  const std::vector<std::int64_t> &new_shape) {
  constexpr at::ScalarType element_type = at::ScalarType::Int;
  return createConstant<std::int32_t>(graph, data, new_shape, element_type);
}

// Creates a constant node of scalar type Long (64-bit elements) of shape
// `new_shape` from the values in `data`.
torch::jit::Node *
createConstantLong(torch::jit::Graph *graph,
                   const std::vector<std::int64_t> &data,
                   const std::vector<std::int64_t> &new_shape) {
  constexpr at::ScalarType element_type = at::ScalarType::Long;
  return createConstant<std::int64_t>(graph, data, new_shape, element_type);
}

// Creates a constant node of scalar type Float (32-bit elements) of shape
// `new_shape` from the double values in `data`.
torch::jit::Node *
createConstantFloat32(torch::jit::Graph *graph, const std::vector<double> &data,
                      const std::vector<std::int64_t> &new_shape) {
  constexpr at::ScalarType element_type = at::ScalarType::Float;
  return createConstant<float>(graph, data, new_shape, element_type);
}

// Creates a floating point constant node of shape `new_shape` from `data`.
// The constant is created as Float; if the scalar type of `t` is Half, the
// tensor attached to the node is converted to Half and the node's output type
// is re-inferred from the converted tensor.
torch::jit::Node *
createConstantFloatLike(torch::jit::Graph *graph, torch::jit::Value *t,
                        const std::vector<double> &data,
                        const std::vector<std::int64_t> &new_shape) {
  const auto reference_type = t->type()->expect<c10::TensorType>();
  at::ScalarType const wanted = *reference_type->scalarType();

  torch::jit::Node *constant = createConstantFloat32(graph, data, new_shape);
  if (wanted != at::ScalarType::Half) {
    return constant;
  }

  auto as_half = getNodeTensorAttrValue(constant).to(wanted);
  setNodeTensorAttrValue(constant, as_half);
  constant->output()->inferTypeFrom(as_half);
  return constant;
}

// Creates a cast of A to the type named by the ONNX type string `type`.
torch::jit::Node *createInternalCast(torch::jit::Graph *graph,
                                     torch::jit::Value *A,
                                     const std::string &type) {
  // Translate the ONNX name into a scalar type, then let the scalar type
  // overload of createCast() build the node.
  c10::ScalarType const target = onnxStrToScalarType(type.c_str());
  torch::jit::Node *cast = createCast(graph, A, target);
  return cast;
}

// Creates a cast of A to `scalar_type`. The output type of the new node is
// the tensor type of A with its scalar type replaced by `scalar_type`.
torch::jit::Node *createCast(torch::jit::Graph *graph, torch::jit::Value *A,
                             c10::ScalarType scalar_type) {
  // The underlying cast takes the target type as an ONNX type string.
  std::string const onnx_name = scalarTypeToOnnxString(scalar_type);
  auto *cast = createCast(graph, {A}, onnx_name);

  const auto input_type = A->type()->expect<c10::TensorType>();
  cast->output()->setType(input_type->withScalarType(scalar_type));
  return cast;
}

// Converts a PyTorch-style padding list into the layout PopART expects.
//
// PyTorch lists the padding starting from the last dimension,
//   (beginN, endN, ..., begin1, end1),
// and may leave out the leading dimensions. PopART requires a padding for
// every dimension, ordered as
//   (begin1, ..., beginN, end1, ..., endN).
// Dimensions without an explicit padding get zero for both begin and end.
//
// tensor_shape: shape of the tensor being padded; only its rank is used.
// pad_shape:    PyTorch padding list. It is resized to 2 * rank entries, so
//               missing entries become zero and surplus entries are dropped.
//
// Returns the 2 * rank PopART padding values.
static std::vector<std::int64_t>
convertPytorchPads(const std::vector<int64_t> &tensor_shape,
                   std::vector<int64_t> pad_shape) {
  const std::size_t rank = tensor_shape.size();

  // One (begin, end) pair per dimension, zero where unspecified.
  pad_shape.resize(rank * 2, 0);

  // The pair for dimension `dim` (counted from the front) is stored at
  // position (rank - 1 - dim) of the PyTorch list. The mapping is written
  // with explicit indices: deriving an element's index from its address
  // inside an algorithm predicate relies on how the algorithm happens to
  // relocate elements while it runs.
  std::vector<std::int64_t> popart_pads(rank * 2);
  for (std::size_t dim = 0; dim < rank; ++dim) {
    const std::size_t torch_idx = 2 * (rank - 1 - dim);
    popart_pads[dim] = pad_shape[torch_idx];
    popart_pads[rank + dim] = pad_shape[torch_idx + 1];
  }

  return popart_pads;
}

// Creates a poptorch::constant_pad node padding `A` with the value
// `constant`.
//
// pad_shape is used as-is when direct_pad_shape_input is true; otherwise it
// is treated as a PyTorch padding list and converted with
// convertPytorchPads. The paddings and the fill value are passed to the node
// as constant tensors (int64 and float32 respectively).
torch::jit::Node *createConstantPad(torch::jit::Graph *graph,
                                    torch::jit::Value *A,
                                    const std::vector<int64_t> &pad_shape,
                                    float constant,
                                    bool direct_pad_shape_input) {
  std::vector<int64_t> popart_pads;
  if (direct_pad_shape_input) {
    popart_pads = pad_shape;
  } else {
    popart_pads = convertPytorchPads(shapeFromTensor(A), pad_shape);
  }

  // 1-D constant holding every padding value.
  const auto num_pads = static_cast<int64_t>(popart_pads.size());
  torch::jit::Value *pads =
      createConstantLong(graph, popart_pads, {num_pads})->output();

  // Single element constant holding the fill value.
  torch::jit::Value *fill_value =
      createConstantFloat32(graph, {constant}, {1})->output();

  return createAndInsertNode(graph, symbols::poptorch::constant_pad,
                             {A, pads, fill_value}, ImplicitCast::None,
                             OutputType::AsFirstInput);
}

// Wraps `data` in a 1-D integer constant (created with createConstantInt)
// and returns the output value of that constant node.
torch::jit::Value *wrapInConstantVec(torch::jit::Graph *graph,
                                     const std::vector<int64_t> &data) {
  const auto length = static_cast<std::int64_t>(data.size());
  torch::jit::Node *constant_node = createConstantInt(graph, data, {length});
  return constant_node->output();
}

// Creates a poptorch::edge_pad node for `A`.
//
// pad_shape is a PyTorch padding list; it is converted with
// convertPytorchPads and handed to the node as a 1-D int64 constant.
torch::jit::Node *createEdgePad(torch::jit::Graph *graph, torch::jit::Value *A,
                                const std::vector<int64_t> &pad_shape) {

  const auto converted_pad_shape =
      convertPytorchPads(shapeFromTensor(A), pad_shape);

  auto *pads_tensor =
      createConstantLong(graph, converted_pad_shape,
                         {static_cast<int64_t>(converted_pad_shape.size())})
          ->output();
  return createAndInsertNode(graph, symbols::poptorch::edge_pad,
                             {A, pads_tensor}, ImplicitCast::None,
                             OutputType::AsFirstInput);
}

// Creates a poptorch::reflection_pad node for `A`.
//
// pad_shape is a PyTorch padding list; it is converted with
// convertPytorchPads and handed to the node as a 1-D int64 constant.
torch::jit::Node *createReflectionPad(torch::jit::Graph *graph,
                                      torch::jit::Value *A,
                                      const std::vector<int64_t> &pad_shape) {
  const std::vector<int64_t> popart_pads =
      convertPytorchPads(shapeFromTensor(A), pad_shape);
  const auto num_pads = static_cast<int64_t>(popart_pads.size());

  torch::jit::Value *pads =
      createConstantLong(graph, popart_pads, {num_pads})->output();

  return createAndInsertNode(graph, symbols::poptorch::reflection_pad,
                             {A, pads}, ImplicitCast::None,
                             OutputType::AsFirstInput);
}

// Creates a poptorch::add_not_in_place node for `A` and `B`, with implicit
// casting enabled for all inputs and the output typed as the promoted type.
torch::jit::Node *createAddNotInPlace(torch::jit::Graph *graph,
                                      torch::jit::Value *A,
                                      torch::jit::Value *B) {
  return createAndInsertNode(graph, symbols::poptorch::add_not_in_place,
                             {A, B}, ImplicitCast::All,
                             OutputType::AsImplicitCastPromoted);
}

// Creates a poptorch::custom_operation node.
//
// The node receives `inputs` unchanged (no implicit casts) and gets
// `numOutputs` outputs. With a single output the output type follows the
// first input; with several outputs it is left unknown. The operator name,
// domain, domain version, number of outputs and attributes id are stored as
// node attributes.
torch::jit::Node *
createCustomOperation(torch::jit::Graph *graph,
                      const std::vector<torch::jit::Value *> &inputs,
                      const std::string &name, const std::string &domain,
                      std::int64_t domainVersion, std::int64_t numOutputs,
                      const std::string &attributes_id_str) {
  OutputType output_type = OutputType::AsFirstInput;
  if (numOutputs > 1) {
    output_type = OutputType::Unknown;
  }

  torch::jit::Node *custom_op =
      createAndInsertNode(graph, symbols::poptorch::custom_operation, inputs,
                          ImplicitCast::None, output_type, numOutputs);

  custom_op->s_(c10::Symbol::attr("name"), name);
  custom_op->s_(c10::Symbol::attr("domain"), domain);
  custom_op->i_(c10::Symbol::attr("version"), domainVersion);
  custom_op->i_(c10::Symbol::attr("num_outputs"), numOutputs);
  custom_op->s_(c10::Symbol::attr("attributes_id"), attributes_id_str);

  return custom_op;
}

// Creates a poptorch::add_untyped_input_tensor node taking `input` as its
// only input.
torch::jit::Node *createAddUntypedInputTensor(torch::jit::Graph *graph,
                                              torch::jit::Value *input) {
  return createAndInsertNode(
      graph, symbols::poptorch::add_untyped_input_tensor, {input});
}

// Creates a poptorch::addOutputTensor node with no outputs for `output`.
//
// The node is only created, not inserted into the graph: the caller is
// expected to place it relative to another node.
torch::jit::Node *createAddOutputTensor(torch::jit::Graph *graph,
                                        torch::jit::Value *output) {
  constexpr std::size_t num_outputs = 0;
  return graph->create(symbols::poptorch::addOutputTensor, {output},
                       num_outputs);
}

// Creates a poptorch::start_for_loop node taking `inputs` as input. The node
// has no outputs.
torch::jit::Node *createStartForLoop(torch::jit::Graph *graph,
                                     torch::jit::Value *inputs) {
  return createAndInsertNode(graph, symbols::poptorch::start_for_loop, inputs,
                             ImplicitCast::None, OutputType::Unknown, 0);
}

// Creates a poptorch::end_for_loop node taking `outputs` and `inputs`.
//
// Two attributes are attached: "trip_count", and "num_outputs", which is the
// number of inputs of the node producing `outputs`.
torch::jit::Node *createEndForLoop(torch::jit::Graph *graph,
                                   torch::jit::Value *outputs,
                                   torch::jit::Value *inputs,
                                   std::int64_t trip_count) {
  torch::jit::Node *loop_end = createAndInsertNode(
      graph, symbols::poptorch::end_for_loop, {outputs, inputs});
  loop_end->i_(c10::Symbol::attr("trip_count"), trip_count);

  torch::jit::Node *outputs_producer = outputs->node();
  const std::size_t num_outputs = outputs_producer->inputs().size();
  loop_end->i_(c10::Symbol::attr("num_outputs"), num_outputs);

  return loop_end;
}

// Creates a poptorch::start_if_block node taking `condition` as input. The
// node has no outputs.
torch::jit::Node *createStartIfBlock(torch::jit::Graph *graph,
                                     torch::jit::Value *condition) {
  return createAndInsertNode(graph, symbols::poptorch::start_if_block,
                             condition, ImplicitCast::None,
                             OutputType::Unknown, 0);
}

// Creates a poptorch::start_else_block node taking `outputs_then` as input.
// The node has no outputs.
torch::jit::Node *createStartElseBlock(torch::jit::Graph *graph,
                                       torch::jit::Value *outputs_then) {
  return createAndInsertNode(graph, symbols::poptorch::start_else_block,
                             outputs_then, ImplicitCast::None,
                             OutputType::Unknown, 0);
}

// Creates a poptorch::end_if_block node taking `outputs_else` and
// `condition`.
//
// The "num_outputs" attribute is set to the number of inputs of the node
// producing `outputs_else`.
torch::jit::Node *createEndIfBlock(torch::jit::Graph *graph,
                                   torch::jit::Value *outputs_else,
                                   torch::jit::Value *condition) {
  torch::jit::Node *if_end = createAndInsertNode(
      graph, symbols::poptorch::end_if_block, {outputs_else, condition});

  torch::jit::Node *outputs_producer = outputs_else->node();
  const std::size_t num_outputs = outputs_producer->inputs().size();
  if_end->i_(c10::Symbol::attr("num_outputs"), num_outputs);

  return if_end;
}

// Creates a poptorch::random_normal node with the given shape, mean and
// scale attributes.
//
// possible_inputs are handed to createAndInsertNode together with `dataType`
// (output type mode AsDtypeOrAsPromoted) and are removed from the node again
// before returning, so the returned node has none of them as inputs.
torch::jit::Node *
createRandomNormal(torch::jit::Graph *graph,
                   const std::vector<torch::jit::Value *> &possible_inputs,
                   const std::vector<int64_t> &shape, float mean, float scale,
                   at::ScalarType dataType) {
  torch::jit::Node *normal_node = createAndInsertNode(
      graph, symbols::poptorch::random_normal, possible_inputs,
      ImplicitCast::All, OutputType::AsDtypeOrAsPromoted, 1, dataType);

  normal_node->is_(c10::attr::shape, shape);
  normal_node->f_(c10::attr::mean, mean);
  normal_node->f_(c10::attr::scale, scale);

  // The inputs are no longer needed: drop one leading input per entry of
  // possible_inputs.
  std::size_t remaining = possible_inputs.size();
  while (remaining > 0) {
    normal_node->removeInput(0);
    --remaining;
  }

  return normal_node;
}

// Creates a poptorch::random_uniform node with the given shape, high and low
// attributes.
//
// possible_input may be null. When it is set, it is handed to
// createAndInsertNode together with `dataType` (output type mode
// AsDtypeOrAsPromoted) and removed from the node again before returning.
torch::jit::Node *createRandomUniform(torch::jit::Graph *graph,
                                      torch::jit::Value *possible_input,
                                      const std::vector<int64_t> &shape,
                                      float high, float low,
                                      at::ScalarType dataType) {
  const bool has_input = possible_input != nullptr;

  std::vector<torch::jit::Value *> node_inputs;
  if (has_input) {
    node_inputs.push_back(possible_input);
  }

  torch::jit::Node *uniform_node = createAndInsertNode(
      graph, symbols::poptorch::random_uniform, node_inputs,
      ImplicitCast::None, OutputType::AsDtypeOrAsPromoted, 1, dataType);

  uniform_node->is_(c10::attr::shape, shape);
  uniform_node->f_(c10::attr::high, high);
  uniform_node->f_(c10::attr::low, low);

  // The input is no longer needed once the node exists.
  if (has_input) {
    uniform_node->removeInput(0);
  }

  return uniform_node;
}

// Creates a poptorch::canonicalised_cpu_call node standing in for
// `original_node`.
//
// The new node takes `value` as inputs, has as many outputs as
// `original_node`, stores `id` in its "ID" attribute and copies the metadata
// of every output of `original_node` onto its own matching output.
torch::jit::Node *createCallCpuOp(torch::jit::Graph *graph,
                                  const std::vector<torch::jit::Value *> &value,
                                  const std::string &id,
                                  torch::jit::Node *original_node) {
  const std::uint32_t num_outputs = original_node->outputs().size();

  torch::jit::Node *cpu_call = createAndInsertNode(
      graph, symbols::poptorch::canonicalised_cpu_call, {value},
      ImplicitCast::None, OutputType::AsDtypeOrAsPromoted, num_outputs);
  cpu_call->s_(c10::Symbol::attr("ID"), id);

  std::uint32_t idx = 0;
  while (idx < num_outputs) {
    cpu_call->output(idx)->copyMetadata(original_node->output(idx));
    ++idx;
  }

  return cpu_call;
}
// Creates a poptorch::set_available_memory node for `value`, storing
// `proportion` in the "availableMemoryProportion" attribute. The output of
// the node is given the same type as `value`.
torch::jit::Node *createSetAvailableMemory(torch::jit::Graph *graph,
                                           torch::jit::Value *value,
                                           float proportion) {
  torch::jit::Node *memory_node = createAndInsertNode(
      graph, symbols::poptorch::set_available_memory, value);

  memory_node->f_(c10::Symbol::attr("availableMemoryProportion"), proportion);
  memory_node->output()->setType(value->type());

  return memory_node;
}

// Creates a poptorch::set_attribute node (no inputs, no outputs) carrying the
// "attribute", "key" and "value" string attributes.
//
// When insert_after_insertion_pnt is true the node is created with
// createNode and placed after the graph's current insertion point; otherwise
// createAndInsertNode decides where it goes.
torch::jit::Node *createSetAttribute(torch::jit::Graph *graph,
                                     const std::string &attribute,
                                     const std::string &key,
                                     const std::string &value,
                                     bool insert_after_insertion_pnt) {
  torch::jit::Node *attr_node = nullptr;
  if (!insert_after_insertion_pnt) {
    attr_node = createAndInsertNode(graph, symbols::poptorch::set_attribute,
                                    {}, ImplicitCast::None,
                                    OutputType::Unknown, 0);
  } else {
    attr_node = createNode(graph, symbols::poptorch::set_attribute, {},
                           ImplicitCast::None, OutputType::Unknown, 0);
    insertNodeAfterNode(attr_node, graph->insertPoint());
  }

  attr_node->s_(c10::Symbol::attr("attribute"), attribute);
  attr_node->s_(c10::Symbol::attr("key"), key);
  attr_node->s_(c10::Symbol::attr("value"), value);

  return attr_node;
}

// Creates a poptorch::clear_attribute node (no inputs, no outputs) carrying
// the "attribute" and "key" string attributes.
//
// When insert_after_insertion_pnt is true the node is created with
// createNode and placed after the graph's current insertion point; otherwise
// createAndInsertNode decides where it goes.
torch::jit::Node *createClearAttribute(torch::jit::Graph *graph,
                                       const std::string &attribute,
                                       const std::string &key,
                                       bool insert_after_insertion_pnt) {
  torch::jit::Node *attr_node = nullptr;
  if (!insert_after_insertion_pnt) {
    attr_node = createAndInsertNode(graph, symbols::poptorch::clear_attribute,
                                    {}, ImplicitCast::None,
                                    OutputType::Unknown, 0);
  } else {
    attr_node = createNode(graph, symbols::poptorch::clear_attribute, {},
                           ImplicitCast::None, OutputType::Unknown, 0);
    insertNodeAfterNode(attr_node, graph->insertPoint());
  }

  attr_node->s_(c10::Symbol::attr("attribute"), attribute);
  attr_node->s_(c10::Symbol::attr("key"), key);

  return attr_node;
}

// Creates a poptorch::set_matmul_serialization node for `matmul`.
//
// The mode, factor and keep_precision settings are stored as node attributes
// (keep_precision as an integer attribute) and the output of the node is
// given the same type as `matmul`.
torch::jit::Node *createSetMatMulSerialization(torch::jit::Graph *graph,
                                               torch::jit::Value *matmul,
                                               const std::string &mode,
                                               int64_t factor,
                                               bool keep_precision) {
  torch::jit::Node *serialization_node = createAndInsertNode(
      graph, symbols::poptorch::set_matmul_serialization, {matmul});

  // There is no boolean attribute setter used here: store the flag as an int.
  const auto keep_precision_attr =
      static_cast<torch::jit::IntAttr::ConstructorType>(keep_precision);

  serialization_node->s_(c10::Symbol::attr("mode"), mode);
  serialization_node->i_(c10::Symbol::attr("factor"), factor);
  serialization_node->i_(c10::Symbol::attr("keep_precision"),
                         keep_precision_attr);

  serialization_node->output()->setType(matmul->type());

  return serialization_node;
}

// Creates a "poptorch::begin_ipu_block" node (no inputs, no outputs) with the
// integer attributes "stage", "phase" and "ipu".
torch::jit::Node *createBeginIpuBlock(torch::jit::Graph *graph,
                                      std::uint64_t stage_id,
                                      std::int64_t phase, std::int64_t ipu_id) {
  const auto block_symbol =
      c10::Symbol::fromQualString("poptorch::begin_ipu_block");

  torch::jit::Node *block_node = createAndInsertNode(
      graph, block_symbol, {}, ImplicitCast::None, OutputType::Unknown, 0);

  block_node->i_(c10::Symbol::attr("stage"), stage_id);
  block_node->i_(c10::Symbol::attr("phase"), phase);
  block_node->i_(c10::Symbol::attr("ipu"), ipu_id);

  return block_node;
}

// Creates a poptorch::optimizer_group node (no outputs) taking
// `list_of_params` as inputs and storing `group` in the "group" attribute.
torch::jit::Node *
createOptimizerGroup(torch::jit::Graph *graph, std::uint64_t group,
                     const std::vector<torch::jit::Value *> &list_of_params) {
  torch::jit::Node *group_node = createAndInsertNode(
      graph, symbols::poptorch::optimizer_group, list_of_params,
      ImplicitCast::None, OutputType::Unknown, 0);

  group_node->i_(c10::Symbol::attr("group"), group);
  return group_node;
}

// Creates a poptorch::recomputation_checkpoint node for `value`; the output
// is typed like the input.
torch::jit::Node *createRecomputationCheckpoint(torch::jit::Graph *graph,
                                                torch::jit::Value *value) {
  torch::jit::Node *checkpoint = createAndInsertNode(
      graph, symbols::poptorch::recomputation_checkpoint, {value},
      ImplicitCast::None, OutputType::AsFirstInput);
  return checkpoint;
}

// Creates a poptorch::unfold node for `value`, storing `dimension`, `size`
// and `step` as integer attributes ("attr::dimension", "attr::size",
// "attr::step").
torch::jit::Node *createUnfold(torch::jit::Graph *graph,
                               torch::jit::Value *value, int64_t dimension,
                               int64_t size, int64_t step) {
  torch::jit::Node *unfold_node =
      createAndInsertNode(graph, symbols::poptorch::unfold, {value},
                          ImplicitCast::None, OutputType::AsFirstInput);

  const auto dimension_attr = c10::Symbol::fromQualString("attr::dimension");
  unfold_node->i_(dimension_attr, dimension);

  const auto size_attr = c10::Symbol::fromQualString("attr::size");
  unfold_node->i_(size_attr, size);

  const auto step_attr = c10::Symbol::fromQualString("attr::step");
  unfold_node->i_(step_attr, step);

  return unfold_node;
}

// Creates a poptorch::multi_conv_part node mirroring `conv_node`.
//
// conv_node must be a popart::conv node (an error is raised otherwise). The
// new node takes the same inputs, receives a copy of the attributes of
// conv_node and its output is given the type of conv_node's output.
torch::jit::Node *createMultiConvPart(torch::jit::Graph *graph,
                                      torch::jit::Node *conv_node) {
  ERROR_ON_MSG(conv_node->kind() != symbols::popart::conv,
               "Can only create multi_conv_part from conv node");

  torch::jit::Node *created = createAndInsertNode(
      graph, symbols::poptorch::multi_conv_part, conv_node->inputs(),
      ImplicitCast::All, OutputType::AsImplicitCastPromoted);

  // Continue with whatever node copyAttributes hands back.
  torch::jit::Node *part_node = created->copyAttributes(*conv_node);
  part_node->output()->setType(conv_node->output()->type());
  return part_node;
}

// Creates a poptorch::gru node with two outputs from `args`, with implicit
// casting enabled for all inputs, and stores `hidden_size` as an attribute.
torch::jit::Node *createGru(torch::jit::Graph *graph,
                            const std::vector<torch::jit::Value *> &args,
                            int64_t hidden_size) {
  torch::jit::Node *gru_node = createAndInsertNode(
      graph, symbols::poptorch::gru, args, ImplicitCast::All,
      OutputType::AsImplicitCastPromoted, 2);

  gru_node->i_(c10::attr::hidden_size, hidden_size);
  return gru_node;
}

// Creates a poptorch::rnn node with two outputs from `args`, with implicit
// casting enabled for all inputs, and stores `activations` in the
// "activations" string-list attribute.
torch::jit::Node *createRnn(torch::jit::Graph *graph,
                            const std::vector<torch::jit::Value *> &args,
                            const std::vector<std::string> &activations) {
  torch::jit::Node *rnn_node = createAndInsertNode(
      graph, symbols::poptorch::rnn, args, ImplicitCast::All,
      OutputType::AsImplicitCastPromoted, 2);

  rnn_node->ss_(c10::Symbol::attr("activations"), activations);
  return rnn_node;
}

// Creates a poptorch::prelu node from `self` and `weight`; no implicit casts
// are applied and the output is typed like the first input.
torch::jit::Node *createPrelu(torch::jit::Graph *graph, torch::jit::Value *self,
                              torch::jit::Value *weight) {
  return createAndInsertNode(graph, symbols::poptorch::prelu, {self, weight},
                             ImplicitCast::None, OutputType::AsFirstInput);
}

/*
 * Auto generated operation.
 */
#include "CompilerOps.cpp.inc"

} // namespace poptorch


================================================
FILE: poptorch/source/OverlappedIO.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/ir.h>

#include <algorithm>

#include "PoptorchSymbols.hpp"
#include "poptorch/DispatchTracer.hpp"
#include "poptorch/OverlappedIO.hpp"
#include "poptorch/Utils.hpp"

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {

namespace {
// Turns poptorch::set_overlap_for_input nodes applied to graph inputs into
// string attributes on the graph's param node.
//
// For every non-parameter graph input whose single use is a
// set_overlap_for_input node, the string constant feeding input 1 of that
// node is stored on the param node under getOverlapSymbol("input", index),
// where index counts the non-parameter inputs from zero. The marker node is
// bypassed (its output uses are redirected to the graph input) and queued for
// destruction.
//
// graph:                      graph to process.
// to_erase_output_and_delete: receives string constant nodes which have no
//                             uses left once the marker dropped them.
// to_delete:                  receives the set_overlap_for_input nodes which
//                             have been handled.
//
// Raises an error if a set_overlap_for_input node is one of several uses of
// an input.
void attributiseOverlappedInputs(
    torch::jit::Graph *graph,
    std::set<torch::jit::Node *> *to_erase_output_and_delete,
    std::vector<torch::jit::Node *> *to_delete) {
  logging::LogContext const ctx("attributiseOverlappedInputs");

  // Index among the non-parameter inputs; incremented before use so the
  // first such input gets index 0.
  int64_t input_num = -1;
  for (auto *input : graph->inputs()) {
    if (isParameter(input)) {
      continue;
    }

    input_num++;
    // Local list of uses: the graph is modified further down.
    auto input_uses = input->uses();

    if (input_uses.empty()) {
      continue;
    }

    auto &user(input_uses[0].user);

    if ((input_uses.size() == 1) &&
        (user->kind() == poptorch::symbols::poptorch::set_overlap_for_input)) {
      // Input 1 of the marker must be a constant holding the overlap mode
      // string.
      auto *value_node = user->input(1)->node();
      ERROR_ON(value_node->kind() != c10::prim::Constant);
      const auto &value_str = value_node->s(c10::attr::value);
      graph->param_node()->s_(getOverlapSymbol("input", input_num), value_str);
      to_delete->push_back(user);
      // Drop the marker's use of the constant so the check below sees whether
      // anything else still uses it.
      user->removeInput(1);

      // String constant may be shared
      if (value_node->output()->uses().empty()) {
        to_erase_output_and_delete->insert(value_node);
      }

      // Bypass the marker: its consumers now read the graph input directly.
      user->output()->replaceAllUsesWith(input);
      continue;
    }

    // This should be the only op
    for (const auto &other_use : input_uses) {
      ERROR_ON_MSG(
          other_use.user->kind() ==
              poptorch::symbols::poptorch::set_overlap_for_input,
          "poptorch.set_overlap_for_input must be the only op applied to an "
          "input. This is not the case for input "
              << input->debugName() << " to the model.");
    }
  }
}

// Checks that the tensor marked by the set_overlap_for_output node `node`
// is returned exactly once.
//
// Starting from input 0 of `node`, uses are followed through
// set_overlap_for_output, ListConstruct and TupleConstruct nodes, and every
// use by a Return node is counted. An error is raised when the count is zero
// or greater than one.
void errorOnDoubleReturnOfOutput(torch::jit::Node *node) {
  logging::LogContext const ctx("check double return of" + nodeToString(node));

  // Nodes which merely pass the value on towards the return.
  const auto forwards_value = [](const torch::jit::Node *user) {
    const auto kind = user->kind();
    return kind == poptorch::symbols::poptorch::set_overlap_for_output ||
           kind == c10::prim::ListConstruct ||
           kind == c10::prim::TupleConstruct;
  };

  uint32_t return_count = 0;

  // Explicit worklist of values whose uses still have to be inspected.
  std::vector<torch::jit::Value *> pending{node->input(0)};
  while (!pending.empty()) {
    torch::jit::Value *current = pending.back();
    pending.pop_back();

    for (const auto &use : current->uses()) {
      if (forwards_value(use.user)) {
        pending.push_back(use.user->output());
      } else if (use.user->kind() == c10::prim::Return) {
        ++return_count;
      }
    }
  }

  ERROR_ON(return_count == 0);

  ERROR_ON_MSG(
      return_count > 1,
      "poptorch.set_overlap_for_output cannot be "
      "used with a tensor that is returned twice. Please check all returned "
      "tensors including those nested in tuples/lists.");
}

// Records the overlap mode of every returned tensor as a string attribute on
// the graph's return node.
//
// The producers of the graph outputs are walked recursively through
// ListConstruct / TupleConstruct nodes. Each leaf gets the attribute
// getOverlapSymbol("output", index), with index counting the leaves in visit
// order: a set_overlap_for_output leaf stores the string constant feeding its
// input 1, any other leaf stores "no_overlap". Handled marker nodes are
// bypassed and queued for destruction.
//
// graph:                      graph to process.
// to_erase_output_and_delete: receives string constant nodes which have no
//                             uses left once the marker dropped them.
// to_delete:                  receives the set_overlap_for_output nodes which
//                             have been handled.
void attributiseOverlappedOutputs(
    torch::jit::Graph *graph,
    std::set<torch::jit::Node *> *to_erase_output_and_delete,
    std::vector<torch::jit::Node *> *to_delete) {
  logging::LogContext const ctx("attributiseOverlappedOutputs");

  // Index of the next leaf output; shared by all recursive calls.
  int64_t output_num = 0;

  std::function<void(torch::jit::Node *)> process_node;
  process_node = [&process_node, graph, &output_num, to_erase_output_and_delete,
                  to_delete](torch::jit::Node *node) {
    // Only used by the two leaf branches below.
    auto overlap_symbol = getOverlapSymbol("output", output_num);
    if (node->kind() == poptorch::symbols::poptorch::set_overlap_for_output) {
      errorOnDoubleReturnOfOutput(node);

      // Input 1 of the marker must be a constant holding the overlap mode
      // string.
      auto *value_node = node->input(1)->node();
      ERROR_ON(value_node->kind() != c10::prim::Constant);
      const auto &value_str = value_node->s(c10::attr::value);
      graph->return_node()->s_(overlap_symbol, value_str);
      to_delete->push_back(node);
      // Drop the marker's use of the constant so the check below sees whether
      // anything else still uses it.
      node->removeInput(1);

      // String constant may be shared
      if (value_node->output()->uses().empty()) {
        to_erase_output_and_delete->insert(value_node);
      }

      // Bypass the marker: its consumers now read the marked tensor directly.
      node->output()->replaceAllUsesWith(node->input(0));
      output_num++;
    } else if (node->kind() == c10::prim::ListConstruct ||
               node->kind() == c10::prim::TupleConstruct) {
      // Containers do not consume an index themselves; recurse into their
      // elements.
      for (auto *input : node->inputs()) {
        process_node(input->node());
      }
    } else {
      // Any other producer: the output is explicitly marked as not
      // overlapped.
      const std::string value_str = "no_overlap";
      graph->return_node()->s_(overlap_symbol, value_str);
      output_num++;
    }
  };

  // Loop over all graph outputs (there may only ever be one, as multiple
  // outputs are returned as a tuple/list)
  for (auto *output : graph->outputs()) {
    process_node(output->node());
  }
}
} // namespace

// Converts the set_overlap_for_input / set_overlap_for_output marker nodes
// of `graph` into attributes and removes them from the graph.
//
// Inputs are processed first, then outputs. Afterwards the string constants
// left without uses are destroyed, followed by the handled marker nodes. Any
// marker node still present at that point was applied to something other
// than a model input / output and is reported as an error.
void attributiseOverlappedIO(torch::jit::Graph *graph) {
  std::set<torch::jit::Node *> orphaned_constants;
  std::vector<torch::jit::Node *> handled_markers;

  attributiseOverlappedInputs(graph, &orphaned_constants, &handled_markers);
  attributiseOverlappedOutputs(graph, &orphaned_constants, &handled_markers);

  for (torch::jit::Node *constant : orphaned_constants) {
    constant->eraseOutput(0);
    constant->destroy();
  }

  for (auto *marker : handled_markers) {
    marker->destroy();
  }

  // Any other use of set_overlap_for_input or set_overlap_for_output is
  // invalid
  for (torch::jit::Node *node : graph->nodes()) {
    ERROR_ON_MSG(node->kind() ==
                     poptorch::symbols::poptorch::set_overlap_for_input,
                 "poptorch.set_overlap_for_input applied on a node which is "
                 "not a tensor input to the model.");

    ERROR_ON_MSG(node->kind() ==
                     poptorch::symbols::poptorch::set_overlap_for_output,
                 "poptorch.set_overlap_for_output applied on a node which is "
                 "not a tensor output to the model.");
  }
}
} // namespace poptorch


================================================
FILE: poptorch/source/PopartCanonicalization.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <torch/csrc/jit/ir/ir.h>

#include <optional>
#include <string>
#include <unordered_map>
#include <unordered_set>

#include "PoptorchSymbols.hpp"
#include "popart_canonicalization/PopartCanonicalizationUtils.hpp"
#include "poptorch/DispatchTracer.hpp"
#include "poptorch/InplaceOps.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace torch {
namespace jit {
bool isInplaceOp(const Node *node);
} // namespace jit
} // namespace torch

namespace poptorch {
namespace {

// Describes one pending replacement recorded while handling in-place slice
// modifications: uses of original_input located after the node producing
// modified_input are to be replaced with modified_input.
struct ReplaceInfo {
  // When true, original_input of this entry may be re-pointed to the
  // modified_input of an earlier entry sharing the same original_input.
  bool allow_original_input_modifications;
  // The value that was sliced (input 0 of the slice node).
  torch::jit::Value *original_input;
  // Output of the dynamic update standing in for the modified tensor.
  torch::jit::Value *modified_input;
};

// In-place modification of slices is a special case. When we
// modify a slice in-place, torch produces a graph like the
// following:
//
//   %x = input, shape = [4, 4]
//   %1 = slice(%x), shape = [2, 2]
//   %2 = add(%1, %1)
//   %3 = slice(%x), shape = [2, 2]
//   %4 = copy_(%3, %2), shape = [2, 2]
//   return %x, shape = [4, 4]
//
// The original input %x is returned because the slice %3 is a
// view on %x, and thus any modifications to %3 are reflected
// in %x. To simulate in-place modification to slices, we return
// a dynamic update instead, so that we can perform the slice
// modification out-of-place, and return the "modified" tensor
// with the correct shape
//
//   %x = input, shape = [4, 4]
//   %1 = slice(%x), shape = [2, 2]
//   %2 = add(%1, %1)
//   %3 = dynamic_update(%x, %2) shape = [4, 4]
//   return %3, shape = [4, 4]
//
// Rewrites an in-place modification of a (possibly nested) slice into a chain
// of dynamic updates.
//
// graph:          graph in which the new nodes are created.
// node:           the original in-place node; its input 0 is the start of the
//                 chain of slices which is followed.
// modified_slice: the out-of-place result which has to be written back.
// replace_infos:  receives one entry per slice in the chain, pairing the
//                 slice input with the dynamic update output replacing it.
//
// Returns the last dynamic update created, or the node producing
// modified_slice when input 0 of `node` is not a slice. Raises an error when
// the chain ends in subsample(slice(x)), i.e. a slice with a step.
torch::jit::Node *
handleSliceModification(torch::jit::Graph *graph, torch::jit::Node *node,
                        torch::jit::Value *modified_slice,
                        std::vector<ReplaceInfo> *replace_infos) {
  torch::jit::Value *input = node->input(0);
  torch::jit::Node *new_node = modified_slice->node();

  // Becomes (and stays) true once a reshape_static_shape has been skipped.
  bool replace_infos_allow_input_modification = false;

  // Follow the chain of slices that are being operated on by the inplace op
  while (input->node()->kind() == symbols::popart::slice ||
         input->node()->kind() == symbols::popart::reshape_static_shape) {

    // skip reshape_static_shape and continue scanning for slice op, example IR
    // handled in this way:
    // %1 = slice(%x)
    // %2 = popart::reshape_static_shape(%1)
    // %3 = slice(%2)
    // %4 = popart::reshape_static_shape(%3)
    // in addition, in such case original_input in replace_infos is allowed to
    // be modified during replace_infos processing
    if (input->node()->kind() == symbols::popart::reshape_static_shape) {
      input = input->node()->input(0);
      replace_infos_allow_input_modification = true;
      continue;
    }
    auto *slice = input->node();
    auto *slice_input = slice->input(0);

    // Record the indices that we sliced: We need these for DynamicUpdate
    std::vector<int64_t> slice_starts = slice->is(c10::Symbol::attr("starts"));
    std::vector<int64_t> slice_ends = slice->is(c10::Symbol::attr("ends"));
    const std::vector<int64_t> slice_dims =
        slice->is(c10::Symbol::attr("axes"));

    // The slice starts become the offset input of the dynamic update.
    auto *slice_offset =
        createConstantInt(graph, slice_starts,
                          {static_cast<int64_t>(slice_starts.size())})
            ->output();

    // Size of the slice along each sliced axis: ends - starts.
    std::vector<int64_t> sizes(slice_starts.size());
    std::transform(std::begin(slice_ends), std::end(slice_ends),
                   std::begin(slice_starts), std::begin(sizes),
                   std::minus<int64_t>());

    auto *dynamic_update =
        createDynamicupdate(graph, {slice_input, slice_offset, modified_slice},
                            slice_dims, sizes, /* noOverlap = */ 1);

    // Save the slice input and the result of the dynamic update
    // (i.e. the modified tensor) so that we can replace the original
    // inputs after PopART canonicalisation has taken place
    auto *modified_input = dynamic_update->output();
    replace_infos->push_back(
        {replace_infos_allow_input_modification, slice_input, modified_input});
    new_node = dynamic_update;

    // Repeat this process for the entire chain of slices - the
    // reconstructed modified input is used to reconstruct the next
    // modified slice input
    input = slice_input;
    modified_slice = modified_input;
  }
  // Dynamic update does not support step size. Slicing with step size is
  // implemented using subsample(slice(x))
  if (input->node()->kind() == symbols::popart::subsample) {
    auto *subsample = input->node();
    if (subsample->input(0)->node()->kind() == symbols::popart::slice) {
      ERROR("In-place modification of slices with step size other than 1 is "
            "not supported.");
    }
  }
  return new_node;
}

// Propagates half types across lists (tuple set to false) or tuples (tuple set
// to true).
// If the new node is a List/TupleConstruct, it will not, by default, have the
// types set to half when they should be, because tracing is always performed
// with floats. Use this function to remedy that on a List/Tuple construct
// after it has been created.
// Marks the outputs of the unpack nodes consuming the List/TupleConstruct
// `n` as half wherever the matching construct input is half.
//
// n:     candidate node; nothing happens unless it is a TupleConstruct
//        (tuple == true) or a ListConstruct (tuple == false).
// tuple: selects the tuple or the list flavour of construct / unpack.
void propagateHalfOnListOrTupleConstruct(torch::jit::Node *n, bool tuple) {
  const auto construct_kind =
      tuple ? at::prim::TupleConstruct : at::prim::ListConstruct;
  const auto unpack_kind = tuple ? at::prim::TupleUnpack : at::prim::ListUnpack;

  if (n->kind() != construct_kind) {
    return;
  }

  // One flag per construct input: true when the input is a tensor with a
  // known scalar type which is half. The inputs would not have been half
  // during tracing but may have been changed during canonicalization.
  std::vector<bool> half_inputs;
  half_inputs.reserve(n->inputs().size());
  for (auto *operand : n->inputs()) {
    const auto as_tensor = operand->type()->cast<c10::TensorType>();
    const bool has_dtype = as_tensor && as_tensor->scalarType();
    half_inputs.push_back(has_dtype && getNodeScalarType(operand) ==
                                           at::ScalarType::Half);
  }

  // Apply the flags to every matching unpack node using the construct.
  for (const auto &use : n->output()->uses()) {
    torch::jit::Node *consumer = use.user;
    if (consumer->kind() != unpack_kind) {
      continue;
    }

    const auto unpacked = consumer->outputs();
    for (size_t idx = 0; idx < unpacked.size(); ++idx) {
      if (!half_inputs[idx]) {
        continue;
      }
      // The output is still float because tracing was carried out with
      // floats: switch its scalar type to half.
      auto *unpacked_value = unpacked[idx];
      const auto float_type =
          unpacked_value->type()->expect<c10::TensorType>();
      unpacked_value->setType(
          float_type->withScalarType(c10::ScalarType::Half));
    }
  }
}

// Holder for the PopART canonicalisation pass. It has no state: the pass is
// the single static member function run().
class CanonicalizeImpl {
public:
  // Runs the canonicalisation pass over `graph`, modifying it in place.
  static void run(torch::jit::Graph *graph);
};

/*
 * ConvertAtenToPopart implementation.
 */

// Canonicalises `graph` in three steps:
//  1. Every node which has a registered handler (getHandler) is handed to
//     that handler; the uses of the original outputs are redirected to the
//     node the handler returns and the original node is marked for deletion.
//     In-place ops (and ops flagged "was_inplace_on_view") additionally go
//     through handleSliceModification.
//  2. The slice inputs recorded in step 1 are replaced by their modified
//     counterparts.
//  3. All nodes marked for deletion are handed to searchAndPossiblyDestroy.
void CanonicalizeImpl::run(torch::jit::Graph *graph) {
  logging::LogContext const ctx_func("PopartCanonicalization");
  std::vector<ReplaceInfo> replace_infos;
  for (torch::jit::Node *node : graph->nodes()) {
    logging::LogContext const ctx("processing " + nodeToString(node));
    const WithNodeMetadata metadata(node);
    // New nodes created by the handler are inserted relative to `node`.
    torch::jit::WithInsertPoint const insert_point(node);
    torch::jit::Node *new_node = nullptr;
    torch::jit::Symbol const kind = node->kind();

    if (const SymbolHandler handler = getHandler(kind)) {
      new_node = handler(graph, node);

      const bool was_inplace_op_on_view =
          node->hasAttributeS("was_inplace_on_view") &&
          node->i(c10::Symbol::attr("was_inplace_on_view")) == 1;

      // NOTE(review): new_node is dereferenced here without a null check,
      // which assumes handlers never return nullptr for in-place ops -
      // confirm.
      if (was_inplace_op_on_view || torch::jit::isInplaceOp(node)) {
        new_node = handleSliceModification(graph, node, new_node->output(),
                                           &replace_infos);
      }
    }

    // If we have a new node add it and replace the old use.
    if (new_node != nullptr) {
      // Mark this node for deletion.
      markNodeForDeletion(node);

      if (node->hasUses()) {
        for (std::uint64_t i = 0; i < node->outputs().size(); ++i) {
          // The replacement may have fewer outputs than the original: that is
          // only acceptable for outputs nobody uses.
          if (i >= new_node->outputs().size()) {
            ERROR_ON_MSG(
                node->output(i)->hasUses(),
                "The canonicalised JIT node has fewer outputs than the "
                "dispatch function. This is only an issue because these "
                "outputs are used.");
            continue;
          }

          // As well as replacing the use, this will copy across shape/type
          // if not explicitly set.
          replaceOutputUse(node, new_node, i);
        }

        // Propagate half types across ListConstructs and TupleConstructs
        propagateHalfOnListOrTupleConstruct(new_node, true);
        propagateHalfOnListOrTupleConstruct(new_node, false);
      }
    }
  }

  // Replace slice inputs with their modified counterparts
  for (auto curr_info_iter = replace_infos.begin();
       curr_info_iter != replace_infos.end(); ++curr_info_iter) {

    // Only uses located after the node producing the modified value are
    // redirected.
    curr_info_iter->original_input->replaceAllUsesAfterNodeWith(
        curr_info_iter->modified_input->node(), curr_info_iter->modified_input);

    for (auto next_info_iter = curr_info_iter + 1;
         next_info_iter != replace_infos.end(); ++next_info_iter) {
      // if original input modification is allowed this code will modify
      // subsequent replace infos if original inputs are the same.
      //
      // example:
      //     replace_infos[0] = {x, // don't care
      //                         original_input = %1,
      //                         modified_input = %2}
      //     replace_infos[1] = {false,
      //                         original_input = %1,
      //                         modified_input = %3}
      //     replace_infos[2] = {true,
      //                         original_input = %1,
      //                         modified_input = %4}
      //
      // >>>> will modify replace info struct at index 2:
      //
      //     replace_infos[1] = {false,
      //                         original_input = %1,
      //                         modified_input = %3}
      //     replace_infos[2] = {true,
      //                         original_input = %2, <-- was %1
      //                         modified_input = %4}
      if (next_info_iter->allow_original_input_modifications &&
          curr_info_iter->original_input == next_info_iter->original_input) {
        next_info_iter->original_input = curr_info_iter->modified_input;
      }
    }
  }

  // Build a list of nodes marked for deletion.
  std::unordered_set<torch::jit::Node *> to_delete;
  for (torch::jit::Node *node : graph->nodes()) {
    if (isMarkedForDeletion(node)) {
      to_delete.insert(node);
    }
  }

  // Remove the dead nodes.
  searchAndPossiblyDestroy(to_delete);
}

} // namespace

// Public entry point of this pass: runs CanonicalizeImpl over `graph`.
void canonicalize(torch::jit::Graph *graph) {
  // The implementation object is only needed for the duration of the call.
  CanonicalizeImpl().run(graph);
}
} // namespace poptorch


================================================
FILE: poptorch/source/PopartLateCanonicalization.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/ir.h>

#include <functional>
#include <queue>

#include "PoptorchSymbols.hpp"
#include "popart_canonicalization/PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {

using FunctionTy = std::function<void()>;

// Expands a single-element option vector so that it holds `num_convs` copies
// of that element. Vectors that are empty or already hold several entries are
// left untouched, as is any vector when `num_convs` is 1.
template <typename T> void broadcast(std::vector<T> &option, size_t num_convs) {
  const bool is_scalar_option = option.size() == 1;
  if (is_scalar_option && num_convs != 1) {
    // Take a copy of the value so the fill does not refer to storage that the
    // insertion may reallocate.
    const T fill_value = option.front();
    option.insert(option.end(), num_convs - 1, fill_value);
  }
}

// Collects the nodes making up poptorch multi-conv regions while the graph is
// being walked (see canonicalizeLate) and rewrites them afterwards.
//
// Usage, as driven by canonicalizeLate:
//   - begin(node) on a begin_multi_conv node,
//   - part(node) for every conv node seen while inMultiConv() is true,
//   - end(node) on the end_multi_conv node; the returned callback performs the
//     actual rewiring and must be invoked later, in the same order as the
//     end() calls (each callback consumes the front of an internal queue),
//   - cleanup() once every callback has run.
//
// The callbacks capture `this`, so the handler must outlive them.
class MultiConvHandler {
public:
  explicit MultiConvHandler(torch::jit::Graph *g) : _graph(g) {}

  // True between a begin() call and the matching end() call.
  bool inMultiConv() const { return _in_multi_conv; }

  // Opens a multi-conv region. Raises an error if a region is already open.
  // The begin node itself is marked for deletion.
  void begin(torch::jit::Node *node) {
    ERROR_ON_MSG(inMultiConv(), "Nested poptorch.MultiConv is not supported.");
    _in_multi_conv = true;
    _to_delete.insert(node);
  }

  // Records `node` as one of the convolutions of the currently open region.
  void part(torch::jit::Node *node) { _parts.push_back(node); }

  // Closes the current region. Raises an error if no region is open or if no
  // convolution was recorded. The options passed as inputs to `node` are folded
  // into attributes immediately; the returned callback links the recorded
  // convolutions to `node` when invoked.
  FunctionTy end(torch::jit::Node *node) {
    ERROR_ON_MSG(!inMultiConv() || _parts.empty(),
                 "Unexpected end_multi_conv, is the IR malformed?");
    _in_multi_conv = false;
    applyOptions(node);
    // Hand the parts of this region over to the queue so that a following
    // region can start collecting its own parts.
    _parts_queue.push(_parts);
    _parts.clear();
    return [this, node]() { applyPartLinks(node); };
  }

  // Destroys (where possible) every node that was marked for deletion.
  void cleanup() { searchAndPossiblyDestroy(_to_delete); }

private:
  // Converts the six option inputs of `end_node` into node attributes and then
  // removes all of its inputs. Inputs whose producer is None are skipped, so
  // the corresponding attribute is simply not set. List-valued options with a
  // single entry are broadcast to one entry per recorded convolution.
  void applyOptions(torch::jit::Node *end_node) {
    // Fold any supplied options as attributes of the end_node.
    // Mark all options for deletion when we cleanup the IR.
    // available_memory_proportions
    torch::jit::Node *available_mem_props = end_node->input(0)->node();
    _to_delete.insert(available_mem_props);
    if (!isNone(available_mem_props)) {
      std::vector<double> vals =
          constantListToVec<double>(available_mem_props, constantToFloat);
      broadcast(vals, _parts.size());
      end_node->fs_(c10::Symbol::attr("available_memory_proportions"), vals);
    }

    // partials_types
    torch::jit::Node *partials_types = end_node->input(1)->node();
    _to_delete.insert(partials_types);
    if (!isNone(partials_types)) {
      std::vector<int64_t> vals = constantToLongVec(partials_types);
      broadcast(vals, _parts.size());
      end_node->is_(c10::Symbol::attr("partials_types"), vals);
    }

    // plan_type
    torch::jit::Node *plan_type = end_node->input(2)->node();
    _to_delete.insert(plan_type);
    if (!isNone(plan_type)) {
      end_node->i_(c10::Symbol::attr("plan_type"), constantToLong(plan_type));
    }

    // per_conv_reserved_tiles
    torch::jit::Node *reserved_tiles = end_node->input(3)->node();
    _to_delete.insert(reserved_tiles);
    if (!isNone(reserved_tiles)) {
      end_node->i_(c10::Symbol::attr("per_conv_reserved_tiles"),
                   constantToLong(reserved_tiles));
    }

    // cycle_back_off
    torch::jit::Node *back_off = end_node->input(4)->node();
    _to_delete.insert(back_off);
    if (!isNone(back_off)) {
      end_node->f_(c10::Symbol::attr("cycle_back_off"),
                   constantToFloat(back_off));
    }

    // enable_conv_dithering
    torch::jit::Node *enable_conv_ditherings = end_node->input(5)->node();
    _to_delete.insert(enable_conv_ditherings);
    if (!isNone(enable_conv_ditherings)) {
      std::vector<int64_t> vals = constantToLongVec(enable_conv_ditherings);
      broadcast(vals, _parts.size());
      end_node->is_(c10::Symbol::attr("enable_conv_ditherings"), vals);
    }

    // Clear all the options from the end node inputs as they are now
    // incorporated as node attributes
    end_node->removeAllInputs();
  }

  // Rewires the oldest pending region (front of _parts_queue) so that every
  // recorded convolution is replaced by a multi_conv_part feeding `end_node`,
  // then makes sure `end_node` sits before the first user of its outputs.
  void applyPartLinks(torch::jit::Node *end_node) {
    // Swaps out conv nodes with multi_conv_part which are then linked to the
    // end_node.  Each conv output flows through the end_multi_conv instruction.
    uint64_t num_outputs = 0;

    // Track the earliest user for the multiconv outputs
    torch::jit::Node *earliest_user = nullptr;

    for (torch::jit::Node *node : _parts_queue.front()) {
      // Create the multi_conv_part node and insert it after the original conv
      WithNodeMetadata meta(node);
      torch::jit::Node *conv_part = createMultiConvPart(_graph, node);
      conv_part->moveAfter(node);
      // The original conv is superseded by conv_part.
      _to_delete.insert(node);

      // Attach the multi_conv_part to the end_multi_conv instruction.
      end_node->addInput(conv_part->output());
      torch::jit::Value *output_i = end_node->addOutput();
      output_i->setType(conv_part->output()->type());
      // output_i is the output at index num_outputs: users of the original
      // conv are redirected to it.
      replaceOutputUse(node->output(), end_node->output(num_outputs));

      // Keep track of the first node that consumes the multiconv outputs
      // NOTE(review): what findEarliestUser returns for an output without any
      // user is not visible here; if it can be nullptr the isBefore() call
      // below would receive a null node - confirm.
      torch::jit::Node *output_user = findEarliestUser(output_i);

      if ((earliest_user == nullptr) || earliest_user->isAfter(output_user)) {
        earliest_user = output_user;
      }

      num_outputs++;
    }

    // This region has been processed; the next callback will see the next one.
    _parts_queue.pop();

    if (end_node->isBefore(earliest_user)) {
      // All good, nothing further to do here
      return;
    }

    // Move the end_multi_conv instruction directly before its first consumer
    // and check for any dependency violations that might have been introduced.
    end_node->moveBefore(earliest_user);
    // Worklist of nodes that have been moved and whose producers must be
    // re-checked.
    torch::jit::node_list checklist{end_node};

    while (!checklist.empty()) {
      torch::jit::Node *consumer = checklist.back();
      checklist.pop_back();

      for (torch::jit::Value *value : consumer->inputs()) {
        torch::jit::Node *producer = value->node();

        // Fix any topological ordering violations and check any moved nodes
        if (producer->isAfter(consumer)) {
          producer->moveBefore(consumer);
          checklist.push_back(producer);
        }
      }
    }
  }

  // Graph in which the multi_conv_part nodes are created.
  torch::jit::Graph *_graph;
  // Nodes to hand over to searchAndPossiblyDestroy() in cleanup().
  std::unordered_set<torch::jit::Node *> _to_delete;
  // Convolutions recorded for the region currently being collected.
  torch::jit::node_list _parts;
  // Parts of the closed regions whose callback has not run yet (FIFO).
  std::queue<torch::jit::node_list> _parts_queue;
  // Whether a region is currently open.
  bool _in_multi_conv = false;
};

// Returns a callback which, when invoked, makes sure that the
// set_matmul_serialization node `node` directly consumes the output of a
// matmul. If a reshape_static_shape sits between the matmul and `node`, the
// two are swapped so that the reshape is applied after the serialization
// node. Any other producer raises an error.
//
// The work is deferred to a callback because it moves nodes around, which the
// caller must not do while iterating over the graph.
FunctionTy reorderMatmulSeralisationIfRequired(torch::jit::Node *node) {
  return [node]() {
    ERROR_ON(node->kind() != symbols::poptorch::set_matmul_serialization);

    // Producer of the serialization node's input: either the matmul itself or
    // a reshape of the matmul output.
    auto *reshape_node = node->input()->node();

    // If the input is a matmul, no reordering is necessary
    if (reshape_node->kind() == symbols::popart::matmul) {
      return;
    }

    ERROR_ON(reshape_node->kind() != symbols::popart::reshape_static_shape);
    ERROR_ON(reshape_node->input()->node()->kind() != symbols::popart::matmul);

    // Change matmul -> reshape -> set_matmul_serialization
    // to matmul -> set_matmul_serialization -> reshape
    node->moveBefore(reshape_node);

    node->replaceInput(0, reshape_node->input());
    // matmul -> reshape
    //  \-> set_matmul_serialization -> ...

    // Users of the serialization node now read the reshape output instead.
    node->output()->replaceAllUsesWith(reshape_node->output());
    // matmul -> reshape  -> ...
    //  \-> set_matmul_serialization

    // Must come after replaceAllUsesWith above: otherwise the reshape would be
    // a user of node->output() and be redirected to its own output.
    reshape_node->replaceInput(0, node->output());
    // matmul -> set_matmul_serialization -> reshape -> ...
  };
}

// Late canonicalization pass over `graph`.
//
// The graph is walked once to find the nodes which need patching:
//   - multi-conv regions (begin_multi_conv / conv / end_multi_conv),
//   - push_name_scope, set_attribute and clear_attribute nodes, whose constant
//     string inputs are converted into node attributes,
//   - set_matmul_serialization and set_available_memory nodes, which may have
//     to be moved.
// Anything that would move or destroy nodes is recorded as a callback and only
// executed after the walk, to preserve the iterators.
void canonicalizeLate(torch::jit::Graph *graph) {
  logging::LogContext ctx_func("canonicalizeLate");

  std::vector<FunctionTy> callbacks;
  MultiConvHandler multi_conv_handler(graph);

  // Schedules the removal of the inputs of a node once they have been
  // converted to attributes.
  const auto drop_inputs_later = [&callbacks](torch::jit::Node *target) {
    callbacks.emplace_back(
        [target]() { removeAndPossiblyDestroyAllInputs(target); });
  };

  // First phase: find the nodes of interest.
  for (torch::jit::Node *node : graph->nodes()) {
    logging::LogContext ctx("Processing " + nodeToString(node));
    const torch::jit::Symbol kind = node->kind();

    if (kind == symbols::poptorch::begin_multi_conv) {
      multi_conv_handler.begin(node);
    } else if (kind == symbols::popart::conv &&
               multi_conv_handler.inMultiConv()) {
      // Only convolutions inside a multi-conv region are of interest.
      multi_conv_handler.part(node);
    } else if (kind == symbols::poptorch::end_multi_conv) {
      callbacks.emplace_back(multi_conv_handler.end(node));
    } else if (kind == symbols::poptorch::push_name_scope) {
      const std::string scope_name = constantToString(node->input(0)->node());
      node->s_(c10::Symbol::attr("name"), scope_name);
      drop_inputs_later(node);
    } else if (kind == symbols::poptorch::set_attribute) {
      // A node without inputs has nothing to convert.
      if (!node->inputs().empty()) {
        const std::string attribute = constantToString(node->input(0)->node());
        const std::string key = constantToString(node->input(1)->node());
        const std::string value = constantToString(node->input(2)->node());

        node->s_(c10::Symbol::attr("attribute"), attribute);
        node->s_(c10::Symbol::attr("key"), key);
        node->s_(c10::Symbol::attr("value"), value);
        drop_inputs_later(node);
      }
    } else if (kind == symbols::poptorch::clear_attribute) {
      // A node without inputs has nothing to convert.
      if (!node->inputs().empty()) {
        const std::string attribute = constantToString(node->input(0)->node());
        const std::string key = constantToString(node->input(1)->node());

        node->s_(c10::Symbol::attr("attribute"), attribute);
        node->s_(c10::Symbol::attr("key"), key);
        drop_inputs_later(node);
      }
    } else if (kind == symbols::poptorch::set_matmul_serialization) {
      callbacks.emplace_back(reorderMatmulSeralisationIfRequired(node));
    } else if (kind == symbols::poptorch::set_available_memory) {
      callbacks.emplace_back(
          [node]() { moveSetAvailableMemoryIfRequired(node); });
    }
  }

  // Second phase: apply the recorded patch-ups, in the order they were found.
  for (const FunctionTy &patch : callbacks) {
    patch();
  }

  // Finally delete the nodes made redundant by the multi-conv rewrites.
  multi_conv_handler.cleanup();
}

} // namespace poptorch


================================================
FILE: poptorch/source/PoplarExecutable.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

#include <ATen/ATen.h>

#include <iostream>
#include <sstream>
#include <string>

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/InplaceOps.hpp"
#include "poptorch/PoplarExecutable.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {

// Forwards the given optimizers to the underlying compiler.
void PoplarExecutable::updateOptimizers(
    const std::vector<popart_compiler::Optimizer> &optimizers) {
  _compiler.updateOptimizers(optimizers);
}

// Runs the compiled graph once.
//
// inTensors: one tensor per entry of `_popart_inputs`, in the same order. Each
//            tensor must be contiguous. Long inputs are converted to Int and
//            Double / BFloat16 inputs to Float before being handed over (CPU
//            tensors only); the converted copies are kept in
//            `_converted_inputs`. Inputs which the in-place info maps to an
//            output are overwritten with that output after the run.
//
// Returns the first `_inplace_info.num_tensor_outputs` outputs, each as a
// freshly allocated contiguous tensor.
//
// Raises an error for non-contiguous inputs, missing data pointers,
// unsupported input types, failed shape inference and unexpected output types.
// `inTensors.at()` throws if fewer tensors than inputs are provided.
std::vector<at::IValue>
PoplarExecutable::run(std::vector<at::Tensor> &inTensors) {
  // Set up the input tensors in the poplar graph to point to the incoming
  // pytorch tensors.
  for (std::size_t i = 0; i < _popart_inputs.size(); ++i) {
    popart_compiler::TensorId const popart_id = _popart_inputs[i];
    const at::Tensor &pytorch_tensor = inTensors.at(i);

    ERROR_ON(!pytorch_tensor.is_contiguous());

    // Copy the shape into the container type expected by the compiler.
    const auto sizes = pytorch_tensor.sizes();
    std::vector<std::int64_t> popart_dims(sizes.begin(), sizes.end());

    // Handle input based on the PyTorch input type
    at::ScalarType const elem_type = pytorch_tensor.scalar_type();

    // CPU tensors expose their storage directly; for any other tensor the
    // data is looked up through getDataSource().
    void *data_ptr = nullptr;
    if (pytorch_tensor.is_cpu()) {
      data_ptr = pytorch_tensor.data_ptr();
    } else {
      data_ptr = getDataSource(pytorch_tensor);
    }
    ERROR_ON(data_ptr == nullptr);

    switch (elem_type) {
    case at::ScalarType::Byte:
      _compiler.setUpInputOp(popart_id, static_cast<std::uint8_t *>(data_ptr),
                             popart_dims);
      break;
    case at::ScalarType::Char:
      _compiler.setUpInputOp(popart_id, static_cast<std::int8_t *>(data_ptr),
                             popart_dims);
      break;
    case at::ScalarType::Float:
      _compiler.setUpInputOp(popart_id, static_cast<float *>(data_ptr),
                             popart_dims);
      break;
    case at::ScalarType::Half:
      // Half data is passed as 16-bit integers; the extra `true` argument
      // presumably flags the buffer as float16 - confirm against
      // Compiler::setUpInputOp.
      _compiler.setUpInputOp(popart_id, static_cast<std::int16_t *>(data_ptr),
                             popart_dims, true);
      break;
    case at::ScalarType::Short:
      _compiler.setUpInputOp(popart_id, static_cast<std::int16_t *>(data_ptr),
                             popart_dims);
      break;
    case at::ScalarType::Int:
      _compiler.setUpInputOp(popart_id, static_cast<std::int32_t *>(data_ptr),
                             popart_dims);
      break;
    case at::ScalarType::Bool:
      _compiler.setUpInputOp(popart_id, static_cast<bool *>(data_ptr),
                             popart_dims);
      break;
    case at::ScalarType::Long:
      // If it's an IPU tensor then it should have been handled by the
      // dispatcher.
      ERROR_ON_MSG(!pytorch_tensor.is_cpu(), "Only supported for CPU tensors");
      // The converted copy is stored in a member so that its storage stays
      // alive while the compiler reads from it.
      _converted_inputs[i] = pytorch_tensor.toType(at::ScalarType::Int);
      _compiler.setUpInputOp(
          popart_id,
          static_cast<std::int32_t *>(_converted_inputs[i].data_ptr()),
          popart_dims);
      break;
    case at::ScalarType::Double:
    case at::ScalarType::BFloat16:
      // If it's an IPU tensor then it should have been handled by the
      // dispatcher.
      ERROR_ON_MSG(!pytorch_tensor.is_cpu(), "Only supported for CPU tensors");
      _converted_inputs[i] = pytorch_tensor.toType(at::ScalarType::Float);
      _compiler.setUpInputOp(
          popart_id, static_cast<float *>(_converted_inputs[i].data_ptr()),
          popart_dims);
      break;
    default:
      ERROR("Unsupported input type torch." << c10::toString(elem_type));
    }
  }

  // Temp buffers for the output state.
  std::vector<at::IValue> returnees;
  returnees.reserve(_popart_outputs.size());

  // Set up the outputs.
  for (size_t i = 0; i < _popart_outputs.size(); i++) {
    const popart_compiler::TensorId &popart_id(_popart_outputs[i]);
    auto dims = _compiler.getSize(popart_id);
    ERROR_ON_MSG(dims == popart_compiler::Compiler::invalid_size,
                 "Shape inference failed");

    std::uint64_t const b_dim = _compiler.popartBatchDimForAnchor(popart_id);
    if (b_dim > 1) {
      // Treat scalars as 1D tensors if necessary for batching.
      if (dims.empty()) {
        dims.push_back(1);
      }
      // Adjust by the popart batch dim, accounting for the anchor.
      dims[0] *= b_dim;
    }

    // Create the torch tensor and use its memory for the popart tensor.
    at::ScalarType const type = _popart_output_types[i];
    returnees.emplace_back(at::empty(
        {dims}, at::dtype(type).memory_format(c10::MemoryFormat::Contiguous)));

    auto *data_ptr = returnees.back().toTensor().data_ptr();

    switch (type) {
    case at::ScalarType::Byte:
      _compiler.setUpOutputOp(popart_id, static_cast<std::uint8_t *>(data_ptr),
                              dims);
      break;
    case at::ScalarType::Char:
      _compiler.setUpOutputOp(popart_id, static_cast<std::int8_t *>(data_ptr),
                              dims);
      break;
    case at::ScalarType::Float:
      _compiler.setUpOutputOp(popart_id, static_cast<float *>(data_ptr), dims);
      break;
    case at::ScalarType::Half:
    case at::ScalarType::Short:
      // Both 16-bit types are written through a 16-bit integer pointer.
      _compiler.setUpOutputOp(popart_id, static_cast<std::int16_t *>(data_ptr),
                              dims);
      break;
    case at::ScalarType::Int:
      _compiler.setUpOutputOp(popart_id, static_cast<std::int32_t *>(data_ptr),
                              dims);
      break;
    case at::ScalarType::Bool:
      _compiler.setUpOutputOp(popart_id, static_cast<bool *>(data_ptr), dims);
      break;
    default:
      ERROR("Unexpected type returned from popart");
    }
  }

  // Execute the compiled poplar graph.
  _compiler.run();

  // Write the outputs which correspond to in-place modified inputs back into
  // those inputs.
  const auto &mapping = _inplace_info.input_output_mapping;
  for (size_t i = 0; i < mapping.size(); i++) {
    if (mapping[i] == InplaceGraphInfo::no_mapping) {
      continue;
    }
    auto out_tensor = returnees.at(mapping[i]).toTensor();
    inTensors.at(i).copy_(out_tensor, false);
  }

  // Only the first num_tensor_outputs entries are returned to the caller.
  returnees.resize(_inplace_info.num_tensor_outputs);

  return returnees;
}

// Attaches to the device first if that has not happened yet, then asks the
// compiler to load the engine and connect the streams.
void PoplarExecutable::loadEngineAndConnectStreams() {
  const bool already_attached = _compiler.isAttachedToDevice();
  if (!already_attached) {
    _compiler.attachToDevice();
  }

  _compiler.loadEngineAndConnectStreams();
}

// Tell popart to copy weights off the IPU and write them into host memory.
// `buffers` maps parameter names to host buffers; the buffers are passed on in
// the order of `_parameter_names`. `buffers.at()` throws if a parameter has no
// entry.
void PoplarExecutable::copyWeightsToHost(
    const std::map<std::string, void *> &buffers) {
  std::vector<void *> host_buffers;
  host_buffers.reserve(_parameter_names.size());

  for (auto name_it = _parameter_names.begin();
       name_it != _parameter_names.end(); ++name_it) {
    host_buffers.push_back(buffers.at(*name_it));
  }

  _compiler.copyWeightsToHost(host_buffers);
}

// Tell popart to copy weights from host memory into IPU memory.
// `buffers` maps parameter names to host buffers; the buffers are passed on in
// the order of `_parameter_names`. `buffers.at()` throws if a parameter has no
// entry.
void PoplarExecutable::copyWeightsToDevice(
    const std::map<std::string, void *> &buffers) {
  std::vector<void *> host_buffers;
  host_buffers.reserve(_parameter_names.size());

  for (auto name_it = _parameter_names.begin();
       name_it != _parameter_names.end(); ++name_it) {
    host_buffers.push_back(buffers.at(*name_it));
  }

  _compiler.copyWeightsToDevice(host_buffers);
}

// Tell popart to copy named buffers from host memory into IPU memory.
// Every buffer in `buffers` is passed on, in the iteration order of the map
// (i.e. sorted by name).
void PoplarExecutable::copyNamedBuffersToDevice(
    const std::map<std::string, void *> &buffers) {
  std::vector<void *> host_buffers;
  host_buffers.reserve(buffers.size());

  for (auto entry = buffers.cbegin(); entry != buffers.cend(); ++entry) {
    host_buffers.push_back(entry->second);
  }

  _compiler.copyNamedBuffersToDevice(host_buffers);
}

// Returns the output types and shapes as reported by the compiler.
const std::vector<popart_compiler::OutputTypeShape> &
PoplarExecutable::outputTypes() const {
  return _compiler.outputTypes();
}

// Returns a copy of the PopART IR provided by the compiler as a std::string.
std::string PoplarExecutable::getPopartIR() const {
  // Keep the managed buffer alive in a local until the string copy is made.
  auto ir_buffer = _compiler.getPopartIR();

  // Constructing the std::string copies the null-terminated characters.
  return std::string(static_cast<const char *>(ir_buffer.get()));
}

// Returns the tensor names provided by the compiler as a set of std::string.
std::set<std::string> PoplarExecutable::getTensorNames() const {
  std::set<std::string> names;

  for (const auto &managed_id : _compiler.getTensorNames()) {
    // Builds the std::string in place, copying the characters.
    names.emplace(static_cast<const char *>(managed_id.get()));
  }

  return names;
}

// Forwards the detach request to the compiler.
void PoplarExecutable::detachFromDevice() { _compiler.detachFromDevice(); }

// Forwards the attach request to the compiler.
void PoplarExecutable::attachToDevice() { _compiler.attachToDevice(); }

// Returns whether the compiler reports being attached to a device.
bool PoplarExecutable::isAttachedToDevice() const {
  return _compiler.isAttachedToDevice();
}

} // namespace poptorch


================================================
FILE: poptorch/source/PoptorchStaticInit.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef SOURCE_POPTORCH_STATIC_INIT_H
#define SOURCE_POPTORCH_STATIC_INIT_H

// The constants below set priorities for constructor functions used to
// initialize static data. Functions with lower numbers run first.
// They are meant to be used as the argument of
// __attribute__((constructor(...))).
// NOTE(review): the values presumably start at 101 because lower constructor
// priorities are reserved for the implementation - confirm against the
// compiler documentation.

// Priority value for symbol initialisation functions
#define SYMBOL_INIT_PRIORITY 101

// Priority value for shape inference registration functions
// (runs after symbol initialisation)
#define SHAPE_INFERENCE_INIT_PRIORITY 102

// Priority value for handler registration functions
// (runs after symbol initialisation and shape inference registration)
#define HANDLER_INIT_PRIORITY 103

#endif


================================================
FILE: poptorch/source/PoptorchSymbols.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved
#include <spdlog/fmt/fmt.h>
#include <spdlog/fmt/ostr.h>

#include "PoptorchStaticInit.hpp"
#include "PoptorchSymbols.hpp"
#include "poptorch_logging/Logging.hpp"

// Assigns to the variable `Name` (which must be visible at the expansion
// site) the c10::Symbol obtained from the qualified string
// "<Namespace>::<Name>". The caller supplies the trailing semicolon.
#define SYMBOL_INIT(Namespace, Name)                                           \
  Name = c10::Symbol::fromQualString(#Namespace "::" #Name)

// Definition and initialisation of the aten symbol declared in
// PoptorchSymbols.hpp.
namespace c10::aten {

c10::Symbol multilabel_soft_margin_loss; // NOLINT

// Runs as a static constructor with priority SYMBOL_INIT_PRIORITY.
// clang-format off
__attribute__((constructor(SYMBOL_INIT_PRIORITY)))
static void initializeAtenSymbols() {
  // clang-format on
  poptorch::logging::trace("Initializing aten symbols");
  SYMBOL_INIT(aten, multilabel_soft_margin_loss);
}

} // namespace c10::aten

// Definition and initialisation of the torch_scatter symbols declared in
// PoptorchSymbols.hpp.
namespace torch_scatter {

c10::Symbol scatter_max; // NOLINT
c10::Symbol scatter_min; // NOLINT
c10::Symbol scatter_mul; // NOLINT

// Runs as a static constructor with priority SYMBOL_INIT_PRIORITY.
// clang-format off
__attribute__((constructor(SYMBOL_INIT_PRIORITY)))
static void initializeTorchScatterSymbols() {
  // clang-format on
  poptorch::logging::trace("Initializing torch_scatter symbols");
  SYMBOL_INIT(torch_scatter, scatter_max);
  SYMBOL_INIT(torch_scatter, scatter_min);
  SYMBOL_INIT(torch_scatter, scatter_mul);
}

} // namespace torch_scatter

// Definition and initialisation of the torch_cluster symbols declared in
// PoptorchSymbols.hpp.
namespace torch_cluster {

c10::Symbol grid; // NOLINT

// Runs as a static constructor with priority SYMBOL_INIT_PRIORITY.
// The function has internal linkage and is only invoked through the
// constructor attribute, so its name is free to match this namespace.
// clang-format off
__attribute__((constructor(SYMBOL_INIT_PRIORITY)))
static void initializeTorchClusterSymbols() {
  // clang-format on
  poptorch::logging::trace("Initializing torch_cluster symbols");
  SYMBOL_INIT(torch_cluster, grid);
}

} // namespace torch_cluster

// Definition and initialisation of the torch_spline_conv symbols declared in
// PoptorchSymbols.hpp.
namespace torch_spline_conv {

c10::Symbol spline_basis;     // NOLINT
c10::Symbol spline_weighting; // NOLINT

// Runs as a static constructor with priority SYMBOL_INIT_PRIORITY.
// clang-format off
__attribute__((constructor(SYMBOL_INIT_PRIORITY)))
static void initializeTorchSplineConvSymbols() {
  // clang-format on
  poptorch::logging::trace("Initializing torch_spline_conv symbols");
  SYMBOL_INIT(torch_spline_conv, spline_basis);
  SYMBOL_INIT(torch_spline_conv, spline_weighting);
}

} // namespace torch_spline_conv

namespace poptorch {
namespace symbols {

// X-macro: every OP_DECL / OP_DECL_NO_RETURN entry of
// SupportedOperations.inc.hpp expands to the definition of the symbol variable
// `Namespace::FuncName` (declared in PoptorchSymbols.hpp). Only the Namespace
// and FuncName arguments are used here.
#define OP_DECL(Namespace, FuncName, function, OnnxImpl, Args, BodyArgs)       \
  c10::Symbol Namespace::FuncName;

#define OP_DECL_NO_RETURN(Namespace, FuncName, function, OnnxImpl, Args,       \
                          BodyArgs)                                            \
  c10::Symbol Namespace::FuncName;

#include "popart_compiler/SupportedOperations.inc.hpp" // NOLINT

// Undefine the macros so that they can be redefined for the next expansion.
#undef OP_DECL
#undef OP_DECL_NO_RETURN
// Initialises the symbol variable of every operation listed in
// SupportedOperations.inc.hpp with the symbol for "<Namespace>::<FuncName>".
// Runs as a static constructor with priority SYMBOL_INIT_PRIORITY.
// clang-format off
__attribute__((constructor(SYMBOL_INIT_PRIORITY)))
static void initializeSupportedOperations() {
  // clang-format on
  logging::trace("Initializing supported operations");

// X-macro: each entry of the included file expands to one assignment.
#define OP_DECL(Namespace, FuncName, function, OnnxImpl, Args, BodyArgs)       \
  Namespace::FuncName =                                                        \
      c10::Symbol::fromQualString(#Namespace "::" #FuncName); // NOLINT

#define OP_DECL_NO_RETURN(Namespace, FuncName, function, OnnxImpl, Args,       \
                          BodyArgs)                                            \
  Namespace::FuncName =                                                        \
      c10::Symbol::fromQualString(#Namespace "::" #FuncName); // NOLINT

#include "popart_compiler/SupportedOperations.inc.hpp" // NOLINT

#undef OP_DECL
#undef OP_DECL_NO_RETURN
}

// Definitions of the poptorch-specific symbols declared in
// PoptorchSymbols.hpp (namespace poptorch::symbols::poptorch).
namespace poptorch {

c10::Symbol nop;
c10::Symbol dynamic_slice;
c10::Symbol dynamic_update;
c10::Symbol begin_ipu_block;
c10::Symbol internal_cast;
c10::Symbol end_ipu_block;
c10::Symbol identity_loss;
c10::Symbol set_available_memory;
c10::Symbol set_matmul_serialization;
c10::Symbol set_overlap_for_input;
c10::Symbol set_overlap_for_output;
c10::Symbol optimizer_group;
c10::Symbol begin_multi_conv;
c10::Symbol multi_conv_part;
c10::Symbol end_multi_conv;

c10::Symbol update_param_inplace;

c10::Symbol host_side_cast;

c10::Symbol start_for_loop;
c10::Symbol end_for_loop;

c10::Symbol start_if_block;
c10::Symbol start_else_block;
c10::Symbol end_if_block;

c10::Symbol push_name_scope;
c10::Symbol pop_name_scope;
c10::Symbol add_untyped_input_tensor;

c10::Symbol host_and_ipu_side_tensor_constant;

c10::Symbol call_cpu_op;
c10::Symbol end_cpu_op;

c10::Symbol canonicalised_cpu_call;
c10::Symbol ctc_beam_search_decoder;

c10::Symbol set_attribute;
c10::Symbol clear_attribute;

c10::Symbol fps;
c10::Symbol nearest;
c10::Symbol nearest_batch_list;

// Initialises every symbol defined above with the symbol for
// "poptorch::<name>". Runs as a static constructor with priority
// SYMBOL_INIT_PRIORITY. A symbol added above also needs a SYMBOL_INIT line
// here, otherwise it keeps its default-constructed value.
// clang-format off
__attribute__((constructor(SYMBOL_INIT_PRIORITY)))
static void initializePoptorchSymbols() {
  // clang-format on
  logging::trace("Initializing poptorch symbols");
  SYMBOL_INIT(poptorch, nop);
  SYMBOL_INIT(poptorch, dynamic_slice);
  SYMBOL_INIT(poptorch, dynamic_update);
  SYMBOL_INIT(poptorch, begin_ipu_block);
  SYMBOL_INIT(poptorch, internal_cast);
  SYMBOL_INIT(poptorch, end_ipu_block);
  SYMBOL_INIT(poptorch, identity_loss);
  SYMBOL_INIT(poptorch, set_available_memory);
  SYMBOL_INIT(poptorch, set_matmul_serialization);
  SYMBOL_INIT(poptorch, set_overlap_for_input);
  SYMBOL_INIT(poptorch, set_overlap_for_output);
  SYMBOL_INIT(poptorch, optimizer_group);
  SYMBOL_INIT(poptorch, begin_multi_conv);
  SYMBOL_INIT(poptorch, multi_conv_part);
  SYMBOL_INIT(poptorch, end_multi_conv);
  SYMBOL_INIT(poptorch, host_side_cast);

  SYMBOL_INIT(poptorch, update_param_inplace);

  SYMBOL_INIT(poptorch, start_for_loop);
  SYMBOL_INIT(poptorch, end_for_loop);

  SYMBOL_INIT(poptorch, start_if_block);
  SYMBOL_INIT(poptorch, start_else_block);
  SYMBOL_INIT(poptorch, end_if_block);

  SYMBOL_INIT(poptorch, push_name_scope);
  SYMBOL_INIT(poptorch, pop_name_scope);
  SYMBOL_INIT(poptorch, add_untyped_input_tensor);

  SYMBOL_INIT(poptorch, host_and_ipu_side_tensor_constant);

  SYMBOL_INIT(poptorch, call_cpu_op);
  SYMBOL_INIT(poptorch, end_cpu_op);

  SYMBOL_INIT(poptorch, canonicalised_cpu_call);
  SYMBOL_INIT(poptorch, ctc_beam_search_decoder);

  SYMBOL_INIT(poptorch, set_attribute);
  SYMBOL_INIT(poptorch, clear_attribute);

  SYMBOL_INIT(poptorch, fps);
  SYMBOL_INIT(poptorch, nearest);
  SYMBOL_INIT(poptorch, nearest_batch_list);
}

} // namespace poptorch
} // namespace symbols

// Builds the attribute symbol named "poptorch_overlap_for_<suffix><num>".
c10::Symbol getOverlapSymbol(const char *suffix, unsigned int num) {
  const auto attr_name = fmt::format("poptorch_overlap_for_{}{}", suffix, num);
  return c10::Symbol::attr(attr_name);
}

} // namespace poptorch


================================================
FILE: poptorch/source/PoptorchSymbols.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef SOURCE_POPTORCH_SYMBOLS_H
#define SOURCE_POPTORCH_SYMBOLS_H
#include <ATen/core/interned_strings.h>
#include <torch/csrc/jit/ir/ir.h>

// Create missing C10 symbols.
// PyTorch initialises aten Symbols from native_functions.yml (see
// `aten_interned_strings.h`, and `gen_interned_strings` in torchgen). However,
// not all the aten Symbols we need are present in native_functions.yml.
// The symbols declared here are defined and initialised in
// PoptorchSymbols.cpp.
namespace c10::aten {
extern c10::Symbol multilabel_soft_margin_loss; // NOLINT

} // namespace c10::aten

namespace poptorch {

// X-macro: every OP_DECL / OP_DECL_NO_RETURN entry of
// SupportedOperations.inc.hpp expands to the declaration of a symbol variable
// `symbols::<Namespace>::<FuncName>`. Only the Namespace and FuncName
// arguments are used here. The matching definitions are in
// PoptorchSymbols.cpp.
namespace symbols {

#define OP_DECL(Namespace, FuncName, function, OnnxImpl, Args, BodyArgs)       \
  namespace Namespace {                                                        \
  extern c10::Symbol FuncName;                                                 \
  }

#define OP_DECL_NO_RETURN(Namespace, FuncName, function, OnnxImpl, Args,       \
                          BodyArgs)                                            \
  namespace Namespace {                                                        \
  extern c10::Symbol FuncName;                                                 \
  }

#include "popart_compiler/SupportedOperations.inc.hpp"

#undef OP_DECL
#undef OP_DECL_NO_RETURN
} // namespace symbols

// PopTorch-specific symbols. They are defined and initialised in
// PoptorchSymbols.cpp.
namespace symbols::poptorch {
extern c10::Symbol nop;
extern c10::Symbol dynamic_slice;
extern c10::Symbol dynamic_update;
extern c10::Symbol begin_ipu_block;
extern c10::Symbol internal_cast;
extern c10::Symbol end_ipu_block;
extern c10::Symbol identity_loss;
extern c10::Symbol set_available_memory;
extern c10::Symbol set_matmul_serialization;
extern c10::Symbol set_overlap_for_input;
extern c10::Symbol set_overlap_for_output;
extern c10::Symbol optimizer_group;
extern c10::Symbol begin_multi_conv;
extern c10::Symbol multi_conv_part;
extern c10::Symbol end_multi_conv;

// update_param_inplace is used to allow a parameter/buffer to be updated in
// place: it is the only op guaranteed to be in place by PopART.
extern c10::Symbol update_param_inplace;

// Casting is done before passing the input to the IPU: the op is used so that
// the input types match those received from pytorch but the input types to
// later ops have the correct type.
extern c10::Symbol host_side_cast;

extern c10::Symbol start_for_loop;
extern c10::Symbol end_for_loop;
extern c10::Symbol start_if_block;
extern c10::Symbol start_else_block;
extern c10::Symbol end_if_block;
extern c10::Symbol push_name_scope;
extern c10::Symbol pop_name_scope;
extern c10::Symbol add_untyped_input_tensor;
extern c10::Symbol host_and_ipu_side_tensor_constant;
extern c10::Symbol call_cpu_op;
extern c10::Symbol end_cpu_op;
extern c10::Symbol canonicalised_cpu_call;
extern c10::Symbol ctc_beam_search_decoder;
extern c10::Symbol set_attribute;
extern c10::Symbol clear_attribute;

// NOTE(review): `unfold` and `prelu` have no definition or initialisation
// next to the other symbols in PoptorchSymbols.cpp - confirm whether they are
// defined elsewhere or are unused leftovers.
extern c10::Symbol unfold;
extern c10::Symbol prelu;
extern c10::Symbol fps;
extern c10::Symbol nearest;
extern c10::Symbol nearest_batch_list;
} // namespace symbols::poptorch

// Return the attribute symbol referring to having overlap for a given input
c10::Symbol getOverlapSymbol(const char *suffix, unsigned int num);

} // namespace poptorch

// Declare symbols used by PyG torch_scatter library
// (defined and initialised in PoptorchSymbols.cpp)
namespace torch_scatter {
extern c10::Symbol scatter_max;
extern c10::Symbol scatter_min;
extern c10::Symbol scatter_mul;
} // namespace torch_scatter

// Symbols in the torch_cluster namespace
// (defined and initialised in PoptorchSymbols.cpp)
namespace torch_cluster {
extern c10::Symbol grid;
} // namespace torch_cluster

// Symbols in the torch_spline_conv namespace
// (defined and initialised in PoptorchSymbols.cpp)
namespace torch_spline_conv {
extern c10::Symbol spline_basis;
extern c10::Symbol spline_weighting;
} // namespace torch_spline_conv

#endif // SOURCE_POPTORCH_SYMBOLS_H


================================================
FILE: poptorch/source/RemoveSurplusIdentityLosses.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/ir.h>

#include "PoptorchSymbols.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Logging.hpp"
/*
  Removes losses such that the module only has one loss at the end.
  1. Finds any loss in the module.
  2. Looks through the use-def chain of that loss to see if it is used in
  another loss, if so removes it.
  3. At the end there will only be one loss used.
*/

namespace poptorch {

bool traverseUseDef(torch::jit::Node *node) {
  bool used_in_loss = false;

  // Look through the use-def chain.
  for (torch::jit::Value *output : node->outputs()) {
    // name
    for (torch::jit::Use use : output->uses()) {
      const torch::jit::Symbol kind = use.user->kind();

      // If this is a loss then |node| is used in a loss.
      if (kind == symbols::popart::identityloss) {
        used_in_loss = true;
      }

      // Uses can't be circular.
      used_in_loss |= traverseUseDef(use.user);

      // Early exit if true.
      if (used_in_loss) {
        return true;
      }
    }
  }

  return used_in_loss;
}

// Remove every popart::identityloss whose result feeds (directly or
// transitively) into another identityloss, so that a single final loss
// remains in |graph|.
//
// A removed loss is replaced by its input, or by a flatten followed by a
// reduction of its input when its "reduction" attribute is 0 (sum) or
// 1 (mean). For any other attribute value the input is used unchanged.
//
// Raises an error if the graph contains no loss, or if the number of losses
// which are not used by another loss is different from one.
void removeSurplusIdentityLosses(torch::jit::Graph *graph) {
  std::unordered_set<torch::jit::Node *> to_delete;

  // For diagnostics.
  std::size_t total_found_losses = 0;
  std::size_t independent_loss_count = 0;

  // For all nodes in the IR.
  for (torch::jit::Node *node : graph->nodes()) {
    WithNodeMetadata meta{node};

    if (node->kind() != symbols::popart::identityloss) {
      continue;
    }
    total_found_losses++;

    // A loss which isn't used by another loss is kept as it is.
    if (!traverseUseDef(node)) {
      independent_loss_count++;
      continue;
    }

    // Work with values rather than nodes: the producer of the input might
    // have several outputs (in which case Node::output() and
    // Node::replaceAllUsesWith() would assert), only the value actually
    // consumed by the loss is relevant.
    torch::jit::Value *replacement = node->input();

    // If the operation was performing a reduction replace it with a manual
    // reduction operation.
    const std::size_t reduction = node->i(c10::Symbol::attr("reduction"));

    if (reduction < 2) {
      // Flatten it into 1D.
      torch::jit::Node *flattened = createFlatten(graph, {replacement}, 0);
      flattened->moveAfter(node);

      // Reduce across that 1D tensor: 0 is a sum, 1 is a mean.
      torch::jit::Node *reduced =
          reduction == 0
              ? createReducesum(graph, {flattened->output()}, {1}, 0)
              : createReducemean(graph, {flattened->output()}, {1}, 0);
      reduced->moveAfter(flattened);
      replacement = reduced->output();
    }

    node->output()->replaceAllUsesWith(replacement);
    to_delete.insert(node);
  }

  logging::debug("Found {} losses and removed {}", total_found_losses,
                 total_found_losses - independent_loss_count);

  ERROR_ON_MSG(total_found_losses == 0, "Couldn't find a loss in graph!");
  ERROR_ON_MSG(independent_loss_count != 1,
               "Multiple independent losses found"
               " in graph. Graph must have one final loss."
               " Wrap final graph loss in poptorch.identity_loss.");

  // Remove the dead nodes.
  searchAndPossiblyDestroy(to_delete);
}
} // namespace poptorch


================================================
FILE: poptorch/source/RequiresGrad.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/graph_node_list.h>
#include <torch/csrc/jit/ir/ir.h>

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/RequiresGrad.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {

// Recompute requires_grad for every IPU tensor output in |graph|: an output
// requires grad if it has a floating-point scalar type and at least one input
// of its node requires grad. The type of the output is only updated when the
// computed flag differs from the current one.
void fixRequiresGradFromDispatch(torch::jit::Graph *graph) {
  for (auto *node : graph->nodes()) {
    for (auto *output : node->outputs()) {
      // Only tensors known to live on the IPU are considered.
      auto tensor_type = output->type()->cast<c10::TensorType>();
      if (!tensor_type) {
        continue;
      }
      const auto device = tensor_type->device();
      if (!device || device->type() != at::DeviceType::IPU) {
        continue;
      }

      // Non floating-point (or untyped) outputs never require grad.
      const auto scalar_type = tensor_type->scalarType();
      bool wanted = false;
      if (scalar_type.has_value() &&
          c10::isFloatingType(scalar_type.value())) {
        for (auto *input : node->inputs()) {
          if (input->requires_grad()) {
            wanted = true;
            break;
          }
        }
      }

      if (output->requires_grad() == wanted) {
        continue;
      }
      logging::trace("[requires_grad] Set requires_grad={} on node {}", wanted,
                     nodeToString(node));
      output->setType(tensor_type->withRequiresGrad(wanted));
    }
  }
}

} // namespace poptorch


================================================
FILE: poptorch/source/SessionOptionsParser.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include "poptorch/SessionOptionsParser.hpp"

#include "popart_compiler/Compiler.hpp"
#include "popart_compiler/Utils.hpp"
#include "poptorch/ImplicitCasting.hpp"
#include "poptorch_logging/Tracepoint.hpp"

namespace poptorch {

// Convert the value to a float, raising an error if it is greater than the
// largest finite float or lower than the lowest finite float.
float IPyValue::toFloatWithRangeCheck() const {
  using FloatLimits = std::numeric_limits<float>;

  // A python "float" is a double
  const double value = toDouble();

  ERROR_ON_MSG(value > FloatLimits::max(),
               value << " is too high for a Popart float attribute.");
  ERROR_ON_MSG(value < FloatLimits::lowest(),
               value << " is too low for a Popart float attribute.");
  return static_cast<float>(value);
}

// Convert a list value to a vector containing the string conversion of each
// of its elements, in list order.
std::vector<std::string> IPyValue::toVectorString() const {
  std::vector<std::string> strings;
  strings.reserve(getListSize());
  forEachInList([&strings](const IPyValue &element) {
    strings.push_back(element.toString());
  });
  return strings;
}

// Defaulted destructor, defined out-of-line in this translation unit.
SessionOptionsParser::~SessionOptionsParser() = default;

// Return a mutable reference to the SessionOptions object held in _opts
// (created and populated by the constructor).
popart_compiler::SessionOptions &SessionOptionsParser::options() {
  return *_opts;
}

// Build a popart_compiler::SessionOptions from the dictionary |py_opts|.
//
// Each (name, value) entry is forwarded to the SessionOptions setter matching
// the type of the value. The type checks are performed in a fixed order
// (boolean, double, int, string, set/list/tuple, dict) and the first match
// wins. An error is raised if the value doesn't match any of these types.
SessionOptionsParser::SessionOptionsParser(const IPyValue &py_opts)
    : _opts(std::make_unique<popart_compiler::SessionOptions>()) {
  const logging::LogContext ctx_func("parseSessionOptions");
  auto &options = *_opts;

  py_opts.forEachInDict([&options, &py_opts](const IPyValue &name_val,
                                             const IPyValue &value) {
    const auto name = name_val.toString();
    // Log context naming the option currently being parsed.
    const logging::LogContext ctx("option: " + name);

    // Options excluded here:
    //  - patterns_level is handled at the same time as "patterns".
    //  - anchored_tensors is dealt with exclusively in Python.
    if (name == "patterns_level" || name == "anchored_tensors") {
      return;
    }

    // The progress bar callback is matched by name, before any type check.
    if (name == "compilation_progress_bar_fn") {
      options.setCompilationProgressLogger(value.toFunction());
    } else if (value.isBoolean()) {
      options.addBoolOption(name.c_str(), value.toBoolean());
    } else if (value.isDouble()) {
      options.addDoubleOption(name.c_str(), value.toDouble());
    } else if (value.isInt()) {
      // Integers are forwarded as unsigned 64-bit values.
      options.addUint64Option(name.c_str(), value.toUInt64());
    } else if (value.isString()) {
      options.addStringOption(name.c_str(), value.toString().c_str());
    } else if (value.isSetListOrTuple()) {
      // Each element of the collection is inserted as a string under the
      // same option name.
      value.forEachInList([&options, &name](const IPyValue &str_opt) {
        options.insertStringOption(name.c_str(), str_opt.toString().c_str());
      });
    } else if (value.isDict()) {
      // Dictionaries: a few option names have a dedicated handling, all the
      // others are treated as string -> string maps.
      if (name == "available_memory_proportion") {
        value.forEachInDict(
            [&options](const IPyValue &ipu, const IPyValue &memory) {
              options.setMemoryProportion(ipu.toUInt64(),
                                          memory.toFloatWithRangeCheck());
            });
      } else if (name == "patterns") {
        // "patterns" requires "patterns_level" to be present in the top
        // level dictionary: the level is set first, then each pattern.
        auto patterns_level = py_opts.getFromDict("patterns_level");
        ERROR_ON_MSG(patterns_level == nullptr,
                     "PopART option 'patterns' should not be set "
                     "without first setting 'patterns_level'.");

        options.setPatternsLevel(patterns_level->toUInt64());
        value.forEachInDict([&options](const IPyValue &pattern,
                                       const IPyValue &enabled) {
          options.addPattern(pattern.toString().c_str(), enabled.toBoolean());
        });
      } else if (name.rfind("location_", 0) == 0) {
        // Any option whose name starts with "location_".
        value.forEachInDict([&options, &name](const IPyValue &tensor,
                                              const IPyValue &location) {
          options.setTensorLocation(name.c_str(), tensor.toString().c_str(),
                                    location.toUInt64());
        });
      } else {
        value.forEachInDict([&options, &name](const IPyValue &str_key,
                                              const IPyValue &str_value) {
          options.insertStringPairOption(name.c_str(),
                                         str_key.toString().c_str(),
                                         str_value.toString().c_str());
        });
      }
    } else {
      ERROR("Unknown value type " << value.type() << " for option " << name);
    }
  });
}

} // namespace poptorch


================================================
FILE: poptorch/source/Utils.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <torch/csrc/jit/ir/ir.h>

#include <spdlog/fmt/fmt.h>
#include <spdlog/fmt/ostr.h>

#include <cstring>
#include <sstream>
#include <unordered_set>

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "PoptorchSymbols.hpp"
#include "popart_canonicalization/PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"

namespace poptorch {

// Return the user of |value| which comes first in the graph, or nullptr if
// |value| has no uses.
torch::jit::Node *findEarliestUser(const torch::jit::Value *value) {
  torch::jit::Node *earliest = nullptr;
  for (const auto &use : value->uses()) {
    // The first user is taken as is, the following ones only replace it if
    // they come before it.
    if (earliest == nullptr || use.user->isBefore(earliest)) {
      earliest = use.user;
    }
  }
  return earliest;
}

// Return true if |node| is reported as non-deterministic by PyTorch or if its
// kind is one of the extra kinds listed below.
bool isNondeterministic(const torch::jit::Node &node) {
  // Handle extra cases until this is fixed upstream
  // https://github.com/pytorch/pytorch/issues/52599
  static const auto extra_kinds = {
      c10::aten::normal,   c10::aten::normal_,   c10::aten::feature_dropout,
      c10::aten::randint,  c10::aten::bernoulli, c10::aten::bernoulli_,
      c10::aten::uniform_, c10::aten::randperm,  c10::aten::exponential_,
      c10::aten::random_,
  };

  if (node.isNondeterministic()) {
    return true;
  }

  const auto kind = node.kind();
  for (const auto &extra_kind : extra_kinds) {
    if (extra_kind == kind) {
      return true;
    }
  }
  return false;
}

// Return the text printed by torch::jit::Node::print() for |node|.
std::string nodeToString(const torch::jit::Node *node) {
  std::stringstream stream;
  node->print(stream, 0, nullptr, true, false, false, false);
  return stream.str();
}

// Return the ONNX name of the given scalar type: "UNDEFINED" for ComplexHalf
// and the quantized types listed below, "(unknown type)" for any type which
// isn't handled.
std::string scalarTypeToOnnxString(const at::ScalarType type) {
  switch (type) {
  // Integer types.
  case at::ScalarType::Byte:
    return "UINT8";
  case at::ScalarType::Char:
    return "INT8";
  case at::ScalarType::Short:
    return "INT16";
  case at::ScalarType::Int:
    return "INT32";
  case at::ScalarType::Long:
    return "INT64";

  // Floating point types.
  case at::ScalarType::Half:
    return "FLOAT16";
  case at::ScalarType::BFloat16:
    return "BFLOAT16";
  case at::ScalarType::Float:
    return "FLOAT";
  case at::ScalarType::Double:
    return "DOUBLE";

  // Complex types.
  case at::ScalarType::ComplexFloat:
    return "COMPLEX64";
  case at::ScalarType::ComplexDouble:
    return "COMPLEX128";

  case at::ScalarType::Bool:
    return "BOOL";

  // Types mapped to "UNDEFINED".
  case at::ScalarType::ComplexHalf:
  case at::ScalarType::QInt8:
  case at::ScalarType::QUInt8:
  case at::ScalarType::QInt32:
    return "UNDEFINED";

  default:
    return "(unknown type)";
  }
}

// Return the scalar type matching the given ONNX type name (exact,
// case-sensitive match). Raises an error if the name isn't one of the names
// in the table below.
at::ScalarType onnxStrToScalarType(const char *type_str) {
  struct OnnxTypeName {
    const char *name;
    at::ScalarType type;
  };
  static const OnnxTypeName known_types[] = {
      {"UINT8", at::ScalarType::Byte},
      {"INT8", at::ScalarType::Char},
      {"INT16", at::ScalarType::Short},
      {"INT32", at::ScalarType::Int},
      {"INT64", at::ScalarType::Long},
      {"FLOAT16", at::ScalarType::Half},
      {"FLOAT", at::ScalarType::Float},
      {"DOUBLE", at::ScalarType::Double},
      {"COMPLEX64", at::ScalarType::ComplexFloat},
      {"COMPLEX128", at::ScalarType::ComplexDouble},
      {"BOOL", at::ScalarType::Bool},
      {"BFLOAT16", at::ScalarType::BFloat16},
  };

  for (const OnnxTypeName &known : known_types) {
    if (strcmp(type_str, known.name) == 0) {
      return known.type;
    }
  }

  ERROR("No at::scalar_type for " << type_str);
}

// Map Double to Float and Long to Int; every other type is returned
// unchanged.
at::ScalarType coerceToSupportedType(at::ScalarType type) {
  if (type == at::ScalarType::Double) {
    return at::ScalarType::Float;
  }
  if (type == at::ScalarType::Long) {
    return at::ScalarType::Int;
  }
  return type;
}

// Insert an aten::to node casting |val| to the scalar type |type| and return
// it. The output type of the node is the type of |val| with its scalar type
// replaced by |type|.
torch::jit::Node *createAndInsertCastOp(torch::jit::Graph *graph,
                                        torch::jit::Value *val,
                                        at::ScalarType type) {
  // Arguments of aten::to: target dtype, non_blocking, copy, memory_format.
  auto *const target_dtype = insertConstant(graph, type);
  auto *const false_constant = insertConstant(graph, false);
  auto *const none_node = graph->createNone();
  insertNodeInGraph(graph, none_node);

  auto *cast_node = createAndInsertNode(
      graph, c10::aten::to,
      {val, target_dtype, false_constant /*non_blocking*/,
       false_constant /*copy*/, none_node->output() /*memory_format*/});

  const auto input_type = val->type()->expect<c10::TensorType>();
  cast_node->output()->setType(input_type->withScalarType(type));

  return cast_node;
}

namespace {
// Append one null pointer to |tensors| for every tensor contained in
// |tuple_type|, recursing into nested tuples. Raises an error for any other
// element type.
void addNullPtrsForUnusedTuple(const c10::TupleType *tuple_type,
                               std::vector<torch::jit::Value *> *tensors) {
  for (const auto &element : tuple_type->elements()) {
    const auto kind = element->kind();
    if (kind == c10::TypeKind::TensorType) {
      tensors->push_back(nullptr);
    } else if (kind == c10::TypeKind::TupleType) {
      const auto nested_tuple = element->expect<c10::TupleType>();
      addNullPtrsForUnusedTuple(nested_tuple.get(), tensors);
    } else {
      ERROR("Unsupported input type '" << c10::typeKindToString(element->kind())
                                       << "'");
    }
  }
}

// Append to |tensors| the tensor values found in |input|.
// - A tensor is appended directly (it must come from a Param or a
//   TupleUnpack node).
// - A used tuple / list must have a single user, a TupleUnpack node, whose
//   outputs are processed recursively.
// - An unused tuple contributes one null pointer per tensor it contains.
// Any other type raises an error.
void processInput(torch::jit::Graph *graph, torch::jit::Value *input,
                  std::vector<torch::jit::Value *> *tensors) {
  switch (input->type()->kind()) {
  case c10::TypeKind::TensorType: {
    ERROR_ON(input->node()->kind() != c10::prim::Param &&
             input->node()->kind() != c10::prim::TupleUnpack);
    tensors->push_back(input);
    return;
  }

  case c10::TypeKind::ListType: // Fallthrough.
  case c10::TypeKind::TupleType: {
    if (!input->hasUses()) {
      // We need placeholders or the values will not align with input tensors
      const auto unused_tuple = input->type()->expect<c10::TupleType>();
      addNullPtrsForUnusedTuple(unused_tuple.get(), tensors);
      return;
    }

    // Find the TupleUnpack node
    ERROR_ON(input->uses().size() != 1);
    auto *unpack = input->uses()[0].user;
    ERROR_ON(unpack->kind() != c10::prim::TupleUnpack);
    for (auto *unpacked : unpack->outputs()) {
      // Recurse for nested tuple support
      processInput(graph, unpacked, tensors);
    }
    return;
  }

  default:
    ERROR("Unsupported input type '"
          << c10::typeKindToString(input->type()->kind()) << "'");
  }
}
} // namespace

// Return the flattened list of tensor values obtained by processing each
// input of |graph| in order (see processInput).
std::vector<torch::jit::Value *>
collapsedGraphInputHierachy(torch::jit::Graph *graph) {
  std::vector<torch::jit::Value *> flattened;
  for (auto *graph_input : graph->inputs()) {
    processInput(graph, graph_input, &flattened);
  }
  return flattened;
}

// Return the number of tensors contained in |type|: 1 for a tensor, the
// number of elements for a ListTypeWithNumElements and the sum over all the
// elements for a tuple. Raises an error for any other type.
size_t numTensorsForType(const c10::TypePtr &type) {
  const auto kind = type->kind();

  if (kind == c10::TypeKind::TensorType) {
    return 1;
  }

  if (kind == c10::TypeKind::ListType) {
    // Only lists carrying their number of elements are supported.
    const auto list_type = type->cast<ListTypeWithNumElements>();
    ERROR_ON(!list_type);
    return list_type->numElements();
  }

  if (kind == c10::TypeKind::TupleType) {
    const auto tuple = type->expect<c10::TupleType>();
    size_t total = 0;
    for (const auto &element_type : tuple->elements()) {
      total += numTensorsForType(element_type);
    }
    return total;
  }

  ERROR("Unsupported output type '" << c10::typeKindToString(type->kind())
                                    << "'");
}

namespace {
// A node can be destroyed if it isn't a parameter node and none of its
// outputs is used.
bool shouldDestroy(torch::jit::Node *node) {
  const bool is_param = node->kind() == c10::prim::Param;
  return !is_param && !node->hasUses();
}

// Return the set of nodes producing the inputs of |node|.
// Ops may use the same input twice, so a set is used to store each producer
// only once.
std::unordered_set<torch::jit::Node *> copyInputs(torch::jit::Node *node) {
  std::unordered_set<torch::jit::Node *> producers;
  for (torch::jit::Value *input : node->inputs()) {
    producers.insert(input->node());
  }
  return producers;
}

// Destroy |node| if it is unused, then try to do the same for the nodes which
// were producing its inputs. |destroyed| records the nodes destroyed so far.
void searchAndPossiblyDestroyInternal(
    torch::jit::Node *node, std::unordered_set<torch::jit::Node *> *destroyed) {
  // The set is checked first so that shouldDestroy() is never called on a
  // node which has already been destroyed by this search.
  const bool already_destroyed = destroyed->count(node) != 0u;
  if (already_destroyed || !shouldDestroy(node)) {
    return;
  }

  // The producers must be collected before the node is destroyed.
  const auto producers = copyInputs(node);
  node->destroy();
  destroyed->insert(node);

  // Some of the producers might now be unused: repeat the process for them.
  for (auto *producer : producers) {
    searchAndPossiblyDestroyInternal(producer, destroyed);
  }
}
} // namespace

// For each node in |to_test|: destroy it if it is unused, and recursively do
// the same for the producers of its inputs. The set of destroyed nodes is
// shared across all the nodes of |to_test|.
void searchAndPossiblyDestroy(
    const std::unordered_set<torch::jit::Node *> &to_test) {
  std::unordered_set<torch::jit::Node *> already_destroyed;
  for (auto *candidate : to_test) {
    searchAndPossiblyDestroyInternal(candidate, &already_destroyed);
  }
}

// Remove all the inputs of |node|, then destroy the nodes which were
// producing them if they are now unused.
void removeAndPossiblyDestroyAllInputs(torch::jit::Node *node) {
  // The producers must be collected before the inputs are removed.
  const std::unordered_set<torch::jit::Node *> producers = copyInputs(node);
  node->removeAllInputs();
  searchAndPossiblyDestroy(producers);
}

// Return a newly allocated, null-terminated copy of the characters of |str|.
// The buffer contains str.size() + 1 characters.
std::unique_ptr<char[]> stringToUniquePtr(const std::string &str) {
  const std::size_t length = str.size();
  std::unique_ptr<char[]> buffer(new char[length + 1]);
  std::memcpy(buffer.get(), str.data(), length);
  buffer[length] = '\0';
  return buffer;
}

// Return the sizes of the tensor type of |value| as a vector of ints.
// Dimensions whose size is not set in the type are skipped, and the vector is
// empty if the type has no sizes at all.
std::vector<std::int64_t> shapeFromTensor(const torch::jit::Value *value) {
  // Extract the type from the pytorch IR.
  const c10::TensorTypePtr tensor_type =
      value->type()->expect<c10::TensorType>();
  const auto varying_shape = tensor_type->sizes();

  std::vector<std::int64_t> shape;

  // Reserve the storage upfront when the rank is known.
  const auto rank = varying_shape.size();
  if (rank) {
    shape.reserve(*rank);
  }

  const auto &maybe_dims = varying_shape.sizes();
  if (!maybe_dims) {
    return shape;
  }

  for (const auto &maybe_dim : *maybe_dims) {
    if (maybe_dim) {
      shape.push_back(*maybe_dim);
    }
  }
  return shape;
}

// Make sure |weight| and |bias| have the same scalar type as |input|: each of
// them which isn't None and has a different scalar type is replaced (through
// the reference) by the output of a cast to the scalar type of |input|.
void castWeightAndBias(torch::jit::Graph *graph, torch::jit::Value *input,
                       torch::jit::Value *&weight, torch::jit::Value *&bias) {
  const c10::ScalarType input_type =
      input->type()->expect<c10::TensorType>()->scalarType().value();

  const auto cast_if_needed = [graph, input_type](torch::jit::Value *&param) {
    if (isNone(param->node())) {
      return;
    }
    const c10::ScalarType param_type =
        param->type()->expect<c10::TensorType>()->scalarType().value();
    if (param_type != input_type) {
      param = createCast(graph, param, input_type)->output();
    }
  };

  cast_if_needed(weight);
  cast_if_needed(bias);
}

// Build the info from an actual tensor: its scalar type and a copy of its
// sizes.
JitTensorInfo::JitTensorInfo(const at::Tensor &tensor) {
  scalar_type = tensor.scalar_type();
  dims = tensor.sizes().vec();
}

// Build the info from the type of a JIT value.
// Raises an error if the value isn't a tensor, or if its scalar type or its
// concrete sizes are not set.
JitTensorInfo::JitTensorInfo(torch::jit::Value *value) {
  auto tensor_type = value->type()->cast<at::TensorType>();
  // cast<>() returns a null pointer if the value isn't a tensor: report it
  // instead of dereferencing the null pointer below.
  ERROR_ON_MSG(!tensor_type,
               "JIT value %" << value->debugName() << " is not a tensor");
  ERROR_ON_MSG(!tensor_type->scalarType().has_value(), "Data type not set");
  ERROR_ON_MSG(!tensor_type->sizes().concrete_sizes().has_value(),
               "Size not set");
  scalar_type = *tensor_type->scalarType();
  dims = *tensor_type->sizes().concrete_sizes();
}

std::string JitTensorInfo::toString() const {
  std::stringstream ss;
  ss << scalar_type << "(";
  std::string sep;

  for (auto d : dims) {
    ss << sep << d;
    sep = ", ";
  }
  ss << ")";
  return ss.str();
}

// Raise an error if the scalar type or the sizes of |tensor| differ from the
// ones set in the type of the JIT value |value|.
void validateTensorShapeAndType(torch::jit::Value *value,
                                const at::Tensor &tensor) {
  const JitTensorInfo expected(value);
  const JitTensorInfo actual(tensor);

  const bool same_type = actual.scalar_type == expected.scalar_type;
  const bool match = same_type && actual.dims == expected.dims;

  ERROR_ON_MSG(!match, "Shape/Type mismatch: JIT tensor %"
                           << value->debugName() << " " << expected.toString()
                           << " is incompatible with " << actual.toString());
}

// Store |value| in the "value" attribute of |node|, as a 'ts' attribute
// containing a single element (see getNodeTensorAttrValue for the matching
// getter).
void setNodeTensorAttrValue(torch::jit::Node *node,
                            torch::jit::TensorAttr::ConstructorType value) {
  node->ts_(c10::attr::value,
            {std::forward<torch::jit::TensorAttr::ConstructorType>(value)});
}

// Return the single element stored in the "value" attribute of |node|.
// Raises an error if the attribute isn't of kind 'ts' or if it doesn't
// contain exactly one element.
const torch::jit::TensorAttr::ValueType &
getNodeTensorAttrValue(const torch::jit::Node *node) {
  const auto attribute_kind = node->kindOf(c10::attr::value);
  ERROR_ON_MSG(attribute_kind != torch::jit::AttributeKind::ts,
               "[Internal] expected type 'ts' but got "
                   << torch::jit::toString(attribute_kind));

  const auto &ts = node->ts(c10::attr::value);
  ERROR_ON(ts.size() != 1);
  return ts.front();
}

// Return "TensorList[N]" where N is the number of elements of the list.
std::string ListTypeWithNumElements::str() const {
  return fmt::format("TensorList[{}]", _num_elements);
}

} // namespace poptorch


================================================
FILE: poptorch/source/dispatch_tracer/CMakeLists.txt
================================================

# Enable position independent code for the targets created below.
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Static library containing the dispatch tracer sources.
add_library(dispatch_tracer STATIC
  RegisterAtenOverloads.cpp
  CommonHelperFunctions.cpp
  dispatchers/IDispatch.cpp
  dispatchers/JitDispatch.cpp
  InplaceAliasMapper.cpp
  ValueMapper.cpp
  Tensor.cpp
  TypeInferenceHandler.cpp
)

# torch is a PUBLIC dependency (propagated to the targets linking against
# dispatch_tracer), all the other libraries are PRIVATE.
target_link_libraries(dispatch_tracer
  PUBLIC
    torch
  PRIVATE
    poptorch_internal_headers
    poptorch_logging
    poptorch_compiler
    popart_compiler
    poptorch_err
)

# Build this target as C++17.
set_property(TARGET dispatch_tracer PROPERTY CXX_STANDARD 17)


================================================
FILE: poptorch/source/dispatch_tracer/CommonHelperFunctions.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#include "CommonHelperFunctions.hpp"

#include <spdlog/fmt/fmt.h>
#include <spdlog/fmt/ostr.h>

#include <ATen/core/function_schema.h>
#include <ATen/core/interned_strings.h>
#include <c10/core/TensorImpl.h>
#include <torch/csrc/jit/ir/ir.h>

#include <map>
#include <unordered_set>

#include "../popart_canonicalization/PopartCanonicalizationUtils.hpp"
#include "InplaceAliasMapper.hpp"
#include "ValueMapper.hpp"
#include "poptorch/DispatchTracer.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/TypeAndConstantCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {

namespace {

// Return true if |value| is a non-empty list whose elements are all tensors.
bool isGenericListOfTensors(c10::IValue &value) {
  if (!value.isList()) {
    return false;
  }

  std::size_t num_tensors = 0;
  for (c10::IValue const element : value.toList()) {
    if (!element.isTensor()) {
      return false;
    }
    ++num_tensors;
  }
  // An empty list is not considered to be a list of tensors.
  return num_tensors != 0;
}

// Return true if |value| is a list whose element type is
// c10::optional<at::Tensor>.
bool isListOfOptionalTensors(c10::IValue &value) {
  return value.isList() && value.toList().elementType() ==
                               c10::getTypePtr<c10::optional<at::Tensor>>();
}

// Return the JIT value to use in |graph| for the stack value |value|,
// creating it if needed:
// - Undefined tensor: a new None node.
// - Tensor known by |mapper|: the value it is mapped to.
// - Any other tensor: a new constant (not tracked by |mapper|). An IPU tensor
//   is not allowed here and raises an error.
// - List of tensors: the list value already registered in |mapper| for the
//   same element values, or a new list node which is then registered.
// - Anything else: a new constant.
torch::jit::Value *insertValueIntoGraphAndTrackIt(c10::IValue &value,
                                                  torch::jit::Graph &graph,
                                                  ValueMapper &mapper) {
  if (value.isTensor()) {
    // Handle tensors.
    at::Tensor const tensor = value.toTensor();
    // Undefined tensors are optional tensors.
    if (!tensor.defined()) {
      // Create a null IR value.
      torch::jit::Node *node = graph.createNone();
      insertNodeInGraph(&graph, node);
      return node->output();
    }

    torch::jit::Value *val = mapper.getValueForTensor(tensor);
    if (val == nullptr) {
      // The mapper doesn't know this tensor: only non-IPU tensors can be
      // turned into constants.
      ERROR_ON_MSG(tensor.device().type() == c10::DeviceType::IPU,
                   "Attempted to promote a Tensor converted (using "
                   ".to(\"ipu\") or .ipu()) outside an IPUScope or IPUContext "
                   "with the PopART compiler.");

      // This is probably an external tensor that we didn't catch. Assume
      // it's a constant.
      val = insertConstant(graph, copyAndCoerceType(tensor));
      setSourceRangeToCurrentLocation(val->node());
      // Don't track constants in the ValueMapper as they are CPU tensors.
    }

    logging::trace(
        "[DISPATCHER] Tensor input: tensor ptr {} ({}), jit ir %{} (scalar "
        "type {})",
        reinterpret_cast<void *>(tensor.unsafeGetTensorImpl()),
        toString(tensor), val->debugNameBase(),
        val->type()->expect<c10::TensorType>()->scalarType().value_or(
            at::ScalarType::Undefined));

    return val;
  }

  // If a generic list only contains tensors then it is a tensor
  // list and we handle both the same way.
  if (value.isTensorList() || isGenericListOfTensors(value)) {
    // Handle tensor lists: first get the JIT value of each element.
    std::vector<torch::jit::Value *> list_values;
    if (value.isTensorList()) {
      for (c10::IValue list_value : value.toTensorVector()) {
        list_values.push_back(
            insertValueIntoGraphAndTrackIt(list_value, graph, mapper));
      }
    } else {
      for (c10::IValue list_value : value.toList()) {
        list_values.push_back(
            insertValueIntoGraphAndTrackIt(list_value, graph, mapper));
      }
    }

    // We assume all lists with the same jit values are the same list in python.
    torch::jit::Value *val = mapper.getValueForTensorList(list_values);
    if (val == nullptr) {
      // Element type of the new list node.
      // NOTE(review): type_ptr is left null for a generic list of tensors
      // which is neither a tensor list nor a list of optional tensors -
      // confirm createList() is never reached with a null type.
      c10::TypePtr type_ptr;
      if (value.isTensorList()) {
        type_ptr = c10::TensorType::get();
      } else if (isListOfOptionalTensors(value)) {
        type_ptr = c10::OptionalType::create(c10::TensorType::get());
      }

      auto *list = graph.createList(type_ptr, list_values);
      insertNodeInGraph(&graph, list);
      val = list->output();
      mapper.addTensorList(list_values, val);
    }
    return val;
  }

  // Assume value is a true constant and not a tensor so we don't have to
  // track it in the value mapper. It will get canonicalised later.
  torch::jit::Value *val = insertConstant(&graph, value);
  ERROR_ON_MSG(val == nullptr, "Internal: graph could not insert a constant");

  logging::trace("[DISPATCHER] Constant input: jit ir %{}, ivalue tag kind: {}",
                 val->debugNameBase(), value.tagKind());

  return val;
}

// Create a node based on the schema which deduces the input types
// from the inputs/stack and the name from the schema. As far as our
// canonicalisation is concerned this *is* the "aten" node it purports to be
// however it may not match it exactly, and is not created by the normal JIT
// process.
//
// The node is created with one output per return value of the schema.
// |stack| is indexed with the input index, so it must contain at least as
// many elements as |inputs|.
torch::jit::Node *
createAtenTarget(torch::jit::Graph &graph, const c10::FunctionSchema &schema,
                 const std::vector<torch::jit::Value *> &inputs,
                 c10::Stack *stack, ValueMapper &mapper) {

  logging::trace("[DISPATCHER] Create aten target {}", schema.name());

  torch::jit::Symbol const symbol =
      torch::jit::Symbol::fromQualString(schema.name());

  // Create the aten target node for our canonicalisation to target.
  torch::jit::Node *aten_target =
      createAndInsertNode(&graph, symbol, inputs, ImplicitCast::None,
                          OutputType::Unknown, schema.returns().size());

  // Replace the constant inputs which correspond to a tensor tracked by the
  // mapper with the tracked value.
  for (std::size_t i = 0; i < aten_target->inputs().size(); ++i) {
    torch::jit::Value *in = aten_target->input(i);
    // If we are a constant.
    if (in->node()->kind() == at::prim::Constant) {
      c10::IValue val = stack->at(i);
      if (val.isTensor()) {
        at::Tensor const as_tensor = val.toTensor();
        // But actually we are a previously seen tensor which has been demoted
        // to a constant.
        torch::jit::Value *new_val = mapper.getValueForTensor(as_tensor);

        if ((new_val != nullptr) && new_val != in) {
          // All the uses of the constant are redirected to the tracked value
          // before the constant node is destroyed.
          in->replaceAllUsesWith(new_val);
          in->node()->destroy();
        }
      }
    }
  }

  return aten_target;
}

} // namespace

// If dtype is set: return dtype, otherwise return the default type (Float).
at::ScalarType scalarTypeOrDefault(c10::optional<at::ScalarType> dtype) {
  return dtype.value_or(at::ScalarType::Float);
}

// Return |tensor| converted to the scalar type given by
// coerceToSupportedType(). If the scalar type of |tensor| is already the
// coerced one, |tensor| is returned unchanged (no conversion is performed).
// A warning is logged when a conversion happens.
at::Tensor copyAndCoerceType(const at::Tensor &tensor) {
  const auto scalar_type = tensor.scalar_type();
  const auto coerced_scalar_type = coerceToSupportedType(scalar_type);
  if (scalar_type == coerced_scalar_type) {
    return tensor;
  }

  // Counter handed to logging::warn() and shared by all the calls to this
  // function.
  static std::uint64_t log_repeat = 0;
  logging::warn(log_repeat,
                "[DISPATCHER] Tensor (ptr {}) type coerced from {} to {}",
                static_cast<void *>(tensor.unsafeGetTensorImpl()), scalar_type,
                coerced_scalar_type);
  return tensor.to(coerced_scalar_type);
}

// Return all the defined tensors of |stack| which are passed as an inplace
// argument of |schema|. An argument is considered inplace if its alias info
// in the schema is marked as written to, or if its index is the one returned
// by InplaceArgAliasMapper::getInplaceArg() for this schema name.
//
// Only the arguments present in |stack| are inspected: if the stack contains
// fewer values than the schema has arguments, the missing ones are ignored
// (same bound as in lowerFromSchema).
std::vector<at::Tensor> getInplaceArguments(const c10::Stack &stack,
                                            const c10::FunctionSchema &schema) {
  logging::trace("[DISPATCHER][JIT] Looking for inplace arguments in schema {}",
                 schema);

  std::vector<at::Tensor> results;

  const auto inplace_arg_id =
      InplaceArgAliasMapper::getInplaceArg(schema.name());

  for (std::size_t arg = 0;
       arg < schema.arguments().size() && arg < stack.size(); ++arg) {
    const c10::Argument &argument = schema.arguments()[arg];
    const c10::IValue &value = stack[arg];

    if (!value.isTensor()) {
      continue;
    }
    at::Tensor const &tensor = value.toTensor();

    // Undefined tensors are optional tensors.
    if (!tensor.defined()) {
      continue;
    }

    const bool is_written_to = (argument.alias_info() != nullptr) &&
                               argument.alias_info()->isWrite();
    if (is_written_to || inplace_arg_id == arg) {
      logging::trace("[DISPATCHER][JIT] Found inplace argument, tensor ptr "
                     "{}, tensor {}",
                     reinterpret_cast<void *>(tensor.unsafeGetTensorImpl()),
                     toString(tensor));
      results.push_back(tensor);
    }
  }

  return results;
}

// Insert into |graph| the value of each argument of |schema| found in |stack|
// (arguments beyond the size of the stack are ignored), then create the aten
// target node using these values as inputs.
torch::jit::Node *lowerFromSchema(const c10::FunctionSchema &schema,
                                  c10::Stack *stack, torch::jit::Graph &graph,
                                  ValueMapper &mapper) {
  const std::size_t num_arguments = schema.arguments().size();

  std::vector<torch::jit::Value *> inputs;
  std::size_t arg = 0;
  while (arg < num_arguments && arg < stack->size()) {
    // Work on a copy of the stack entry.
    auto stack_value = (*stack)[arg];
    inputs.push_back(
        insertValueIntoGraphAndTrackIt(stack_value, graph, mapper));
    ++arg;
  }
  return createAtenTarget(graph, schema, inputs, stack, mapper);
}

// Return a string of the form "sizes=<sizes>, type=<scalar type>" for |t|.
std::string toString(const at::Tensor &t) {
  return fmt::format("sizes={}, type={}", t.sizes(), t.scalar_type());
}

// Return true if the scalar type of |t| is Half.
bool isHalfTensor(const at::Tensor &t) {
  return t.scalar_type() == at::ScalarType::Half;
}

// If device is set: return device, otherwise return the IPU device of
// index 0.
c10::Device deviceOrDefaultIpu(c10::optional<c10::Device> device) {
  return device.value_or(c10::Device(at::kIPU, 0));
}

// Return "<name>" if the schema has no overload name and
// "<name>.<overload name>" otherwise.
std::string getSchemaKey(const c10::FunctionSchema &schema) {
  // Unfortunately we can't overload based only on the schema symbol as it does
  // not contain the overload info.
  const std::string &overload = schema.overload_name();
  return overload.empty() ? schema.name() : schema.name() + "." + overload;
}

} // namespace poptorch


================================================
FILE: poptorch/source/dispatch_tracer/CommonHelperFunctions.hpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#ifndef POPTORCH_DISPATCH_COMMON_HELPERS_HPP_
#define POPTORCH_DISPATCH_COMMON_HELPERS_HPP_

#include <ATen/core/boxing/KernelFunction.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/function_schema.h>

#include <string>
#include <unordered_map>
#include <vector>

namespace torch {
namespace jit {
struct Graph;
struct Node;
struct Value;
} // namespace jit
} // namespace torch

namespace poptorch {

class ValueMapper;

// NOTE(review): the definition is not part of this header; from the name this
// presumably returns a copy of the tensor coerced to a supported type —
// confirm against the implementation.
at::Tensor copyAndCoerceType(const at::Tensor &tensor);

// From the schema deduce which arguments, if any, are inplace and return the
// corresponding tensors from the stack (an argument qualifies when its alias
// info is marked as written to, or when it matches the explicitly registered
// inplace argument id). This might include an argument of an op that
// is not truly inplace, e.g. it returns the 'out' argument in the schema
// op(Tensor self, Tensor(a!) out) -> (Tensor(a!)) even when 'self' and 'out'
// are not the same tensor.
// NOTE(review): an earlier version of this comment stated that only the first
// inplace argument is returned; the return type is a vector — confirm callers
// do not rely on the old behaviour.
std::vector<at::Tensor> getInplaceArguments(const c10::Stack &stack,
                                            const c10::FunctionSchema &schema);

// Using the schema definition as a guide look up all the correct
// torch::jit::Values in the stack and create a jit node with the correct
// symbol. Input values from the stack are also inserted into the graph.
torch::jit::Node *lowerFromSchema(const c10::FunctionSchema &schema,
                                  c10::Stack *stack, torch::jit::Graph &graph,
                                  ValueMapper &mapper);

// Return a string containing the tensor sizes and type.
std::string toString(const at::Tensor &t);

// Return true if the scalar type of the tensor is Half.
bool isHalfTensor(const at::Tensor &t);

// NOTE(review): presumably returns dtype if set and a default scalar type
// otherwise — the definition is not visible from this header, confirm.
at::ScalarType scalarTypeOrDefault(c10::optional<at::ScalarType> dtype);

// If device is set: return device, otherwise return the default device (ipu0)
c10::Device deviceOrDefaultIpu(c10::optional<c10::Device> device);

// Return the schema name, followed by ".<overload name>" when the schema has
// a non-empty overload name.
std::string getSchemaKey(const c10::FunctionSchema &schema);

} // namespace poptorch

#endif // POPTORCH_DISPATCH_COMMON_HELPERS_HPP_


================================================
FILE: poptorch/source/dispatch_tracer/InplaceAliasMapper.cpp
================================================
// Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#include <spdlog/fmt/fmt.h>

#include "InplaceAliasMapper.hpp"

namespace poptorch {

// Access the process-wide mapper: a function-local static created on first
// use.
InplaceArgAliasMapper &InplaceArgAliasMapper::getInstance() {
  static InplaceArgAliasMapper the_mapper;
  return the_mapper;
}

// Record which argument index of the given operator should be treated as
// inplace. The key is prefixed with "<namespace>::" when a namespace is
// currently set. An already registered key is left untouched (emplace).
void InplaceArgAliasMapper::registerInplaceArgId(
    const std::string &operator_name, std::size_t alias_arg_id) {
  std::string qualified_name = operator_name;
  if (_namespace) {
    qualified_name = fmt::format("{}::{}", _namespace.value(), operator_name);
  }
  _operator_name_to_arg_id.emplace(qualified_name, alias_arg_id);
}

// Look up the inplace argument index registered for the given (fully
// qualified) operator name. Returns std::nullopt if none was registered.
std::optional<std::size_t>
InplaceArgAliasMapper::getInplaceArg(const std::string &operator_name) {
  auto &registered = getInstance()._operator_name_to_arg_id;
  const auto found = registered.find(operator_name);
  if (found == registered.end()) {
    return std::nullopt;
  }
  return found->second;
}

// Set the namespace used to prefix the operator names registered from now on.
void InplaceArgAliasMapper::setNamespace(const std::string &p_namespace) {
  _namespace.emplace(p_namespace);
}

// Clear the namespace: subsequent registrations use the bare operator name.
void InplaceArgAliasMapper::unsetNamespace() { _namespace.reset(); }

// Run the given registration callback against the global mapper with the
// namespace set to p_namespace for the duration of the call.
InplaceArgAliasMapperInit::InplaceArgAliasMapperInit(
    void (*init_mapper)(InplaceArgAliasMapper &),
    const std::string &p_namespace) {
  InplaceArgAliasMapper &registry = InplaceArgAliasMapper::getInstance();

  registry.setNamespace(p_namespace);
  (*init_mapper)(registry);
  registry.unsetNamespace();
}

// Inplace argument registrations for the torch_scatter namespace: every
// operator listed here has its argument at index 3 registered as inplace.
INPLACE_ARG_MAPPER_IMPL(torch_scatter, mapper) {
  static const char *const scatter_ops[] = {"scatter_mul", "scatter_max",
                                            "scatter_min"};
  for (const char *op_name : scatter_ops) {
    mapper.registerInplaceArgId(op_name, 3);
  }
}

} // namespace poptorch


================================================
FILE: poptorch/source/dispatch_tracer/InplaceAliasMapper.hpp
================================================
// Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#ifndef POPTORCH_DISPATCH_INPLACE_ALIAS_MAPPER_HPP_
#define POPTORCH_DISPATCH_INPLACE_ALIAS_MAPPER_HPP_

#include <c10/macros/Macros.h>
#include <optional>
#include <string>
#include <unordered_map>

namespace poptorch {

// Singleton mapping operator names to the index of the argument which should
// be treated as inplace for that operator.
class InplaceArgAliasMapper {
public:
  // Return the unique instance of the mapper.
  static InplaceArgAliasMapper &getInstance();
  // Return the argument index registered for operator_name in the unique
  // instance, or std::nullopt if there is none. The name must match the key
  // used at registration time (i.e. include the "<namespace>::" prefix if a
  // namespace was set when the operator was registered).
  static std::optional<std::size_t>
  getInplaceArg(const std::string &operator_name);

  // Register alias_arg_id as the inplace argument index of operator_name.
  // If a namespace is currently set the key is "<namespace>::<operator_name>".
  void registerInplaceArgId(const std::string &operator_name,
                            std::size_t alias_arg_id);
  // Set / clear the namespace prefix applied by registerInplaceArgId().
  void setNamespace(const std::string &p_namespace);
  void unsetNamespace();

private:
  // Singleton: only getInstance() can create the object, and it can be
  // neither copied nor moved.
  InplaceArgAliasMapper() = default;
  ~InplaceArgAliasMapper() = default;
  InplaceArgAliasMapper(const InplaceArgAliasMapper &) = delete;
  InplaceArgAliasMapper(InplaceArgAliasMapper &&) = delete;
  InplaceArgAliasMapper &operator=(const InplaceArgAliasMapper &) = delete;
  InplaceArgAliasMapper &operator=(InplaceArgAliasMapper &&) = delete;

  // Registered operator name -> inplace argument index.
  std::unordered_map<std::string, std::size_t> _operator_name_to_arg_id;
  // Namespace prefix currently applied to new registrations (if any).
  std::optional<std::string> _namespace;
};

// Helper whose constructor takes a registration callback and the namespace
// the registrations belong to. Instantiated as a static object by
// INPLACE_ARG_MAPPER_IMPL.
struct InplaceArgAliasMapperInit {
  InplaceArgAliasMapperInit(void (*init_mapper)(InplaceArgAliasMapper &),
                            const std::string &p_namespace);
};

// Define a block registering inplace arguments for the operators of a given
// namespace. Usage:
//
//   INPLACE_ARG_MAPPER_IMPL(my_namespace, mapper) {
//     mapper.registerInplaceArgId("my_op", 3);
//   }
//
// The block becomes the body of a static function which is run during static
// initialisation (through a static InplaceArgAliasMapperInit object) with the
// mapper's namespace set to "my_namespace".
#define INPLACE_ARG_MAPPER_IMPL(Namespace, mapper)                             \
  _INPLACE_ARG_MAPPER_IMPL(Namespace, mapper, C10_UID)

// Implementation detail of INPLACE_ARG_MAPPER_IMPL.
//
// `uid` must NOT be a direct operand of `##`: macro arguments adjacent to
// `##` are pasted without being expanded, which would yield the literal
// token `C10_UID` in the generated names instead of a unique number (and
// therefore a redefinition error if the same namespace was used twice in a
// translation unit). Going through C10_CONCATENATE forces the expansion.
#define _INPLACE_ARG_MAPPER_IMPL(Namespace, mapper, uid)                       \
  static void C10_CONCATENATE(Namespace##_init_mapper_,                        \
                              uid)(InplaceArgAliasMapper &);                   \
  static InplaceArgAliasMapperInit C10_CONCATENATE(                            \
      Namespace##_init_arg_mapper_, uid) =                                     \
      InplaceArgAliasMapperInit(                                               \
          &C10_CONCATENATE(Namespace##_init_mapper_, uid), #Namespace);        \
  static void C10_CONCATENATE(Namespace##_init_mapper_,                        \
                              uid)(InplaceArgAliasMapper &(mapper))

} // namespace poptorch

#endif // POPTORCH_DISPATCH_INPLACE_ALIAS_MAPPER_HPP_


================================================
FILE: poptorch/source/dispatch_tracer/README.md
================================================
## Dispatch tracing

Dispatch tracing is our own implementation of torch::jit::trace which allows us
to sidestep some of the constraints of that API as well as trace autograd functions.

We support two backends.

- JIT : Traces the incoming user model into normal PyTorch JIT IR first then
        canonicalises them into our PopART compatible JIT IR.
- MLIR: Traces the model directly into our PyTorch native MLIR backend. Can use
        the above mechanism internally to decompose operations into the PopART
        subset or support them directly.

RegisterAtenOverloads intercepts the initial call from PyTorch and then directs
it to whichever backend is active. A backend must provide a fallback operation,
plus a dedicated function for any overloaded PyTorch function which cannot be
"boxed" or which has unique properties that make it easier to handle explicitly.

# JIT

JIT works by using the normal PyTorch JIT API to turn the given OperatorHandle and Stack (of at::tensors/scalar/vector types) into JIT nodes. We then canonicalise that into our own IR.

Once the graph has been traced, the traced graph can be retrieved and used in our compile process as a stand-in for the graph normally produced by torch::jit::trace. Most cleanup stages are no longer required at this point.

Models can still only be traced in inference mode, with PopART optionally applying its own autograd to turn the traced inference graph into a training graph.


# MLIR

MLIR is somewhat more complex as it is able to trace more of the graph, since it uses the PyTorch autograd and gradients directly. This means it is exposed to more of PyTorch and so must handle more unexpected but legal inputs. For example, during autograd PyTorch stores variables for later processing, such as a forward input to be retrieved later in the backward pass. In some of these cases PyTorch will shallow-copy the tensor by just swapping the storage pointer; however, to our eyes it is a new tensor. So in the MLIR path we need more code to resolve tensors to values.

Other than having to faithfully lower more varied legal input than in JIT the main difference is that it has two paths to lower a node.

- It can use the JIT path to guarantee it can support at least as much as PopART and reuses our canonicalisation code to break down nodes further.
- It can directly map a torch operation onto IR without needing canonicalisation.

See CompilerDispatchTable.cpp for all the calls. The API with MLIR is generated automatically by MLIR and can be seen in the poptorch_compiler pytorch_bridge include folder.

- DirectlySupportedOps.h.inc : Maps aten operations directly onto an MLIR operation.
- PopartAPISupportedOps.h.inc: Maps aten operations onto the PopART subset via unpacking JIT arguments, just like LowerToPopart.

# Code overview

| File | Description |
| ---- | --- |
| RegisterAtenOverloads.cpp | Dispatcher point of first contact. Registers hooks with PyTorch to pick up the incoming calls. |
| ValueMapper.cpp/hpp | Handles some state/logic to help map at::Tensors onto IR values and MLIR Tensors. |
| CommonHelperFunctions.cpp/hpp | Helper functions used by JIT and MLIR backends which handle the JIT graph. |
| dispatchers | Folder containing the backend specific dispatch code. |
| Tracer.hpp | Abstract backend definition. |
| JitDispatch.hpp/cpp | Contains the implementation of the JIT backend. |
| MLIRDispatch.hpp/cpp | Contains the implementation of the MLIR backend. |
| CompilerDispatchTable.cpp | Dispatch table used by MLIR backend |

See MLIR section for details on DirectlySupportedOps/PopartAPISupportedOps.


================================================
FILE: poptorch/source/dispatch_tracer/RegisterAtenOverloads.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#include <ATen/Operators.h>
#include <ATen/core/List.h>
#include <ATen/core/function_schema.h>
#include <ATen/native/CPUFallback.h>
#include <c10/core/MemoryFormat.h>
#include <c10/core/ScalarType.h>
#include <c10/core/TensorImpl.h>
#include <torch/csrc/autograd/autograd_not_implemented_fallback.h>
#include <torch/csrc/jit/frontend/source_range.h>
#include <torch/csrc/jit/frontend/tracer.h>
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/runtime/interpreter.h>
#include <torch/types.h>

#include <memory>
#include <set>
#include <string>
#include <unordered_map>

#include "../PoptorchSymbols.hpp"
#include "../popart_canonicalization/PopartCanonicalizationUtils.hpp"
#include "CommonHelperFunctions.hpp"
#include "Tensor.hpp"
#include "poptorch/DispatchTracer.hpp"
#include "poptorch/InplaceOps.hpp"
#include "poptorch/Utils.hpp"

#include "poptorch_err/ExceptionHandling.hpp"

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "dispatchers/IDispatch.hpp"
#include "pytorch_bridge/IpuSession.hpp"

#include "dispatchers/JitDispatch.hpp"

#include "pytorch_bridge/CompilerOptions.hpp"

// The functions in this file are called via Torch's dispatcher, therefore
// we should only catch the exceptions which are not handled by
// the dispatcher.
//
// PTC(f) names the `wrap` member of the PoptorchCatchWrapperImpl
// instantiation for function `f`, with poptorch::throwPoptorchError as the
// first template argument and catch_all disabled.
#define PTC(f)                                                                 \
  PoptorchCatchWrapperImpl<poptorch::throwPoptorchError, /*catch_all=*/false,  \
                           decltype(&(f)), f>::wrap
// PTC_BOXED(f) builds a torch::CppFunction from the boxed function PTC(f).
#define PTC_BOXED(f) torch::CppFunction::makeFromBoxedFunction<PTC(f)>()

namespace poptorch {

namespace {

// Convert an IValue to a string for logging purposes. Lists are printed
// element by element (recursively), kinds we don't know how to print safely
// are replaced by "<kind>".
std::string valueToString(const c10::IValue &ivalue) {
  if (ivalue.isTensor()) {
    return str(ivalue.toTensor());
  }

  // TODO(T59880)
  // Don't rely on operator<< for everything as we're currently using
  // the XLA dispatch key but using our own Tensor type: bad things
  // might happen if upstream torch tries to print a tensor by itself.
  const bool safe_to_stream =
      ivalue.isNone() || ivalue.isScalar() || ivalue.isString() ||
      ivalue.isDevice() || ivalue.isStream() || ivalue.isObject() ||
      ivalue.isEnum();
  if (safe_to_stream) {
    std::stringstream out;
    out << ivalue;
    return out.str();
  }

  if (!ivalue.isList()) {
    return "<" + ivalue.tagKind() + ">";
  }

  std::stringstream out;
  out << ivalue.tagKind() << " [";
  std::string separator;
  for (const auto &element : ivalue.toList()) {
    out << separator << valueToString(element);
    separator = ", ";
  }
  out << "]";
  return out.str();
}

// Return true if the device type of d is IPU.
bool isIpuDevice(const c10::Device &d) {
  const c10::DeviceType device_type = d.type();
  return device_type == c10::DeviceType::IPU;
}

/*
 * The dispatchers are statically registered and called without any additional
 * context so we need a static structure to handle the initial interception.
 * Afterwards we redirect to one of the handlers to avoid keeping around too
 * much static state.
 */
struct GlobalTracerContext {
  // When we are in a live dispatch context. Used to prevent redispatch back
  // to us when we call CPU implementations and to call CPU when we are in
  // BackendSelect and out of scope.
  inline bool isDispatchOn() { return dispatch_on; }

  // Return true if a dispatcher is currently stored in the context.
  bool hasActiveDispatch() { return static_cast<bool>(_active_dispatch); }

  // Return the active dispatcher. Raises an error if there is none.
  IDispatch *activeDispatch() {
    ERROR_ON_MSG(!_active_dispatch, "There is no active dispatch");
    return _active_dispatch.get();
  }

  // Replace the active dispatcher (the previous one, if any, is destroyed).
  // Pass nullptr to leave the context without an active dispatcher.
  void resetActiveDispatch(std::unique_ptr<IDispatch> new_dispatch) {
    _active_dispatch = std::move(new_dispatch);
  }

  // Forward the current python callstack to the active dispatcher.
  // Raises an error if there is no active dispatcher.
  void updatePythonCallstack() {
    activeDispatch()->setPythonStack(torch::jit::tracer::pythonCallstack());
  }

  // Call the registered error thrower, if any. Does nothing if no thrower
  // was registered.
  void throwPoptorchError(const PoptorchErrorInfo &info) {
    if (_poptorch_error_thrower) {
      _poptorch_error_thrower(info);
    }
  }

  // A simple guard to stop us from redispatching when we are already in a
  // dispatch context.
  bool dispatch_on{false};

  // A state used to determine if the new tensors we receive from the dispatcher
  // are inputs or parameters.
  bool moving_parameters{false};

  // A state used to determine whether we are currently registering output
  // tensors for the graph (in IPUScope.outputs()). If we're not, moving
  // output tensors may result in bad data, so we warn. An example of when
  // this might happen is using torch dynamic slicing in the dispatcher
  // (instead of poptorch.dynamic_slice()).
  bool moving_outputs{false};

  // We can't make the difference between inputs and constants so for
  // now we ask the user to manually specify the input tensors.
  // We use TensorImpl* cast as void* to identify them.
  //
  // Note: these should only be used for pointer comparisons and should never
  // be dereferenced as TensorImpl objects as we don't know if they still
  // exist.
  std::set<void *> graph_inputs;

  // Create and store Tensors...
  TensorStore tensor_store;

  // Register the function called by throwPoptorchError().
  void setPoptorchErrorThrower(PoptorchErrorThrower thrower) {
    _poptorch_error_thrower = std::move(thrower);
  }

private:
  // The active dispatcher. Created once upon dispatch start.
  std::unique_ptr<IDispatch> _active_dispatch;
  // Callback used to report errors; may be empty.
  PoptorchErrorThrower _poptorch_error_thrower;
};

// The unique tracer context. It is heap allocated so that it can be released
// explicitly (see poptorchAtExit()) rather than at global destruction time.
std::unique_ptr<GlobalTracerContext> context{
    std::make_unique<GlobalTracerContext>()};

// Access the tracer context. Must not be called once the context has been
// reset.
GlobalTracerContext &getContext() {
  GlobalTracerContext *raw_context = context.get();
  return *raw_context;
}

// Down cast the 64-bit types to their 32-bit counterparts: Long tensors are
// converted to Int and Double tensors to Float. Any other tensor is returned
// as is.
at::Tensor downCastIfNeeded(const at::Tensor &t) {
  switch (t.scalar_type()) {
  case at::ScalarType::Long:
    return t.to(at::ScalarType::Int);
  case at::ScalarType::Double:
    return t.to(at::ScalarType::Float);
  default:
    return t;
  }
}

// Convert, on the host, the elements stored in [src, src_end) from
// src_scalar_type to dest_scalar_type and write them contiguously to dest.
//
// The two nested AT_DISPATCH_ALL_TYPES_AND(Half, ...) select the concrete
// element types: the outer one for the destination, the inner one for the
// source.
//
// Note: the size of the destination buffer is not checked here, and the loop
// only terminates when the source pointer becomes exactly equal to src_end,
// so [src, src_end) must contain a whole number of source elements.
// NOLINTNEXTLINE
void hostSideCast(void *dest, c10::ScalarType dest_scalar_type, void *src,
                  const void *src_end, c10::ScalarType src_scalar_type) {
  // NOLINTNEXTLINE
  AT_DISPATCH_ALL_TYPES_AND(
      at::ScalarType::Half, dest_scalar_type, "copy_", [&] {
        // Save the destination type: scalar_t is redefined by the inner
        // dispatch as the source type.
        using dest_t = scalar_t;

        // NOLINTNEXTLINE
        AT_DISPATCH_ALL_TYPES_AND(
            at::ScalarType::Half, src_scalar_type, "copy_", [&] {
              scalar_t *src_ = reinterpret_cast<scalar_t *>(src);
              dest_t *dest_ = reinterpret_cast<dest_t *>(dest);

              // TODO(T69558): use vectorised casts
              // at::vec::convert(src, dest, numel);

              // Element by element conversion until the end of the source
              // buffer is reached.
              while (reinterpret_cast<void *>(src_) != src_end) {
                *(dest_++) =
                    c10::static_cast_with_inter_type<dest_t, scalar_t>::apply(
                        *(src_++));
              }
            });
      });
}

// Ask the active dispatcher whether the given IPU tensor is a parameter.
// It is an error to call this function without an active dispatcher.
inline bool isParameter(const at::Tensor &tensor) {
  ERROR_ON(!getContext().hasActiveDispatch());
  IDispatch *dispatcher = getContext().activeDispatch();
  return dispatcher->isParameter(tensor);
}

// copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
//
// Boxed handler for aten::copy_. The arguments are popped from the stack and
// `self` is pushed back as the result (or the call is forwarded to the active
// dispatcher's fallback for IPU -> IPU copies inside a dispatch).
//
// Without an active dispatcher the copy is performed on the host side
// buffers / through the tensor store. With an active dispatcher:
// - CPU -> IPU registers src as a parameter, a graph input or a constant.
// - IPU -> IPU is forwarded to the dispatcher's fallback.
// - IPU -> CPU registers a graph output (only allowed while moving outputs).
// Any other combination is an error.
void copyInplace(const c10::OperatorHandle &op, c10::Stack *stack) {
  const c10::FunctionSchema &schema = op.schema();
  const auto num_arguments = schema.arguments().size();
  auto arguments = torch::jit::last(stack, num_arguments);

  // In an ideal world self would be allowed to change to reflect type coercion.
  // Unfortunately, pytorch's boxed function interface does not properly support
  // outputs. To work around this, if we need to re-allocate self we map both
  // the new and old values in the value mapper within the dispatcher.
  // Self is marked as const here to ensure we don't accidentally change it.
  const at::Tensor self = arguments.at(0).toTensor();
  const at::Tensor src = arguments.at(1).toTensor();

  logging::debug("[DISPATCHER] Intercepting aten::copy_");
  logging::trace("[Input] self {}", str(self));
  logging::trace("[Input] src {}", str(src));

  // In eager mode the dispatcher is always active so this will only be true
  // when working with static graphs
  if (!getContext().hasActiveDispatch()) {
    if (self.is_ipu() && src.is_cpu()) {
      logging::trace("copy_ CPU -> IPU, outside dispatch");
      // Only types which don't need to be coerced can be moved to the IPU.
      auto scalar_type = src.scalar_type();
      auto coerced_type = coerceToSupportedType(scalar_type);
      ERROR_ON_MSG(scalar_type != coerced_type,
                   "Unsupported scalar type `"
                       << scalar_type << "'. Please cast to `" << coerced_type
                       << "' before moving this tensor to the IPU.");
      getContext().tensor_store.copyFromCpu(self, src);
    } else if (self.is_cpu() && src.is_ipu()) {
      logging::trace("copy_ IPU -> CPU, outside dispatch");
      getContext().tensor_store.copyToCpu(self, src);
    } else if (self.is_ipu() && src.is_ipu()) {
      // Make sure the destination has a host side buffer to copy to.
      if (!getHostBuffer(self).hasData()) {
        getContext().tensor_store.allocateBuffer(self);
      }

      const auto &self_buffer = getHostBuffer(self).getCpuData();
      const auto &src_buffer = getHostBuffer(src).getCpuData();

      ERROR_ON(!src_buffer);

      if (self.dtype() != src.dtype()) {
        // Different types: convert the elements one by one on the host.
        logging::trace("copy_ cast from {} to {} on CPU, outside dispatch",
                       src.dtype(), self.dtype());
        hostSideCast(
            self_buffer->data(), self.scalar_type(), src_buffer->data(),
            src_buffer->data() + src_buffer->size(), src.scalar_type());
      } else {
        // Same type: plain copy of the host side buffer.
        ERROR_ON_MSG(self_buffer->size() != src_buffer->size(),
                     "Failed to copy_ outside dispatch: src and self host-side "
                     "buffer sizes are not equal.");
        *self_buffer = *src_buffer;
      }
    } else {
      ERROR("Intercepted unexpected copy_ outside dispatch: only copies "
            "between CPU, IPU tensors as well as between IPU tensors "
            "themselves are supported.");
    }

    // Replace the arguments on the stack with the result (self).
    torch::jit::drop(stack, num_arguments);
    torch::jit::push(stack, self);
    return;
  }

  getContext().updatePythonCallstack();

  if (self.is_ipu()) {
    if (src.is_cpu()) {
      std::stringstream ss;
      ss << "copy_ CPU -> IPU ";
      if (isParameter(self) || getContext().moving_parameters) {
        getContext().activeDispatch()->addParameter(downCastIfNeeded(src),
                                                    self);
        // Make sure the parameter flag is preserved.
        ss << "parameter";
      } else {
        ERROR_ON_MSG(
            src.requires_grad(),
            "An input tensor to an IPU model can not have requires_grad set "
            "to True.");

        // Tensors explicitly listed as graph inputs are added as inputs, any
        // other CPU tensor is added as a constant.
        if (getContext().graph_inputs.count(src.unsafeGetTensorImpl()) > 0) {
          getContext().activeDispatch()->addInput(downCastIfNeeded(src), self);
        } else {
          getContext().activeDispatch()->addConstant(downCastIfNeeded(src),
                                                     self);
        }
        ss << "input";
        // NOTE(review): "input" is logged for both graph inputs and
        // constants — confirm this is intended.
      }
      ss << ", new self " << str(self);
      logging::debug(ss.str().c_str());

      torch::jit::drop(stack, num_arguments);
      torch::jit::push(stack, self);
    } else {
      ERROR_ON(!src.is_ipu());
      logging::debug("copy_ IPU {} -> IPU {}", src.dtype(), self.dtype());
      // The dispatcher's fallback is responsible for updating the stack.
      getContext().activeDispatch()->fallback(op, stack);
    }
  } else {
    ERROR_ON(!self.is_cpu());
    if (src.is_ipu()) {
      ERROR_ON_MSG(!getContext().moving_outputs,
                   "Illegal move to CPU (via `.to(\"cpu\")`) when using the "
                   "dispatcher. Instead, return this output as an IPU tensor.");
      logging::debug("copy_ output IPU -> CPU");
      getContext().activeDispatch()->addOutput(src, self);

      torch::jit::drop(stack, num_arguments);
      torch::jit::push(stack, self);
    } else {
      ERROR("Unexpected tensor of type "
            << src.unsafeGetTensorImpl()->device_type()
            << ", did you forget to move a tensor to "
               "the IPU?");
    }
  }
}

} // namespace

// Mark the beginning of the parameters move: while the flag is set the
// handlers use it to tell parameters apart from other tensors.
void startParametersMove() {
  auto &ctx = getContext();
  ctx.moving_parameters = true;
}

// Mark the end of the parameters move.
void endParametersMove() {
  auto &ctx = getContext();
  ctx.moving_parameters = false;
}

// Mark the beginning of the outputs move: IPU -> CPU copies are only allowed
// by the dispatcher while this flag is set.
void startOutputsMove() {
  auto &ctx = getContext();
  ctx.moving_outputs = true;
}

// Mark the end of the outputs move.
void endOutputsMove() {
  auto &ctx = getContext();
  ctx.moving_outputs = false;
}

// Turn the dispatch on.
void startDispatch() {
  auto &ctx = getContext();
  ctx.dispatch_on = true;
}

// Register in the tracer context the function used to throw PopTorch errors.
void setPoptorchErrorThrower(PoptorchErrorThrower thrower) {
  auto &ctx = getContext();
  ctx.setPoptorchErrorThrower(std::move(thrower));
}

// Forward the error to the thrower registered in the tracer context (no-op
// if none was registered).
void throwPoptorchError(const PoptorchErrorInfo &info) {
  auto &ctx = getContext();
  ctx.throwPoptorchError(info);
}

// Turn the dispatch off. If an error occurred the dispatcher is destroyed as
// well.
void endDispatch(bool error_occurred) {
  getContext().dispatch_on = false;
  if (!error_occurred) {
    return;
  }
  // If an error occurred we need to destroy the dispatcher as it will be in
  // an inconsistent state.
  destroyDispatcher();
}

// Cleanup callback to run on exit: releases the tracer context explicitly.
void poptorchAtExit() {
  // Ensure that the context is deleted before globals are destroyed to avoid
  // issues with global destructor ordering
  context = nullptr;
}

// Destroy the active dispatcher (once we have finished compiling), turning
// the dispatch off first if it is still on.
void destroyDispatcher() {
  const bool still_dispatching = getContext().isDispatchOn();
  if (still_dispatching) {
    endDispatch();
  }
  getContext().resetActiveDispatch(nullptr);
}

// Forward the name of a parameter tensor to the active dispatcher.
void setParameterName(const at::Tensor &tensor, const std::string &name) {
  auto *dispatcher = getContext().activeDispatch();
  dispatcher->setParameterName(tensor, name);
}

// Ask the active dispatcher for the parameter name associated with a value.
std::string getParameterName(torch::jit::Value *value) {
  auto *dispatcher = getContext().activeDispatch();
  return dispatcher->getParameterName(value);
}

// Forward the per-replica settings of a parameter to the active dispatcher.
void setParameterPerReplica(const std::string &param_name,
                            const at::Tensor &tensor, int comm_group_type,
                            int shards, int variable_retrieval_mode) {
  auto *dispatcher = getContext().activeDispatch();
  dispatcher->setParameterPerReplica(param_name, tensor, comm_group_type,
                                     shards, variable_retrieval_mode);
}

// Query the active dispatcher for the per-replica settings of a value. The
// result of the dispatcher's call is returned unchanged.
bool getParameterPerReplica(torch::jit::Value *value,
                            PerReplicaSettings &settings) {
  auto *dispatcher = getContext().activeDispatch();
  return dispatcher->getParameterPerReplica(value, settings);
}
// Returns true if the current compilation is being handled using a dispatcher,
// i.e. if the tracer context holds an active dispatcher.
//
// This is needed because in some cases, we don't want calls to be dispatched to
// us, but still want to maintain information about the dispatcher.
bool isCompilingWithDispatcher() {
  const bool has_dispatcher = getContext().hasActiveDispatch();
  return has_dispatcher;
}

// Returns true if the dispatcher is currently 'on', and should intercept calls
// to us.
bool isDispatcherOn() {
  const bool dispatch_on = getContext().isDispatchOn();
  return dispatch_on;
}

// Build the compiler options: each source location exclusion string is
// appended to the dispatcher options as a vector of characters.
CompilerOptions
createMLIROptions(const std::vector<std::string> &source_location_excludes) {
  CompilerOptions options;
  auto &excludes = options.dispatcher.source_location_excludes;
  for (const std::string &exclude : source_location_excludes) {
    excludes.push_back(std::vector<char>(exclude.begin(), exclude.end()));
  }
  return options;
}

// Create the dispatcher for the requested tracing mode (only POPART is
// supported) and record the graph inputs so that they can later be told
// apart from constants.
void createGraph(TracingMode mode, const std::vector<at::Tensor> &inputs,
                 const CompilerOptions &options) {
  if (mode != TracingMode::POPART) {
    ERROR("Unsupported target");
  }

  auto &ctx = getContext();
  ctx.resetActiveDispatch(
      std::make_unique<JITDispatch>(options, &ctx.tensor_store));
  ctx.updatePythonCallstack();

  // Inputs are identified by the address of their TensorImpl.
  ctx.graph_inputs.clear();
  for (const at::Tensor &input : inputs) {
    void *input_id = reinterpret_cast<void *>(input.unsafeGetTensorImpl());
    ctx.graph_inputs.emplace(input_id);
  }
}

// Run the given operator on the CPU, using the boxed CPU fallback provided by
// ATen.
void cpuFallback(const c10::OperatorHandle &op, torch::jit::Stack *stack) {
  logging::trace("[CPU Fallback] Running {} on CPU",
                 c10::toString(op.operator_name()));

  // Call the actual boxed CPU fallback.
  at::native::cpu_fallback(op, stack);
}

// Generic handler: forward the operator to the active dispatcher's fallback,
// tracing the content of the stack before and after the call.
void fallback(const c10::OperatorHandle &op, c10::Stack *stack) {
  const c10::FunctionSchema &schema = op.schema();
  logging::debug("[DISPATCHER] Intercepting {} ", schema);

  auto &ctx = getContext();
  ctx.updatePythonCallstack();

  // Before the call the stack contains the inputs...
  for (const auto &input : *stack) {
    logging::trace("[Input {}] {}", schema.name(), valueToString(input));
  }

  ctx.activeDispatch()->fallback(op, stack);

  // ... and afterwards whatever the dispatcher left on it.
  for (const auto &output : *stack) {
    logging::trace("[Output {}] {}", schema.name(), valueToString(output));
  }
}

// Finalize and return the inplace graph info of the active JIT dispatcher.
// It is an error if the active dispatcher is not a JITDispatch.
InplaceGraphInfo getInplaceGraphInfo(size_t num_anchors,
                                     bool replicas_needing_broadcast) {
  IDispatch *active = getContext().activeDispatch();
  auto *jit = dynamic_cast<JITDispatch *>(active);
  ERROR_ON_MSG(jit == nullptr, "[User Unreachable] Tracer context is null.");

  return jit->finalizeInplaceGraphInfo(num_anchors, replicas_needing_broadcast);
}

std::shared_ptr<torch::jit::Graph> getTracedGraph() {
  auto *jit = dynamic_cast<JITDispatch *>(getContext().activeDispatch());
  ERROR_ON_MSG(jit == nullptr, "[User Unreachable] Tracer context is null.");

  // Build a list of nodes marked for deletion.
  std::unordered_set<torch::jit::Node *> to_delete;
  for (torch::jit::Node *node : jit->graph->nodes()) {
    if (isMarkedForDeletion(node)) {
      to_delete.insert(node);
    }
  }

  // Remove the dead nodes.
  searchAndPossiblyDestroy(to_delete);

  // Return the real graph because popart_compiler will call
  // getDataSourceForValue() on some of these nodes and if we
  // clone the graph we won't be able to find the mappings.
  return jit->graph;
}

// Ask the active dispatcher to finalize its graph.
void finalizeGraph() {
  auto *dispatcher = getContext().activeDispatch();
  dispatcher->finalizeGraph();
}

// Return a pointer to the data of the host side buffer of the given tensor.
void *getDataSource(const at::Tensor &tensor) {
  const auto &cpu_data = getHostBuffer(tensor).getCpuData();
  return cpu_data->data();
}

// Ask the active dispatcher for the data source associated with a value.
void *getDataSourceForValue(torch::jit::Value *value) {
  auto *dispatcher = getContext().activeDispatch();
  return dispatcher->getDataSource(value);
}

// Ask the active dispatcher whether the given value is a parameter.
bool isParameter(torch::jit::Value *value) {
  auto *dispatcher = getContext().activeDispatch();
  return dispatcher->isParameter(value);
}

// This is the function called by Torch to trigger an IPU to Host
// sync: we forward it to the CPU backend which will then issue
// some copy_ calls between IPU and CPU tensors instead.
at::Scalar localScalarDense(const at::Tensor &self) {
  using CpuFallback =
      at::native::call_fallback_fn<&poptorch::cpuFallback,
                                   ATEN_OP(_local_scalar_dense)>;

  logging::trace("Sync to CPU");
  return CpuFallback::call(self);
}

// Handler for aten::item: always reports an error as this operation is not
// supported in a static graph.
at::Scalar item(const at::Tensor &self) {
  ERROR("aten::item is only supported in eager mode, but was intercepted in "
        "a static graph. This means an IPU to CPU copy was triggered before "
        "the end of the graph, for example by calling tensor.item(). "
        "Please ensure that any such copies are removed.");

  // NOTE(review): presumably unreachable as ERROR is expected to throw; it
  // also gives the function a return statement — confirm.
  return at::native::call_fallback_fn<&poptorch::cpuFallback,
                                      ATEN_OP(item)>::call(self);
}

// Common implementation of the empty tensor handlers. `device` must be set:
// CPU tensors are created by the native CPU implementation, IPU tensors are
// allocated from the tensor store and, if there is an active dispatcher,
// registered with it.
at::Tensor
emptyBase(at::IntArrayRef size,
          c10::optional<at::ScalarType> dtype = c10::nullopt,
          c10::optional<at::Layout> layout = c10::nullopt,
          c10::optional<at::Device> device = c10::nullopt,
          c10::optional<bool> pin_memory = c10::nullopt,
          c10::optional<at::MemoryFormat> memory_format = c10::nullopt) {
  ERROR_ON(!device); // Internal error: shouldn't happen

  if (!isIpuDevice(*device)) {
    // Native calls are a dispatch endpoint so will not be redispatched.
    at::Tensor cpu_tensor = at::native::empty_cpu(size, dtype, layout, device,
                                                  pin_memory, memory_format);
    return cpu_tensor;
  }

  auto &ctx = getContext();

  // We use the device ID to determine if a tensor is a parameter
  // (device 1) or not (device 0) but in reality all the tensors
  // currently live on the same IPU so always use the default IPU.
  at::Tensor ipu_tensor = ctx.tensor_store.allocateTensor(
      size, dtype, nullptr, deviceOrDefaultIpu({}));

  // TODO(T61576) Find a better way to identify parameters and buffers.
  if (ctx.hasActiveDispatch()) {
    ctx.updatePythonCallstack();
    ctx.activeDispatch()->registerEmptyTensor(ipu_tensor,
                                              ctx.moving_parameters);
  }
  return ipu_tensor;
}

// Handler for IPU empty tensors (aten::empty.memory_format): this means the
// returned tensor must be an IPU tensor. If no device is provided the default
// IPU is used.
at::Tensor emptyMemoryFormat(
    at::IntArrayRef size, c10::optional<at::ScalarType> dtype = c10::nullopt,
    c10::optional<at::Layout> layout = c10::nullopt,
    c10::optional<at::Device> device = c10::nullopt,
    c10::optional<bool> pin_memory = c10::nullopt,
    c10::optional<at::MemoryFormat> memory_format = c10::nullopt) {
  const c10::Device target_device = deviceOrDefaultIpu(device);

  logging::debug(
      "[DISPATCHER] Intercepting aten::empty.memory_format, device {}",
      target_device.str());

  return poptorch::emptyBase(size, dtype, layout, target_device, pin_memory,
                             memory_format);
}

// func: empty_strided(int[] size, int[] stride, *, ScalarType? dtype=None,
// Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
//
// Only IPU devices and default (contiguous) strides are supported: anything
// else is reported as an error.
at::Tensor emptyStrided(at::IntArrayRef size, at::IntArrayRef stride,
                        c10::optional<at::ScalarType> dtype = c10::nullopt,
                        c10::optional<at::Layout> layout = c10::nullopt,
                        c10::optional<at::Device> device = c10::nullopt,
                        c10::optional<bool> pin_memory = c10::nullopt) {
  ERROR_ON(!device); // Internal error: shouldn't happen
  ERROR_ON(!isIpuDevice(*device));

  logging::debug("[DISPATCHER] Intercepting aten::empty_strided, device {}",
                 device->str());

  // The strides must be the ones of a contiguous tensor of that size.
  ERROR_ON(at::detail::defaultStrides(size) != stride);

  at::Tensor output = emptyBase(size, dtype, layout, device, pin_memory);
  return output;
}

// Handler for the matrix norm with a numerical order: everything except the
// 2-norm (ord == +/-2) is forwarded to the native implementation.
at::Tensor linalgMatrixNorm(const at::Tensor &self, const at::Scalar &ord,
                            at::IntArrayRef dim, bool keepdim,
                            c10::optional<at::ScalarType> dtype) {
  const double magnitude = std::abs(ord.toDouble());
  const bool is_two_norm = (magnitude == 2.);
  if (is_two_norm) {
    // The 2-norm is defined as the largest (for +2) or smallest (for -2)
    // singular value of the matrix.
    ERROR("Matrix 2-norm is not supported.");
  }
  // As long as we're not dealing with a 2-norm, we can call the
  // operator as usual, which will redispatch the constituent operations
  return at::native::linalg_matrix_norm(self, ord, dim, keepdim, dtype);
}

// Handler for the matrix norm with a string order: everything except the
// nuclear norm ("nuc") is forwarded to the native implementation.
at::Tensor linalgMatrixNormStrOrd(const at::Tensor &self, c10::string_view ord,
                                  at::IntArrayRef dim, bool keepdim,
                                  c10::optional<at::ScalarType> dtype) {
  const bool is_nuclear_norm = !(ord != "nuc");
  if (is_nuclear_norm) {
    // The nuclear norm is defined as the sum of singular values of the matrix.
    ERROR("Matrix nuclear norm is not supported.");
  }
  // As long as we're not dealing with a nuclear norm, we can call the
  // operator as usual, which will redispatch the constituent operations
  return at::native::linalg_matrix_norm(self, ord, dim, keepdim, dtype);
}

// aten::detach(Tensor(a) self) -> (Tensor(a))
//
// Boxed handler for detach. With an active dispatch the work is delegated to
// it; otherwise the single tensor argument is shallow-copied and detached
// here, and the result replaces the arguments on the stack.
void detach(const c10::OperatorHandle &op, c10::Stack *stack) {
  logging::debug("[DISPATCHER] Intercepting aten::detach");

  if (!getContext().hasActiveDispatch()) {
    const auto num_arguments = op.schema().arguments().size();
    const auto arguments = torch::jit::last(stack, num_arguments);

    ERROR_ON(arguments.size() != 1);
    const at::Tensor input = arguments.front().toTensor();
    auto *input_impl = input.unsafeGetTensorImpl();

    // The detached copy keeps the version counter of the input.
    const at::Tensor out(input_impl->shallow_copy_and_detach(
        /*version_counter=*/input_impl->version_counter(),
        /*allow_tensor_metadata_change=*/true));

    torch::jit::drop(stack, num_arguments);
    torch::jit::push(stack, out);
    return;
  }

  getContext().updatePythonCallstack();

  // Let the active dispatch perform the shallow copy and detach.
  getContext().activeDispatch()->detach(op, stack,
                                        getContext().moving_parameters);
}

// NOTE: This gets called by _weight_norm's handler, if certain conditions are
// met. However, those conditions never used to be met, and so we never had to
// implement this handler. Now we do, so for now just emulate the old behaviour.
//
// Boxed handler: reads (v, g, dim) from the stack, removes them and pushes two
// results: `v * (g / norm_except_dim(v, 2, dim))` and a placeholder tensor.
void weightNormInterface(const c10::OperatorHandle &op, c10::Stack *stack) {
  const auto arg_count = op.schema().arguments().size();
  auto args = torch::jit::last(stack, arg_count);

  // Take copies of the arguments before they are dropped from the stack.
  const at::Tensor v = args.at(0).toTensor();
  const at::Tensor g = args.at(1).toTensor();
  const std::int64_t dim = args.at(2).toInt();

  torch::jit::drop(stack, arg_count);

  const auto v_norm = at::norm_except_dim(v, 2, dim);
  const auto weight = v * (g / v_norm);
  torch::jit::push(stack, weight);

  // Strictly speaking the schema of `_weight_norm_interface` returns a
  // (Tensor, Tensor); in its sole usage in `_weight_norm`, only the first
  // member is used, so just return something empty of the right shape.
  torch::jit::push(stack, at::empty_like(g));
}

// Forward a value replacement (v_old -> v_new) to the active dispatch.
// Does nothing when no dispatch is currently active.
void replaceValueDispatcher(torch::jit::Value *v_old,
                            torch::jit::Value *v_new) {
  if (getContext().hasActiveDispatch()) {
    getContext().activeDispatch()->replaceValue(v_old, v_new);
  }
}

// Return the id that ipuTensorId() reports for `tensor`.
// Raises an error if `tensor` is not an IPU tensor.
std::uint64_t getIpuTensorId(const at::Tensor &tensor) {
  ERROR_ON_MSG(!isIpuTensor(tensor),
               "You may only call getIpuTensorId on an IPU tensor");
  return ipuTensorId(tensor);
}

} // namespace poptorch

/*
  The actual dispatcher part. Overriding these keys causes most operations to
  fall through to our fallback catchers.
*/

// Catch-all for the IPU dispatch key: any op without a dedicated kernel is
// routed to the boxed poptorch::fallback handler.
TORCH_LIBRARY_IMPL(_, IPU, m) { m.fallback(PTC_BOXED(poptorch::fallback)); }

// Same catch-all for the AutogradIPU dispatch key.
TORCH_LIBRARY_IMPL(_, AutogradIPU, m) {
  m.fallback(PTC_BOXED(poptorch::fallback));
}

/*
  There are two kinds of PyTorch ops: the ones that require registration
  (and a backend-specific kernel) and the ones that are optional. If optional
  ops are not registered they get decomposed into several required ops that must
  then be intercepted by the backend provider. More information on this can be
  found at https://pytorch.org/tutorials/advanced/extend_dispatcher.html.

  In essence:
    - required ops have 'dispatch' set to TRUE and 'default' set to FALSE in
      RegistrationDeclarations.h
    - optional ops have 'dispatch' set to FALSE or 'default' set to TRUE in
      RegistrationDeclarations.h

  RegisterOptionalAtenOps.cpp.inc registers the optional ops that our backend
  intercepts.
  RegisterMetaOps.cpp.inc registers the meta implementations of operations
  that are used for type inference
*/
#include "RegisterMetaOps.cpp.inc"
#include "RegisterOptionalAtenOps.cpp.inc"

// These cannot be intercepted using the non-autograd key unless
// torch.inference_mode is used
TORCH_LIBRARY_IMPL(aten, AutogradIPU, m) {
  // This is required to intercept detach calls when moving parameters to the
  // IPU.
  // (Boxed registration: poptorch::detach operates directly on the stack.)
  m.impl("detach", PTC_BOXED(poptorch::detach));

  // These must be intercepted at the autograd level otherwise they'll go
  // through fallback
  // (Unboxed registrations: the handlers take typed arguments and reject the
  // 2-norm / nuclear norm respectively.)
  m.impl("linalg_matrix_norm", PTC(poptorch::linalgMatrixNorm));
  m.impl("linalg_matrix_norm.str_ord", PTC(poptorch::linalgMatrixNormStrOrd));
}

// Remove the arguments of `op` from the back of `stack`.
// The number of entries removed is the argument count of the op's schema; it
// is an error for the stack to hold fewer entries than that.
void popArgumentsFromStack(const c10::OperatorHandle &op, c10::Stack *stack) {
  ERROR_ON(op.schema().arguments().size() > stack->size());
  const auto num_arguments = op.schema().arguments().size();
  torch::jit::drop(stack, num_arguments);
}

// Append every value in `results`, in order, to the back of `stack`.
void pushResultsToStack(c10::Stack *stack,
                        const std::vector<c10::IValue> &results) {
  for (const c10::IValue &result : results) {
    stack->push_back(result);
  }
}

// Replace the arguments of `op` at the back of `stack` with `results`.
// With the default (empty) `results` the arguments are simply removed.
void updateStack(const c10::OperatorHandle &op, c10::Stack *stack,
                 const std::vector<c10::IValue> &results = {}) {
  popArgumentsFromStack(op, stack);
  if (results.empty()) {
    return;
  }
  pushResultsToStack(stack, results);
}

// Return a copy of the n-th (0-based) argument of `op` held on `stack`.
// The arguments occupy the last `schema().arguments().size()` stack slots;
// the bounds-checked `at()` rejects an index past the end of the stack.
c10::IValue getNthArgument(const c10::OperatorHandle &op, c10::Stack *stack,
                           size_t n) {
  ERROR_ON(op.schema().arguments().size() > stack->size());
  const size_t num_arguments = op.schema().arguments().size();
  const size_t first_argument = stack->size() - num_arguments;
  return stack->at(first_argument + n);
}

// Boxed implementation for ops whose result is simply their first argument:
// the arguments are popped and the first one is pushed back as the result.
void opReturningFirstArgument(const c10::OperatorHandle &op,
                              c10::Stack *stack) {
  // Copy the argument out before the stack is modified.
  const c10::IValue first_argument = getNthArgument(op, stack, 0);
  updateStack(op, stack, {first_argument});
}

// Boxed implementation for ops that produce no outputs.
// When the dispatcher is on the op goes through poptorch::fallback;
// otherwise its arguments are just removed from the stack.
void opWithoutOutputs(const c10::OperatorHandle &op, c10::Stack *stack) {
  if (!poptorch::isDispatcherOn()) {
    updateStack(op, stack);
    return;
  }
  poptorch::fallback(op, stack);
}

// Boxed implementation of poptorch::call_cpu_op.
// The op is first handled like any other op without outputs; if the
// dispatcher is on at that point, the dispatch is then ended.
void callCpuOp(const c10::OperatorHandle &op, c10::Stack *stack) {
  opWithoutOutputs(op, stack);

  if (poptorch::isDispatcherOn()) {
    poptorch::endDispatch();
  }
}

// Boxed implementation of poptorch::end_cpu_op.
// When compiling with the dispatcher, the dispatch is started again and the
// op is sent through poptorch::fallback. In every case the op's first
// argument is finally returned as its result.
void endCpuOp(const c10::OperatorHandle &op, c10::Stack *stack) {
  // This op might have been called as part of a CPU model in which case we
  // don't want to re-start the dispatcher.
  if (poptorch::isCompilingWithDispatcher()) {
    poptorch::startDispatch();
    poptorch::fallback(op, stack);
  }

  opReturningFirstArgument(op, stack);
}

// CPU implementation of poptorch::internal_cast.
//
// A request for "FLOAT16" or "FLOAT32" converts the tensor to float32; any
// other requested type returns the tensor unchanged.
at::Tensor castOp(const at::Tensor &tensor, const std::string &type) {
  // A half request is mapped to float32 rather than ignored: ignoring the
  // type would only be correct if the tensor was already float32.
  //
  // Consider:
  /* MyTensor = MyTensor.as(INT8)

    MyTensor = MyTensor.half() # Convert to half.

    out = conv(MyTensor) # This would be an illegal INT8 convolution.
  */
  const bool wants_floating_point = (type == "FLOAT16") || (type == "FLOAT32");
  return wants_floating_point ? tensor.to(at::ScalarType::Float) : tensor;
}

// c10::List<at::Tensor>
// customOperation(c10::List<at::Tensor> inputs,
//                 std::string name, std::string domain,
//                 int64_t version, int64_t num_outputs,
//                 c10::List<at::Tensor> example_outputs,
//                 std::string attributes_map_id) {
//   return example_outputs;
//  }
//
// Boxed CPU implementation of poptorch::custom_operation: the op's result is
// its `example_outputs` argument, returned as-is.
void customOperation(const c10::OperatorHandle &op, c10::Stack *stack) {
  // 0-based position of `example_outputs` in the argument list above.
  constexpr size_t example_outputs_index = 5;
  const c10::IValue example_outputs =
      getNthArgument(op, stack, example_outputs_index);
  updateStack(op, stack, {example_outputs});
}

// dynamic_slice(Tensor self, int dim, Tensor start, int size, int step) ->
// Tensor
//
// CPU implementation: reads the first element of `start` (int64, int32 or
// int16; any other dtype is an error) and returns
// `at::slice(self, dim, start, start + size, step)`.
at::Tensor dynamicSlice(const at::Tensor &self, int64_t dim,
                        const at::Tensor &start, int64_t size, int64_t step) {
  std::int64_t first = 0;
  switch (start.scalar_type()) {
  case torch::kInt64:
    first = start.data_ptr<std::int64_t>()[0];
    break;
  case torch::kInt32:
    first = start.data_ptr<std::int32_t>()[0];
    break;
  case torch::kInt16:
    first = start.data_ptr<std::int16_t>()[0];
    break;
  default:
    ERROR("Expected integer typed start tensor");
  }

  return at::slice(self, dim, {first}, {first + size}, step);
}

// dynamic_update(Tensor self, Tensor src, int dim, Tensor start, int size) ->
// Tensor
//
// CPU implementation: reads the first element of `start` (int64, int32 or
// int16; any other dtype is an error) and returns
// `at::slice_scatter(self, src, dim, start, start + size, 1)`.
at::Tensor dynamicUpdate(const at::Tensor &self, const at::Tensor &src,
                         int64_t dim, const at::Tensor &start, int64_t size) {
  std::int64_t first = 0;
  switch (start.scalar_type()) {
  case torch::kInt64:
    first = start.data_ptr<std::int64_t>()[0];
    break;
  case torch::kInt32:
    first = start.data_ptr<std::int32_t>()[0];
    break;
  case torch::kInt16:
    first = start.data_ptr<std::int16_t>()[0];
    break;
  default:
    ERROR("Expected integer typed start tensor");
  }

  return at::slice_scatter(self, src, dim, first, first + size, 1);
}

// CPU implementation of poptorch::ctc_beam_search_decoder.
//
// No decoding is performed: it returns three zero-filled tensors whose shapes
// are derived from the 3D `log_probs` input (dimension 0 is used as the input
// length, dimension 1 as the batch size) and from `top_paths`:
//   - path probabilities: (batch, top_paths)
//   - path lengths:       (batch, top_paths)
//   - decoded paths:      (batch, top_paths, input length)
std::tuple<at::Tensor, at::Tensor, at::Tensor>
ctcBeamSearchDecoder(const at::Tensor &log_probs,
                     const at::Tensor & /*lengths*/, int64_t /*blank*/,
                     int64_t /*width*/, int64_t top_paths) {
  ERROR_ON_MSG(log_probs.sizes().size() != 3,
               "Incorrect shape for first input to CTC beam search decoder.");
  const unsigned input_length = log_probs.sizes()[0];
  const unsigned batch = log_probs.sizes()[1];

  const at::Tensor probabilities = at::zeros({batch, top_paths});
  const at::Tensor lengths = at::zeros({batch, top_paths});
  const at::Tensor paths = at::zeros({batch, top_paths, input_length});

  return {probabilities, lengths, paths};
}

// at::Tensor identityLoss(const at::Tensor &t, int64_t reduction)
//
// CPU implementation of poptorch::identity_loss. `reduction` selects the
// result: 0 -> sum of `t`, 1 -> mean of `t`, 2 -> a clone of `t`.
// Any other value is an error.
at::Tensor identityLoss(const at::Tensor &t, int64_t reduction) {
  constexpr int64_t reduce_sum = 0;
  constexpr int64_t reduce_mean = 1;
  constexpr int64_t reduce_none = 2;

  if (reduction == reduce_sum) {
    return at::sum(t);
  }
  if (reduction == reduce_mean) {
    return at::mean(t);
  }
  if (reduction == reduce_none) {
    return t.clone();
  }
  ERROR("reduction must be sum (0), mean (1) or none (2)");
}

// Expands to the two arguments of `m.def(...)` for an op without outputs:
// a schema built from `signature` with CONSERVATIVE alias analysis, and the
// boxed `opWithoutOutputs` handler as its implementation.
#define OP_WITHOUT_OUTPUTS(signature)                                          \
  torch::schema(signature, c10::AliasAnalysisKind::CONSERVATIVE),              \
      PTC_BOXED(opWithoutOutputs)

// Declares the schemas of all ops in the `poptorch::` namespace. Ops defined
// through OP_WITHOUT_OUTPUTS (and the two cpu_op ops at the end) also get
// their implementation here; the other ops are only declared in this block.
TORCH_LIBRARY(poptorch, m) {
  // These operations have no outputs, and so are registered with side-effects
  // to prevent being pruned by dead-code elimination
  m.def(OP_WITHOUT_OUTPUTS(
      "begin_ipu_block(int stage_id, int phase_id, int ipu_id) -> ()"));
  m.def(OP_WITHOUT_OUTPUTS("end_ipu_block() -> ()"));
  m.def(OP_WITHOUT_OUTPUTS("start_for_loop(Tensor[] inputs) -> ()"));

  m.def(OP_WITHOUT_OUTPUTS("start_if_block(Tensor condition) -> ()"));
  m.def(OP_WITHOUT_OUTPUTS("start_else_block(Tensor[] outputs_true) -> ()"));

  m.def(
      OP_WITHOUT_OUTPUTS("optimizer_group(int group, Tensor[] inputs) -> ()"));
  m.def(OP_WITHOUT_OUTPUTS("begin_multi_conv() -> ()"));
  m.def(OP_WITHOUT_OUTPUTS(
      "end_multi_conv(float[]? "
      "available_memory_proportions, int[]? partials_types, int? "
      "plan_type, int? per_conv_reserved_tiles, float? "
      "cycle_back_off, int[]? enableConvDithering) -> ()"));
  m.def(OP_WITHOUT_OUTPUTS("push_name_scope(str name) -> ()"));
  m.def(OP_WITHOUT_OUTPUTS("pop_name_scope() -> ()"));
  m.def(OP_WITHOUT_OUTPUTS(
      "set_attribute(str attribute, str key, str value) -> ()"));
  m.def(OP_WITHOUT_OUTPUTS("clear_attribute(str attribute, str key) -> ()"));

  // Operations returning the first argument
  m.def("ipu_print_tensor(Tensor self, str title, int print_gradient, int "
        "summarize_threshold, int edge_items, int max_line_width, int digits, "
        "int float_format, str separator, str open_bracket, str close_bracket) "
        "-> Tensor");
  m.def("nop(Tensor self) -> Tensor");
  m.def("end_for_loop(Tensor[] outputs, Tensor[] "
        "inputs, int trip_count) -> Tensor[]");
  m.def("end_if_block(Tensor[] outputs, Tensor condition) -> Tensor[]");

  m.def("set_matmul_serialization(Tensor matmul, str "
        "mode, int factor, bool keep_precision) -> Tensor");
  m.def("set_overlap_for_input(Tensor t, str mode) -> Tensor");
  m.def("set_overlap_for_output(Tensor t, str mode) -> Tensor");
  m.def("recomputation_checkpoint(Tensor self) -> Tensor");
  m.def("set_available_memory(Tensor t, float mem) -> Tensor");

  // Operations with their own (non pass-through) CPU implementations
  m.def("custom_operation(Tensor[] inputs, str name, str domain, int "
        "domain_version, int num_outputs, Tensor[] outputs, str attributes) -> "
        "Tensor[]");
  m.def("ctc_beam_search_decoder(Tensor probs, "
        "Tensor lengths, int blank, int beam_width, int "
        "top_paths) -> (Tensor, Tensor, Tensor)");
  m.def("dynamic_slice(Tensor self, int dim, Tensor start, int size, int step) "
        "-> Tensor");
  m.def("dynamic_update(Tensor self, Tensor src, int dim, Tensor start, int "
        "size) "
        "-> Tensor");
  m.def("identity_loss(Tensor x, int reduction) -> Tensor");
  m.def("internal_cast(Tensor self, str dtype) -> Tensor");

  // call_cpu_op and end_cpu_op are special cases because they must
  // immediately switch the dispatcher on/off so the default poptorch
  // fallback cannot be used. They are also registered with side-effects
  // to ensure they are not reintercepted during constexpr evaluation
  m.def(torch::schema("end_cpu_op(Tensor[] output) -> Tensor[]",
                      c10::AliasAnalysisKind::CONSERVATIVE),
        PTC_BOXED(endCpuOp));
  m.def(torch::schema("call_cpu_op(Tensor[] inputs, str name) -> ()",
                      c10::AliasAnalysisKind::CONSERVATIVE),
        PTC_BOXED(callCpuOp));
  // Schema-only declarations; no CPU implementation is registered for these
  // in the CPU block of this file.
  m.def("fps(Tensor src, "
        "int[] ptr, float ratio, bool random_start) -> Tensor");
  m.def("nearest(Tensor x, Tensor y, "
        "Tensor batch_x, Tensor batch_y) -> Tensor");
  m.def("nearest_batch_list(Tensor x, Tensor y, "
        "int[] batch_x, int[] batch_y) -> Tensor");
}

// CPU kernels for the `poptorch::` ops declared in TORCH_LIBRARY(poptorch).
// PTC_BOXED handlers work on the operand stack; PTC handlers are regular
// typed functions.
TORCH_LIBRARY_IMPL(poptorch, CPU, m) {
  // Operations returning the first argument
  m.impl("end_for_loop", PTC_BOXED(opReturningFirstArgument));
  m.impl("end_if_block", PTC_BOXED(opReturningFirstArgument));
  m.impl("ipu_print_tensor", PTC_BOXED(opReturningFirstArgument));
  m.impl("nop", PTC_BOXED(opReturningFirstArgument));
  m.impl("recomputation_checkpoint", PTC_BOXED(opReturningFirstArgument));
  m.impl("set_available_memory", PTC_BOXED(opReturningFirstArgument));
  m.impl("set_matmul_serialization", PTC_BOXED(opReturningFirstArgument));
  m.impl("set_overlap_for_input", PTC_BOXED(opReturningFirstArgument));
  m.impl("set_overlap_for_output", PTC_BOXED(opReturningFirstArgument));

  // Operations with their own CPU implementations
  m.impl("ctc_beam_search_decoder", PTC(ctcBeamSearchDecoder));
  m.impl("custom_operation", PTC_BOXED(customOperation));
  m.impl("dynamic_slice", PTC(dynamicSlice));
  m.impl("dynamic_update", PTC(dynamicUpdate));
  m.impl("identity_loss", PTC(identityLoss));
  m.impl("internal_cast", PTC(castOp));
}

// By default, if we don't register anything for autograd, the outputs of
// `poptorch::` ops will have no `grad_fn` (making them leaves). For PopART it's
// not inherently an issue since PopART does its own thing in the backward pass.
// However, PyTorch will error if you put the output of one of these ops through
// an inplace op: `a leaf Variable that requires grad is being used in an
// in-place operation.`
//
// The JIT trace will have the `grad_fn`s filled with whatever the previous
// `grad_fn` of the input was, so this isn't an issue.
//
// Note: Presumably, for non-PopART backends these will need to have
// implementations (`torch::autograd::Function` subclasses).
// Registers torch's autogradNotImplementedFallback for `poptorch::` ops under
// the AutogradIPU key.
// NOTE(review): `dynamic_update` is declared in TORCH_LIBRARY(poptorch) but,
// unlike `dynamic_slice`, has no entry below - confirm whether that is
// intentional.
TORCH_LIBRARY_IMPL(poptorch, AutogradIPU, m) {
  m.impl("begin_ipu_block", torch::autograd::autogradNotImplementedFallback());
  m.impl("end_ipu_block", torch::autograd::autogradNotImplementedFallback());
  m.impl("ipu_print_tensor", torch::autograd::autogradNotImplementedFallback());
  m.impl("internal_cast", torch::autograd::autogradNotImplementedFallback());
  m.impl("nop", torch::autograd::autogradNotImplementedFallback());
  m.impl("dynamic_slice", torch::autograd::autogradNotImplementedFallback());
  m.impl("custom_operation", torch::autograd::autogradNotImplementedFallback());
  m.impl("ctc_beam_search_decoder",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("identity_loss", torch::autograd::autogradNotImplementedFallback());
  m.impl("start_for_loop", torch::autograd::autogradNotImplementedFallback());
  m.impl("end_for_loop", torch::autograd::autogradNotImplementedFallback());

  m.impl("start_if_block", torch::autograd::autogradNotImplementedFallback());
  m.impl("start_else_block", torch::autograd::autogradNotImplementedFallback());
  m.impl("end_if_block", torch::autograd::autogradNotImplementedFallback());
  m.impl("fps", torch::autograd::autogradNotImplementedFallback());
  m.impl("nearest", torch::autograd::autogradNotImplementedFallback());
  m.impl("nearest_batch_list",
         torch::autograd::autogradNotImplementedFallback());

  m.impl("optimizer_group", torch::autograd::autogradNotImplementedFallback());
  m.impl("set_matmul_serialization",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("set_overlap_for_input",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("set_overlap_for_output",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("recomputation_checkpoint",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("set_available_memory",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("begin_multi_conv", torch::autograd::autogradNotImplementedFallback());
  m.impl("end_multi_conv", torch::autograd::autogradNotImplementedFallback());
  m.impl("push_name_scope", torch::autograd::autogradNotImplementedFallback());
  m.impl("pop_name_scope", torch::autograd::autogradNotImplementedFallback());
  m.impl("end_cpu_op", torch::autograd::autogradNotImplementedFallback());
  m.impl("call_cpu_op", torch::autograd::autogradNotImplementedFallback());
  m.impl("set_attribute", torch::autograd::autogradNotImplementedFallback());
  m.impl("clear_attribute", torch::autograd::autogradNotImplementedFallback());
}


================================================
FILE: poptorch/source/dispatch_tracer/RegisterMetaOps.cpp.inc
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include <cmath>
#include <ATen/MetaFunctions.h>

namespace poptorch::meta {

// Meta (shape/dtype inference) implementation for the NLL loss forward pass.
// Returns an (output, total_weight) pair of meta tensors with the dtype of
// `self`; `total_weight` is always a scalar.
std::tuple<at::Tensor,at::Tensor> nllLoss2dForward(const at::Tensor &self,
                                                      const at::Tensor &/*target*/,
                                                      const c10::optional<at::Tensor> &/*weight*/,
                                                      int64_t reduction,
                                                      int64_t /*ignore_index*/) {
  // With no reduction (reduction == 0), the output shape is the input shape
  // without the number of classes, which is the second element, i.e.
  // (N, C, ...) becomes (N, ...), except for a 1D input (C) where it is ().
  // With any other reduction the output is a scalar.
  std::vector<std::int64_t> output_shape;
  if (reduction == 0) {
    const auto input_sizes = self.sizes();
    output_shape.assign(input_sizes.begin(), input_sizes.end());
    if (output_shape.size() == 1) {
      output_shape.clear();
    } else {
      std::vector<std::int64_t> &shape = output_shape;
      ERROR_ON(shape.size() < 2);
      shape.erase(shape.begin() + 1);
    }
  }
  const auto dtype = self.scalar_type();
  at::Tensor output = at::meta::empty(output_shape, dtype);
  at::Tensor total_weight = at::meta::empty({}, dtype);
  return {output, total_weight};
}

// Meta implementation for CTC loss with lengths given as integer lists.
// With no reduction (reduction == 0) and a 3D `log_probs`, the result has one
// element per entry of `log_probs`' second dimension; otherwise it is a
// scalar. The dtype is that of `log_probs`.
at::Tensor ctcLoss(const at::Tensor &log_probs, const at::Tensor &/*targets*/,
                      at::IntArrayRef /*input_lengths*/, at::IntArrayRef /*target_lengths*/,
                      int64_t /*blank*/, int64_t reduction, bool /*zero_infinity*/) {
  const bool one_loss_per_batch_entry =
      reduction == 0 && log_probs.sizes().size() == 3;

  std::vector<std::int64_t> shape;
  if (one_loss_per_batch_entry) {
    shape.push_back(log_probs.sizes()[1]);
  }
  return at::meta::empty(shape, log_probs.scalar_type());
}

// Meta implementation for bincount.
// The output is 1D with exactly `minlength` elements, so `minlength` must be
// positive. Its dtype is that of `weights` when given, Int otherwise.
at::Tensor bincount(const at::Tensor &, const c10::optional<at::Tensor> & weights, int64_t minlength) {

  ERROR_ON_MSG(minlength <= 0, "Bincount `minlength` must be specified and must be a constant. "
                                "On the IPU MK2 platform the minimum length is also the "
                                "maximum length");

  const auto output_dtype =
      weights ? weights->scalar_type() : c10::ScalarType::Int;
  return at::meta::empty({minlength}, output_dtype);
}

// Meta implementation for the out-variant of bincount: validates `minlength`
// (which must be positive) and returns the caller-provided `out` untouched.
at::Tensor & bincountOut(const at::Tensor &, const c10::optional<at::Tensor> &, int64_t minlength, at::Tensor & out) {
  ERROR_ON_MSG(minlength <= 0, "Bincount `minlength` must be specified and must be a constant. "
                              "On the IPU MK2 platform the minimum length is also the "
                              "maximum length");

  return out;
}

// Meta implementation for bucketize.
// The output has the shape of `self`; its dtype is Int when `out_int32` is
// set and Long otherwise. The boundaries tensor and `right` do not affect it.
TORCH_API at::Tensor bucketize(const at::Tensor & self, const at::Tensor &, bool out_int32=false, bool right=false) {
  UNUSED(right);
  UNUSED(out_int32);

  const auto index_dtype =
      out_int32 ? c10::ScalarType::Int : c10::ScalarType::Long;
  return at::meta::empty(self.sizes().vec(), index_dtype);
}

// Meta implementation for the out-variant of hardsigmoid: returns the
// caller-provided `out` tensor untouched.
TORCH_API at::Tensor& hardsigmoidOut(at::Tensor const&, at::Tensor& out) {
  return out;
}

// Meta implementation for the out-variant of silu: returns the
// caller-provided `out` tensor untouched.
TORCH_API at::Tensor& siluOut(at::Tensor const&, at::Tensor& out) {
  return out;
}

// Meta implementation for the out-variant of bucketize: returns the
// caller-provided `out` tensor untouched.
TORCH_API at::Tensor & bucketizeOut(const at::Tensor & , const at::Tensor & , bool , bool , at::Tensor & out) {
  return out;
}

// Meta implementation for equal: the tensors are never inspected and the
// result is always `false`.
TORCH_API bool equal(const at::Tensor &, const at::Tensor &) {
  return false;
}

// Computes, for every row of `pos`, the linear index of the voxel it falls
// into, for a regular grid with per-dimension voxel extent `size`.
//
// `pos` is flattened to 2D (first dimension kept). `optional_start` and
// `optional_end` bound the grid; when absent they default to the
// per-dimension minimum / maximum of `pos`. `size`, and `start` / `end` when
// provided, must have one element per column of the flattened `pos`.
torch::Tensor grid(torch::Tensor pos, torch::Tensor size,
                       torch::optional<torch::Tensor> optional_start,
                       torch::optional<torch::Tensor> optional_end) {

  pos = pos.view({pos.size(0), -1});

  ERROR_ON_MSG(size.numel() != pos.size(1), "grid: size.numel() == pos.size(1)");

  if (!optional_start.has_value()) {
    optional_start = std::get<0>(pos.min(0));
  } else {
    ERROR_ON_MSG(optional_start.value().numel() != pos.size(1), "grid: optional_start.value().numel() == pos.size(1)");
  }

  if (!optional_end.has_value()) {
    optional_end = std::get<0>(pos.max(0));
  } else {
    // Validate `end` itself (this used to re-check `start` by mistake, so a
    // wrongly sized `end` went undetected).
    ERROR_ON_MSG(optional_end.value().numel() != pos.size(1), "grid: optional_end.value().numel() == pos.size(1)");
  }

  auto start = optional_start.value();
  auto end = optional_end.value();

  // Make positions relative to the grid origin.
  pos = pos - start.unsqueeze(0);

  // Number of voxels along each dimension, turned into the strides used to
  // linearise a multi-dimensional voxel coordinate: [1, n0, n0*n1, ...].
  auto num_voxels = (end - start).true_divide(size).toType(torch::kLong) + 1;
  num_voxels = num_voxels.cumprod(0);
  num_voxels =
      torch::cat({torch::ones({1}, num_voxels.options()), num_voxels}, 0);
  num_voxels = num_voxels.narrow(0, 0, size.size(0));

  // Per-dimension voxel coordinate, weighted by its stride and summed.
  auto out = pos.true_divide(size.view({1, -1})).toType(torch::kLong);
  out *= num_voxels.view({1, -1});
  out = out.sum(1);

  return out;
}

// Meta implementation for CTC loss with lengths given as tensors.
// The dtype of the result is that of `log_probs`.
at::Tensor ctcLossTensor(const at::Tensor &log_probs, const at::Tensor &/*targets*/,
                            const at::Tensor &/*input_lengths*/, const at::Tensor &/*target_lengths*/,
                            int64_t /*blank*/, int64_t reduction, bool /*zero_infinity*/) {
  // Without reduction there is one loss per batch entry; from the docs, the
  // batch size is `log_probs`' second dimension if it's 3D. In every other
  // case the result is a scalar.
  const bool one_loss_per_batch_entry =
      reduction == 0 && log_probs.sizes().size() == 3;

  std::vector<std::int64_t> shape;
  if (one_loss_per_batch_entry) {
    shape.push_back(log_probs.sizes()[1]);
  }
  return at::meta::empty(shape, log_probs.scalar_type());
}

// Meta implementation for median over all elements: a scalar with the dtype
// of `self`.
at::Tensor median(const at::Tensor &self) {
  return at::meta::empty({}, self.scalar_type());
}

// Meta implementation for median along a dimension.
// Returns (values, indices): both have the shape of `self` with `dim` either
// set to 1 (`keepdim`) or removed. `values` has the dtype of `self`,
// `indices` is Long. A negative `dim` counts from the last dimension.
std::tuple<at::Tensor,at::Tensor> medianDim(const at::Tensor &self,
                                            int64_t dim, bool keepdim) {
  std::vector<std::int64_t> reduced_shape = self.sizes().vec();
  if (dim < 0) {
    dim += static_cast<int64_t>(self.sizes().size());
  }

  // A 0-d input has no dimension to reduce and stays a scalar.
  if (!reduced_shape.empty()) {
    if (keepdim) {
      reduced_shape[dim] = 1;
    } else {
      reduced_shape.erase(reduced_shape.begin() + dim);
    }
  }

  return {at::meta::empty(reduced_shape, self.scalar_type()),
          at::meta::empty(reduced_shape, c10::ScalarType::Long)};
}

// Meta implementation for count_nonzero.
// With no `dim` the output has shape {1}; otherwise it is the shape of `self`
// with every listed dimension removed (negative dimensions count from the
// end; duplicates are an error). The output uses the dtype of `self`.
// NOTE(review): torch.count_nonzero is documented as returning int64 counts -
// confirm that using the input dtype here is intended.
at::Tensor countNonzero(const at::Tensor &self, at::IntArrayRef dim) {
  std::vector<std::int64_t> shape = {1};
  if (dim.size() > 0) {
    const size_t rank = self.sizes().size();

    auto sorted_dims = dim.vec();
    for (auto &d : sorted_dims) {
      d = d < 0 ? d + rank : d;
    }
    // Descending order: erasing a dimension then never shifts the position of
    // the ones still to be erased, and duplicates end up adjacent.
    std::sort(sorted_dims.begin(), sorted_dims.end(), std::greater<>{});

    ERROR_ON_MSG(std::adjacent_find(sorted_dims.begin(), sorted_dims.end()) != sorted_dims.end(),
                  "The dimensions to count must be unique");

    shape = self.sizes().vec();
    for (const auto d : sorted_dims) {
      shape.erase(shape.begin() + d);
    }
  }

  return at::meta::empty(shape, self.scalar_type());
}

// Meta implementation for one_hot.
// The output is the shape of `self` with a trailing dimension of
// `num_classes` elements and the dtype of `self`. `num_classes` must be given
// explicitly (-1 is rejected).
at::Tensor oneHot(const at::Tensor &self, int64_t num_classes) {
  ERROR_ON_MSG(num_classes == -1, "OneHot num classes must be specified and must be constant.");

  const auto input_sizes = self.sizes();
  std::vector<std::int64_t> output_shape;
  output_shape.reserve(input_sizes.size() + 1);
  output_shape.assign(input_sizes.begin(), input_sizes.end());
  output_shape.push_back(num_classes);
  return at::meta::empty(output_shape, self.scalar_type());
}

// Meta implementation for nearest-neighbour upsampling.
//
// Exactly one of `output_size` / `scale_factors` must be provided. The
// trailing dimensions of `input` are replaced either by `output_size`, or by
// the corresponding input sizes multiplied by `scale_factors` (truncated to
// an integer); the leading dimensions are kept. The output has the dtype of
// `input`.
at::Tensor upsampleNearest3d(const at::Tensor &input, at::OptionalSymIntArrayRef output_size,
                              c10::optional<at::ArrayRef<double>> scale_factors) {
  ERROR_ON_MSG(!scale_factors && !output_size,
               "Must specify either output_size or scale_factors, but not both.");
  const auto input_shape = input.sizes().vec();
  std::vector<int64_t> actual_output_size;
  if (output_size.has_value()) {
    ERROR_ON_MSG(scale_factors,
                 "Must specify either output_size or scale_factors, but not both.");
    actual_output_size.reserve(output_size->size());
    for (auto i : output_size.value()) {
      actual_output_size.push_back(i.as_int_unchecked());
    }
  }
  else if (scale_factors.has_value()) {
    // Check before doing iterator arithmetic: stepping back from end() by
    // more than the number of input dimensions would be undefined behaviour.
    ERROR_ON_MSG(scale_factors->size() > input_shape.size(),
                 "The number of scale factors (" << scale_factors->size() <<
                 ") must not exceed the number of dimensions of the input ("
                 << input_shape.size() << ")");
    actual_output_size.reserve(scale_factors->size());
    // Pair each scale factor with the matching trailing input dimension.
    std::transform(scale_factors->begin(), scale_factors->end(),
                    input_shape.end() - scale_factors->size(),
                    std::back_inserter(actual_output_size),
                    [](double sf, std::int64_t shape) {
                        return static_cast<int64_t>(static_cast<double>(shape) * sf);
                    });
  }

  ERROR_ON_MSG(actual_output_size.size() > input_shape.size(),
              "The number of dimensions of the input (" + std::to_string(input_shape.size()) +
              ") must be more than the number of dimensions in the output (" +
              std::to_string(actual_output_size.size()) + ")");

  // Leading (non-resized) dimensions followed by the resized ones.
  std::vector<std::int64_t> shape(input_shape.begin(), input_shape.end() - actual_output_size.size());
  shape.insert(shape.end(), actual_output_size.begin(), actual_output_size.end());
  return at::meta::empty(shape, input.scalar_type());
}

// Meta implementation for max pooling.
//
// The input rank must exceed the number of kernel dimensions by 1 or 2; the
// leading 1 or 2 dimensions are kept and each pooled dimension becomes
//   (in + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1
// rounded up when `ceil_mode` is set and down otherwise. `stride`, `padding`
// and `dilation` must each have one entry per kernel dimension.
at::Tensor maxPool3d(const at::Tensor &self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode) {
  std::vector<std::int64_t> input_shape = self.sizes().vec();

  ERROR_ON_MSG(input_shape.size() != kernel_size.size() + 1 &&
                input_shape.size() != kernel_size.size() + 2,
                "The kernel size (" << kernel_size.size() <<
                ") must be 1 or 2 less than the input rank ("
                << input_shape.size() << ")");
  ERROR_ON(kernel_size.size() != stride.size());
  ERROR_ON(kernel_size.size() != padding.size());
  ERROR_ON(kernel_size.size() != dilation.size());

  // Index of the first pooled dimension.
  const size_t offset = (input_shape.size() == kernel_size.size() + 1) ? 1 : 2;

  for (size_t axis = 0; axis < kernel_size.size(); ++axis) {
    const double extent = (input_shape[offset + axis] + 2. * padding[axis] - dilation[axis] * (kernel_size[axis] - 1.) - 1.) / stride[axis] + 1.;
    input_shape[offset + axis] = ceil_mode ? std::ceil(extent) : std::floor(extent);
  }
  return at::meta::empty(input_shape, self.scalar_type());
}

// Meta implementation for nonzero: always raises an error, because the output
// shape would depend on the tensor values.
at::Tensor nonzero(const at::Tensor &) {
  ERROR("Operations using aten::nonzero are unsupported because "
        "the output shape is determined by the tensor values. "
        "The IPU cannot support dynamic output shapes.");
}

// torch_scatter
// Meta implementation shared by the scatter min/max ops.
//
// The output shape is that of `out` when given; otherwise it is the shape of
// `src` with dimension `dim` set to `dim_size`. At least one of the two must
// be provided, and when both are they must agree on dimension `dim`.
// Returns (output, argminmax): `output` has the dtype of `src`, `argminmax`
// is Long, and both share the same shape.
std::tuple<at::Tensor, at::Tensor> scatterMinMax(at::Tensor src,
                                                 at::Tensor /*index*/,
                                                 int64_t dim,
                                                 c10::optional<at::Tensor> out,
                                                 c10::optional<int64_t> dim_size) {
  std::vector<std::int64_t> out_shape = src.sizes().vec();

  // A negative `dim` counts from the last dimension of `src`.
  if (dim < 0) {
    dim += static_cast<int64_t>(out_shape.size());
  }

  if (!out && !dim_size) {
    ERROR("You must provide either an output parameter or specify dim_size so the output shape may be inferred");
  }

  if (out) {
    out_shape = out->sizes().vec();
  } else {
    out_shape[dim] = *dim_size;
  }

  if (dim_size) {
    ERROR_ON_MSG(*dim_size != out_shape[dim], "dim_size = " << *dim_size << " expected to be the same as out.shape()[dim] = " << out_shape[dim] << ", dim = " << dim );
  }

  return {at::meta::empty(out_shape, src.scalar_type()),
          at::meta::empty(out_shape, c10::ScalarType::Long)};
}

// Meta implementation for scatter mul: same output shape/dtype rules as
// scatterMinMax, of which only the first result is returned.
at::Tensor scatterMul(at::Tensor src,at::Tensor index, int64_t dim, c10::optional<at::Tensor> out, c10::optional<int64_t> dim_size) {
  return std::get<0>(scatterMinMax(src, index, dim, out, dim_size));
}

// torch_cluster
// Meta implementation for poptorch::fps.
//
// `src` must be 2D and `ratio` must lie in (0.0, 1.0]. `ptr` holds offsets
// delimiting consecutive segments of `src` and must have between 2 and
// src.size(0) + 1 entries. The output is a 1D Int tensor whose length is the
// sum over all segments of ceil(segment_length * ratio).
at::Tensor fps(const torch::Tensor &src, const std::vector<std::int64_t> &ptr,
               double ratio, bool /*random_start*/) {
  const auto dim = src.dim();
  const auto ptr_size = ptr.size();
  const auto src_size = src.size(0);
  ERROR_ON_MSG(ratio <= 0.0 || ratio > 1.0,
               "`ratio` (" << ratio << ") has to be in range (0.0, 1.0]");

  ERROR_ON_MSG(dim != 2,
               "`src` is supposed to be 2d Tensor, while it has " << dim
                                                                  << " dims");
  // The message states the range actually enforced by the condition.
  ERROR_ON_MSG(
      ptr_size < 2 || ptr_size > static_cast<size_t>(src_size) + 1,
      "`ptr` length (" << ptr_size
                       << ") is supposed to be in range [2, src.size(0) + 1]"
                       << " (src.size(0) = " << src_size << ")");

  std::int64_t out_size = 0;
  for (size_t i = 1; i < ptr_size; i++) {
    out_size += std::ceil(static_cast<float>(ptr[i] - ptr[i - 1]) * ratio);
  }

  return at::meta::empty({out_size}, c10::ScalarType::Int);
}

// torch_spline_conv
// Meta implementation for the spline basis op.
//
// Reads the first two dimensions of `pseudo` as (E, D) and returns
// (basis, weight_index), both of shape (E, (degree + 1)^D). `basis` has the
// dtype of `pseudo`, `weight_index` is Int.
// NOTE(review): `pseudo` is assumed to have at least two dimensions; this is
// not checked here.
std::tuple<at::Tensor, at::Tensor> splineBasis(at::Tensor pseudo,
                                                 at::Tensor /*kernel_size*/,
                                                 at::Tensor /*is_open_spline*/,
                                                 int64_t degree) {
  const std::vector<std::int64_t> pseudo_shape = pseudo.sizes().vec();
  const std::int64_t num_edges = pseudo_shape[0];
  const std::int64_t num_dims = pseudo_shape[1];
  // + 0.5 so that the truncating conversion rounds to the nearest integer.
  const std::int64_t num_splines = std::pow(degree + 1, num_dims) + 0.5;
  const std::vector<std::int64_t> result_shape({num_edges, num_splines});

  return {at::meta::empty(result_shape, pseudo.scalar_type()),
          at::meta::empty(result_shape, c10::ScalarType::Int)};
}

// Meta implementation for the spline weighting op.
// The output has shape (input.shape[0], weight.shape[2]) and the dtype of
// `input`.
// NOTE(review): `weight` is assumed to have at least three dimensions; this
// is not checked here.
at::Tensor splineWeighting(at::Tensor input,
                           at::Tensor weight,
                           at::Tensor /*basis*/,
                           at::Tensor /*weight_index*/) {
  const std::vector<std::int64_t> input_shape = input.sizes().vec();
  const std::vector<std::int64_t> weight_shape = weight.sizes().vec();
  const std::vector<std::int64_t> result_shape(
      {input_shape[0], weight_shape[2]});

  return at::meta::empty(result_shape, input.scalar_type());
}

// Meta implementation for poptorch::nearest: a 1D Int tensor with one element
// per entry of the first dimension of `x`.
at::Tensor nearest(const torch::Tensor &x, const torch::Tensor &,
                   const torch::Tensor &, const torch::Tensor &) {
  return at::meta::empty({x.sizes().front()}, c10::ScalarType::Int);
}

// Meta implementation for poptorch::nearest_batch_list: a 1D Int tensor with
// one element per entry of the first dimension of `x`.
at::Tensor nearest_batch_list(const torch::Tensor &x, const torch::Tensor &,
                              const std::vector<std::int64_t> &,
                              const std::vector<std::int64_t> &) {
  return at::meta::empty({x.sizes().front()}, c10::ScalarType::Int);
}

// poptorch

// dynamic_slice(Tensor self, int dim, Tensor start, int size, int step) -> Tensor
//
// Meta implementation: the output has the shape of `self` with dimension
// `dim` replaced by ceil(size / step), and the dtype of `self`. `start` does
// not influence the shape. A negative `dim` counts from the last dimension;
// `step` must be positive.
at::Tensor dynamicSlice(const at::Tensor &self, int64_t dim, const at::Tensor &/*start*/,
                        int64_t size, int64_t step) {
  auto shape = self.sizes().vec();
  const auto rank = static_cast<std::int64_t>(shape.size());

  // Reject out-of-range dims instead of indexing `shape` out of bounds.
  ERROR_ON_MSG(dim < -rank || dim >= rank,
               "dynamic_slice: dim (" << dim << ") is out of range for a "
               "tensor of rank " << rank);
  // A zero step would divide by zero below.
  ERROR_ON_MSG(step <= 0,
               "dynamic_slice: step (" << step << ") must be positive");

  if (dim < 0) {
    dim += rank;
  }
  shape[dim] = (size + (step - 1)) / step;
  return at::meta::empty(shape, self.scalar_type());
}

// dynamic_update(Tensor self, Tensor src, int dim, Tensor start, int size, int step) -> Tensor
// NOTE(review): the schema comment above lists an `int step` argument that the
// C++ signature below does not take -- confirm against the registered schema.
//
// Shape inference: the result has exactly the shape and element type of
// `self`; no other argument is consulted.
at::Tensor dynamicUpdate(const at::Tensor &self, const at::Tensor & /*src*/,
                         int64_t /*dim*/, const at::Tensor & /*start*/,
                         int64_t /*size*/) {
  const auto result_shape = self.sizes().vec();
  const auto result_type = self.scalar_type();
  return at::meta::empty(result_shape, result_type);
}

// custom_operation(Tensor[] inputs, str name, str domain, int domain_version, int num_outputs, Tensor(a!)[] outputs, str attributes) -> Tensor(a!)[]
//
// Shape inference: for every tensor in `outputs`, produce a tensor via
// at::meta::empty with the same sizes and element type. All the other
// arguments are ignored.
std::vector<at::Tensor> customOperation(const std::vector<at::Tensor> &/*inputs*/,
                                        const std::string &/*name*/,
                                        const std::string &/*domain*/,
                                        int64_t /*domain_version*/,
                                        int64_t /*num_outputs*/,
                                        const std::vector<at::Tensor> &outputs,
                                        const std::string &/*attributes*/) {
  std::vector<at::Tensor> inferred;
  for (std::size_t idx = 0; idx < outputs.size(); ++idx) {
    const at::Tensor &example = outputs[idx];
    inferred.push_back(at::meta::empty(example.sizes(), example.scalar_type()));
  }
  return inferred;
}

// Meta implementation of the out-variant of tanh_backward: the caller-provided
// `grad_input` is returned untouched; the two input tensors are ignored.
at::Tensor &tanh_backward_out(const at::Tensor & /*unused*/,
                              const at::Tensor & /*unused*/,
                              at::Tensor &grad_input) {
  return grad_input;
}

// Meta implementation of native_layer_norm_backward.
//
// Returns (grad_input, grad_weight, grad_bias) placeholders:
//  - grad_input has the shape of `input` when output_mask[0] is set;
//  - grad_weight has the shape of `weight` when output_mask[1] is set and a
//    defined weight was provided;
//  - grad_bias has the shape of `bias` when output_mask[2] is set and a
//    defined bias was provided.
// Any output that is masked out (or whose tensor is missing) gets an empty
// shape. The element type of each gradient is the one of the matching tensor
// when it is present and defined, otherwise the one of `input`.
std::tuple<at::Tensor, at::Tensor, at::Tensor> native_layer_norm_backward(
    const at::Tensor&,
    const at::Tensor& input,
    at::IntArrayRef ,
    const at::Tensor&,
    const at::Tensor&,
    const c10::optional<at::Tensor>& weight,
    const c10::optional<at::Tensor>& bias,
    ::std::array<bool, 3> output_mask) {
  // Only dereference an optional once we know it holds a defined tensor. The
  // previous code tested `weight->defined()` while handling `bias`, which
  // dereferenced an empty optional when only a bias was supplied.
  const bool has_weight = weight.has_value() && weight->defined();
  const bool has_bias = bias.has_value() && bias->defined();

  std::vector<int64_t> grad_input_shape;
  if (output_mask[0]) {
    grad_input_shape = input.sizes().vec();
  }

  std::vector<int64_t> grad_weight_shape;
  if (output_mask[1] && has_weight) {
    grad_weight_shape = weight->sizes().vec();
  }
  const auto grad_weight_type =
      has_weight ? weight->scalar_type() : input.scalar_type();

  std::vector<int64_t> grad_bias_shape;
  if (output_mask[2] && has_bias) {
    grad_bias_shape = bias->sizes().vec();
  }
  const auto grad_bias_type =
      has_bias ? bias->scalar_type() : input.scalar_type();

  return {at::meta::empty(grad_input_shape, input.scalar_type()),
          at::meta::empty(grad_weight_shape, grad_weight_type),
          at::meta::empty(grad_bias_shape, grad_bias_type)};
}

// ctc_beam_search_decoder(Tensor probs, Tensor lengths, int blank, int beam_width, int top_paths) -> (Tensor, Tensor, Tensor)
//
// Shape inference: `probs` must be rank 3; with d0 = probs.sizes()[0] and
// d1 = probs.sizes()[1], the outputs are
//   [d1, top_paths], [d1, top_paths] and [d1, top_paths, d0],
// all with the element type of `probs`.
// NOTE(review): the first two outputs are the very same tensor object --
// confirm that sharing it (and its dtype) is intended for the second output.
std::tuple<at::Tensor, at::Tensor, at::Tensor>
ctcBeamSearchDecoder(const at::Tensor &probs, const at::Tensor &/*lengths*/,
                      int64_t /*blank*/, int64_t /*beam_width*/, int64_t top_paths) {
  const auto probs_dims = probs.sizes();
  ERROR_ON_MSG(probs_dims.size() != 3,
               "Input probablities tensor must be rank-3 for "
               "`ctc_beam_search_decoder`.");
  const auto input_size = probs_dims[0];
  const auto batch_size = probs_dims[1];
  const auto elem_type = probs.scalar_type();

  auto out_probs = at::meta::empty({batch_size, top_paths}, elem_type);
  auto out_paths =
      at::meta::empty({batch_size, top_paths, input_size}, elem_type);
  return {out_probs, out_probs, out_paths};
}

// identity_loss(Tensor x, str reduction) -> Tensor
// NOTE(review): the schema comment says `str reduction` but this function
// receives an integer code -- confirm against the registered schema.
//
// Shape inference: reduction codes 0 (sum) and 1 (mean) yield an empty shape,
// code 2 (none) keeps the shape of `x`; any other code is an error. The
// element type is always that of `x`.
at::Tensor identityLoss(const at::Tensor &x, int64_t reduction) {
  constexpr int64_t sum = 0;
  constexpr int64_t mean = 1;
  constexpr int64_t none = 2;

  std::vector<int64_t> out_shape;
  if (reduction == none) {
    out_shape = x.sizes().vec();
  } else if (reduction != sum && reduction != mean) {
    ERROR("reduction must be sum (0), mean (1) or none (2)");
  }
  return at::meta::empty(out_shape, x.scalar_type());
}

// Boxed handler for ops that produce nothing: every value on the stack
// (i.e. all the arguments) is dropped, leaving the stack empty.
void opWithoutOutputs(const c10::OperatorHandle & /*op*/, c10::Stack *stack) {
  c10::Stack &values = *stack;
  values.clear();
}

// Boxed handler for ops whose result is their first argument: everything
// after the first stack entry is erased so only that entry remains.
void opReturningFirstArgument(const c10::OperatorHandle & /*op*/,
                              c10::Stack *stack) {
  const auto first_extra = stack->begin() + 1;
  stack->erase(first_extra, stack->end());
}
} // namespace poptorch::meta

// Register implementations for a set of aten operators under the Meta
// dispatch key. Each entry maps an operator name (including its overload
// suffix where there is one) to a function from poptorch::meta, wrapped with
// the PTC / PTC_BOXED macros (defined elsewhere).
TORCH_LIBRARY_IMPL(aten, Meta, m) {
  m.impl("bincount", PTC(poptorch::meta::bincount));
  m.impl("bincount.out", PTC(poptorch::meta::bincountOut));
  m.impl("bucketize.Tensor", PTC(poptorch::meta::bucketize));
  m.impl("bucketize.Tensor_out", PTC(poptorch::meta::bucketizeOut));
  m.impl("equal", PTC(poptorch::meta::equal));
  m.impl("hardsigmoid.out", PTC(poptorch::meta::hardsigmoidOut));
  m.impl("rrelu_with_noise", PTC_BOXED(poptorch::meta::opReturningFirstArgument));
  m.impl("count_nonzero.dim_IntList", PTC(poptorch::meta::countNonzero));
  m.impl("ctc_loss.Tensor", PTC(poptorch::meta::ctcLossTensor));
  m.impl("ctc_loss.IntList", PTC(poptorch::meta::ctcLoss));
  m.impl("max_pool3d", PTC(poptorch::meta::maxPool3d));
  m.impl("median", PTC(poptorch::meta::median));
  m.impl("median.dim", PTC(poptorch::meta::medianDim));
  m.impl("nll_loss2d_forward", PTC(poptorch::meta::nllLoss2dForward));
  m.impl("nonzero", PTC(poptorch::meta::nonzero));
  m.impl("one_hot", PTC(poptorch::meta::oneHot));
  m.impl("silu.out", PTC(poptorch::meta::siluOut));
  m.impl("upsample_nearest3d.vec", PTC(poptorch::meta::upsampleNearest3d));
  m.impl("tanh_backward.grad_input", PTC(poptorch::meta::tanh_backward_out));
  m.impl("native_layer_norm_backward", PTC(poptorch::meta::native_layer_norm_backward));
}


// Meta implementations for the torch_scatter operators handled here.
// scatter_max and scatter_min share the same function (scatterMinMax).
TORCH_LIBRARY_IMPL(torch_scatter, Meta, m) {
  m.impl("scatter_max", PTC(poptorch::meta::scatterMinMax));
  m.impl("scatter_min", PTC(poptorch::meta::scatterMinMax));
  m.impl("scatter_mul", PTC(poptorch::meta::scatterMul));
}

// Meta implementation for the torch_cluster "grid" operator.
TORCH_LIBRARY_IMPL(torch_cluster, Meta, m) {
  m.impl("grid", PTC(poptorch::meta::grid));
}

// Meta implementations for the torch_spline_conv operators handled here.
TORCH_LIBRARY_IMPL(torch_spline_conv, Meta, m) {
  m.impl("spline_basis", PTC(poptorch::meta::splineBasis));
  m.impl("spline_weighting", PTC(poptorch::meta::splineWeighting));
}

// Meta implementations for the operators of the poptorch library. The
// registrations fall into three groups, depending on the handler used.
TORCH_LIBRARY_IMPL(poptorch, Meta, m) {
  // Ops registered with opWithoutOutputs (which clears the stack).
  m.impl("push_name_scope", PTC_BOXED(poptorch::meta::opWithoutOutputs));
  m.impl("pop_name_scope", PTC_BOXED(poptorch::meta::opWithoutOutputs));
  m.impl("begin_ipu_block", PTC_BOXED(poptorch::meta::opWithoutOutputs));
  m.impl("end_ipu_block", PTC_BOXED(poptorch::meta::opWithoutOutputs));
  m.impl("start_for_loop", PTC_BOXED(poptorch::meta::opWithoutOutputs));
  m.impl("start_if_block", PTC_BOXED(poptorch::meta::opWithoutOutputs));
  m.impl("start_else_block", PTC_BOXED(poptorch::meta::opWithoutOutputs));
  m.impl("optimizer_group", PTC_BOXED(poptorch::meta::opWithoutOutputs));
  m.impl("call_cpu_op", PTC_BOXED(poptorch::meta::opWithoutOutputs));
  m.impl("set_attribute", PTC_BOXED(poptorch::meta::opWithoutOutputs));
  m.impl("clear_attribute", PTC_BOXED(poptorch::meta::opWithoutOutputs));
  m.impl("begin_multi_conv", PTC_BOXED(poptorch::meta::opWithoutOutputs));
  m.impl("end_multi_conv", PTC_BOXED(poptorch::meta::opWithoutOutputs));

  // Ops registered with opReturningFirstArgument (which keeps only the first
  // stack entry).
  m.impl("end_cpu_op", PTC_BOXED(poptorch::meta::opReturningFirstArgument));
  m.impl("end_for_loop", PTC_BOXED(poptorch::meta::opReturningFirstArgument));
  m.impl("end_if_block", PTC_BOXED(poptorch::meta::opReturningFirstArgument));
  m.impl("internal_cast", PTC_BOXED(poptorch::meta::opReturningFirstArgument));
  m.impl("ipu_print_tensor", PTC_BOXED(poptorch::meta::opReturningFirstArgument));
  m.impl("nop", PTC_BOXED(poptorch::meta::opReturningFirstArgument));
  m.impl("recomputation_checkpoint", PTC_BOXED(poptorch::meta::opReturningFirstArgument));
  m.impl("set_available_memory", PTC_BOXED(poptorch::meta::opReturningFirstArgument));
  m.impl("set_matmul_serialization", PTC_BOXED(poptorch::meta::opReturningFirstArgument));
  m.impl("set_overlap_for_input", PTC_BOXED(poptorch::meta::opReturningFirstArgument));
  m.impl("set_overlap_for_output", PTC_BOXED(poptorch::meta::opReturningFirstArgument));

  // Ops with a dedicated shape inference function.
  m.impl("ctc_beam_search_decoder", PTC(poptorch::meta::ctcBeamSearchDecoder));
  m.impl("custom_operation", PTC(poptorch::meta::customOperation));
  m.impl("dynamic_slice", PTC(poptorch::meta::dynamicSlice));
  m.impl("dynamic_update", PTC(poptorch::meta::dynamicUpdate));
  m.impl("identity_loss", PTC(poptorch::meta::identityLoss));
  m.impl("fps", PTC(poptorch::meta::fps));
  m.impl("nearest", PTC(poptorch::meta::nearest));
  m.impl("nearest_batch_list", PTC(poptorch::meta::nearest_batch_list));
}

// Register torch::autograd::autogradNotImplementedFallback() under the
// AutogradMeta dispatch key for the poptorch operators listed below.
TORCH_LIBRARY_IMPL(poptorch, AutogradMeta, m) {
  m.impl("begin_ipu_block", torch::autograd::autogradNotImplementedFallback());
  m.impl("end_ipu_block", torch::autograd::autogradNotImplementedFallback());
  m.impl("ipu_print_tensor", torch::autograd::autogradNotImplementedFallback());
  m.impl("internal_cast", torch::autograd::autogradNotImplementedFallback());
  m.impl("nop", torch::autograd::autogradNotImplementedFallback());
  m.impl("dynamic_slice", torch::autograd::autogradNotImplementedFallback());
  m.impl("dynamic_update", torch::autograd::autogradNotImplementedFallback());
  m.impl("custom_operation", torch::autograd::autogradNotImplementedFallback());
  m.impl("ctc_beam_search_decoder",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("identity_loss", torch::autograd::autogradNotImplementedFallback());
  m.impl("start_for_loop", torch::autograd::autogradNotImplementedFallback());
  m.impl("end_for_loop", torch::autograd::autogradNotImplementedFallback());
  m.impl("start_if_block", torch::autograd::autogradNotImplementedFallback());
  m.impl("start_else_block", torch::autograd::autogradNotImplementedFallback());
  m.impl("end_if_block", torch::autograd::autogradNotImplementedFallback());

  m.impl("optimizer_group", torch::autograd::autogradNotImplementedFallback());
  m.impl("set_matmul_serialization",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("set_overlap_for_input",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("set_overlap_for_output",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("recomputation_checkpoint",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("set_available_memory",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("begin_multi_conv", torch::autograd::autogradNotImplementedFallback());
  m.impl("end_multi_conv", torch::autograd::autogradNotImplementedFallback());
  m.impl("push_name_scope", torch::autograd::autogradNotImplementedFallback());
  m.impl("pop_name_scope", torch::autograd::autogradNotImplementedFallback());
  m.impl("end_cpu_op", torch::autograd::autogradNotImplementedFallback());
  m.impl("call_cpu_op", torch::autograd::autogradNotImplementedFallback());
  m.impl("set_attribute", torch::autograd::autogradNotImplementedFallback());
  m.impl("clear_attribute", torch::autograd::autogradNotImplementedFallback());
  m.impl("fps", torch::autograd::autogradNotImplementedFallback());
  m.impl("nearest", torch::autograd::autogradNotImplementedFallback());
  m.impl("nearest_batch_list", torch::autograd::autogradNotImplementedFallback());
}

// For some reason these operations are first dispatched to AutogradMeta,
// so we ignore that key and allow them to pass through to Meta by registering
// autogradNotImplementedFallback() for each of them.
TORCH_LIBRARY_IMPL(aten, AutogradMeta, m) {
  m.impl("ctc_loss.Tensor", torch::autograd::autogradNotImplementedFallback());
  m.impl("ctc_loss.IntList", torch::autograd::autogradNotImplementedFallback());
  m.impl("max_pool3d", torch::autograd::autogradNotImplementedFallback());
  m.impl("one_hot", torch::autograd::autogradNotImplementedFallback());
  m.impl("bucketize.Tensor", torch::autograd::autogradNotImplementedFallback());
  m.impl("bucketize.Tensor_out", torch::autograd::autogradNotImplementedFallback());
  m.impl("bucketize.Scalar", torch::autograd::autogradNotImplementedFallback());
}

// Register autogradNotImplementedFallback() under the AutogradMeta key for
// the torch_scatter operators handled here.
TORCH_LIBRARY_IMPL(torch_scatter, AutogradMeta, m) {
  m.impl("scatter_max", torch::autograd::autogradNotImplementedFallback());
  m.impl("scatter_min", torch::autograd::autogradNotImplementedFallback());
  m.impl("scatter_mul", torch::autograd::autogradNotImplementedFallback());
}

// Register autogradNotImplementedFallback() under the AutogradMeta key for
// the torch_cluster "grid" operator.
TORCH_LIBRARY_IMPL(torch_cluster, AutogradMeta, m) {
  m.impl("grid", torch::autograd::autogradNotImplementedFallback());
}

// Register autogradNotImplementedFallback() under the AutogradMeta key for
// the torch_spline_conv operators handled here.
TORCH_LIBRARY_IMPL(torch_spline_conv, AutogradMeta, m) {
  m.impl("spline_basis", torch::autograd::autogradNotImplementedFallback());
  m.impl("spline_weighting", torch::autograd::autogradNotImplementedFallback());
}


================================================
FILE: poptorch/source/dispatch_tracer/RegisterOptionalAtenOps.cpp.inc
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.

// Register handlers for aten operators under the IPU dispatch key. Most
// entries route the operator to poptorch::fallback; a few use dedicated
// handlers (copyInplace, detach, localScalarDense, item, emptyMemoryFormat,
// emptyStrided, weightNormInterface). The comments inside the block give the
// reason each group of operators has to be intercepted.
TORCH_LIBRARY_IMPL(aten, IPU, m) {
  // These ops otherwise require direct access to the storage of an
  // `IpuTensorImpl`, so we must implement them ourselves.
  m.impl("alias", PTC_BOXED(poptorch::fallback));

  m.impl("bucketize.Tensor", PTC_BOXED(poptorch::fallback));
  m.impl("bucketize.Tensor_out", PTC_BOXED(poptorch::fallback));
  m.impl("bucketize.Scalar", PTC_BOXED(poptorch::fallback));

  m.impl("copy_", PTC_BOXED(poptorch::copyInplace));
  m.impl("detach", PTC_BOXED(poptorch::detach));
  m.impl("_local_scalar_dense", PTC(poptorch::localScalarDense));
  m.impl("item", PTC(poptorch::item));

  m.impl("empty.memory_format", PTC(poptorch::emptyMemoryFormat));
  m.impl("empty_strided", PTC(poptorch::emptyStrided));

  m.impl("_weight_norm_interface", PTC_BOXED(poptorch::weightNormInterface));

  m.impl("index.Tensor", PTC_BOXED(poptorch::fallback));

  m.impl("convolution", PTC_BOXED(poptorch::fallback));
  m.impl("convolution_backward", PTC_BOXED(poptorch::fallback));

  // These ops must be intercepted so that meta type inference
  // doesn't have to deal with "out" tensors that aren't directly
  // assigned to
  m.impl("median.dim", PTC_BOXED(poptorch::fallback));
  m.impl("min.dim", PTC_BOXED(poptorch::fallback));
  m.impl("max.dim", PTC_BOXED(poptorch::fallback));
  m.impl("topk", PTC_BOXED(poptorch::fallback));
  m.impl("nll_loss_forward", PTC_BOXED(poptorch::fallback));
  m.impl("nll_loss2d_forward", PTC_BOXED(poptorch::fallback));

  m.impl("transpose.int", PTC_BOXED(poptorch::fallback));
  m.impl("expand", PTC_BOXED(poptorch::fallback));
  m.impl("_unsafe_view", PTC_BOXED(poptorch::fallback));
  m.impl("gather", PTC_BOXED(poptorch::fallback));
  m.impl("dropout", PTC_BOXED(poptorch::fallback));
  m.impl("avg_pool2d.out", PTC_BOXED(poptorch::fallback));
  m.impl("avg_pool3d.out", PTC_BOXED(poptorch::fallback));
  m.impl("max_pool1d", PTC_BOXED(poptorch::fallback));
  m.impl("max_pool2d", PTC_BOXED(poptorch::fallback));
  m.impl("max_pool3d", PTC_BOXED(poptorch::fallback));
  m.impl("adaptive_avg_pool1d", PTC_BOXED(poptorch::fallback));
  m.impl("adaptive_avg_pool2d", PTC_BOXED(poptorch::fallback));
  m.impl("adaptive_avg_pool3d", PTC_BOXED(poptorch::fallback));
  m.impl("trunc", PTC_BOXED(poptorch::fallback));
  m.impl("min", PTC_BOXED(poptorch::fallback));
  m.impl("amin", PTC_BOXED(poptorch::fallback));
  m.impl("minimum", PTC_BOXED(poptorch::fallback));
  m.impl("max", PTC_BOXED(poptorch::fallback));
  m.impl("amax", PTC_BOXED(poptorch::fallback));
  m.impl("maximum", PTC_BOXED(poptorch::fallback));
  m.impl("argsort", PTC_BOXED(poptorch::fallback));
  m.impl("one_hot", PTC_BOXED(poptorch::fallback));
  m.impl("all", PTC_BOXED(poptorch::fallback));
  m.impl("any", PTC_BOXED(poptorch::fallback));
  m.impl("feature_dropout", PTC_BOXED(poptorch::fallback));
  m.impl("feature_dropout_", PTC_BOXED(poptorch::fallback));
  m.impl("embedding", PTC_BOXED(poptorch::fallback));

  // Needed due to "CompositeImplicitAutograd"
  m.impl("native_group_norm",
         torch::CppFunction::makeFromBoxedFunction<&poptorch::fallback>());
  m.impl("native_layer_norm",
         torch::CppFunction::makeFromBoxedFunction<&poptorch::fallback>());
  m.impl("lstm.input",
          torch::CppFunction::makeFromBoxedFunction<&poptorch::fallback>());

  // If we don't intercept these ops, they will be decomposed into
  // as_strided which is harder to handle.
  m.impl("slice.Tensor", PTC_BOXED(poptorch::fallback));
  m.impl("squeeze", PTC_BOXED(poptorch::fallback));
  m.impl("squeeze_", PTC_BOXED(poptorch::fallback));
  m.impl("squeeze.dim", PTC_BOXED(poptorch::fallback));
  m.impl("squeeze_.dim", PTC_BOXED(poptorch::fallback));
  m.impl("squeeze.dims", PTC_BOXED(poptorch::fallback));
  m.impl("squeeze_.dims", PTC_BOXED(poptorch::fallback));
  m.impl("unsqueeze", PTC_BOXED(poptorch::fallback));
  m.impl("permute", PTC_BOXED(poptorch::fallback));
  m.impl("select.int", PTC_BOXED(poptorch::fallback));
  m.impl("transpose_", PTC_BOXED(poptorch::fallback));
  m.impl("split_with_sizes", PTC_BOXED(poptorch::fallback));

  // If we don't intercept this op, it will be decomposed into
  // _index_put_impl_, which exposes unnecessary implementation
  // details
  m.impl("index_put_", PTC_BOXED(poptorch::fallback));
  // If we don't intercept this op, it will be converted into a clone followed
  // by an index_put_, which is inefficient in eager mode
  m.impl("index_put", PTC_BOXED(poptorch::fallback));
  // If we don't intercept this op, it will be converted into a clone followed
  // by an baddbmm.out, which is inefficient in eager mode
  m.impl("baddbmm", PTC_BOXED(poptorch::fallback));
  // If we don't intercept this op, it will be converted into a clone followed
  // by an masked_fill_.Scalar, which is inefficient in eager mode
  m.impl("masked_fill.Scalar", PTC_BOXED(poptorch::fallback));

  // If we don't catch these, PyTorch will try to call aten::resize_ on the
  // result which is not supported.
  m.impl("frobenius_norm.out", PTC_BOXED(poptorch::fallback));
  m.impl("frobenius_norm.dim", PTC_BOXED(poptorch::fallback));

  // Use our own repeat op
  m.impl("repeat", PTC_BOXED(poptorch::fallback));

  m.impl("constant_pad_nd", PTC_BOXED(poptorch::fallback));
  m.impl("binary_cross_entropy_with_logits", PTC_BOXED(poptorch::fallback));
  m.impl("binary_cross_entropy_with_logits_backward", PTC_BOXED(poptorch::fallback));

  // If we don't catch it here, PyTorch will decompose bilinear into an enormous
  // number of ops, which will result in an all-zeros output.
  m.impl("bilinear", PTC_BOXED(poptorch::fallback));

  // Loss functions: these are needed for popart, so that we can mark the loss
  // tensor (see `IsLoss`); otherwise, the op will get decomposed by PyTorch.
  m.impl("cosine_embedding_loss", PTC_BOXED(poptorch::fallback));
  m.impl("ctc_loss.IntList", PTC_BOXED(poptorch::fallback));
  m.impl("ctc_loss.Tensor", PTC_BOXED(poptorch::fallback));
  m.impl("hinge_embedding_loss", PTC_BOXED(poptorch::fallback));
  m.impl("kl_div", PTC_BOXED(poptorch::fallback));
  m.impl("l1_loss", PTC_BOXED(poptorch::fallback));
  m.impl("margin_ranking_loss", PTC_BOXED(poptorch::fallback));
  m.impl("poisson_nll_loss", PTC_BOXED(poptorch::fallback));
  m.impl("soft_margin_loss.out", PTC_BOXED(poptorch::fallback));
  m.impl("triplet_margin_loss", PTC_BOXED(poptorch::fallback));
  m.impl("mse_loss", PTC_BOXED(poptorch::fallback));
  m.impl("smooth_l1_loss", PTC_BOXED(poptorch::fallback));

  // Scatter: By default, PyTorch's handler will fail if the index tensor isn't
  // a tensor of int64s (see `scatter_gather_dtype_check` in PyTorch) -- ours
  // will have been coerced to int32s.
  m.impl("scatter.src", PTC_BOXED(poptorch::fallback));
  m.impl("scatter.src_out", PTC_BOXED(poptorch::fallback));
  m.impl("scatter_.src", PTC_BOXED(poptorch::fallback));

  m.impl("scatter.value", PTC_BOXED(poptorch::fallback));
  m.impl("scatter.value_out", PTC_BOXED(poptorch::fallback));
  m.impl("scatter_.value", PTC_BOXED(poptorch::fallback));

  m.impl("scatter.reduce", PTC_BOXED(poptorch::fallback));
  m.impl("scatter.reduce_out", PTC_BOXED(poptorch::fallback));
  m.impl("scatter_.reduce", PTC_BOXED(poptorch::fallback));

  m.impl("scatter.value_reduce", PTC_BOXED(poptorch::fallback));
  m.impl("scatter.value_reduce_out", PTC_BOXED(poptorch::fallback));
  m.impl("scatter_.value_reduce", PTC_BOXED(poptorch::fallback));

  m.impl("scatter_add", PTC_BOXED(poptorch::fallback));
  m.impl("scatter_add.out", PTC_BOXED(poptorch::fallback));
  m.impl("scatter_add_", PTC_BOXED(poptorch::fallback));

  m.impl("scatter_reduce.two", PTC_BOXED(poptorch::fallback));
  m.impl("scatter_reduce.two_out", PTC_BOXED(poptorch::fallback));
  m.impl("scatter_reduce_.two", PTC_BOXED(poptorch::fallback));

  m.impl("select_scatter", PTC_BOXED(poptorch::fallback));
  m.impl("select_scatter.out", PTC_BOXED(poptorch::fallback));

  m.impl("_prelu_kernel", PTC_BOXED(poptorch::fallback));

  m.impl("take_along_dim", PTC_BOXED(poptorch::fallback));
  m.impl("take_along_dim.out", PTC_BOXED(poptorch::fallback));
}

// Route the poptorch operators listed below to poptorch::fallback when they
// are dispatched with the IPU key.
TORCH_LIBRARY_IMPL(poptorch, IPU, m) {
  m.impl("fps", PTC_BOXED(poptorch::fallback));
  m.impl("nearest", PTC_BOXED(poptorch::fallback));
  m.impl("nearest_batch_list", PTC_BOXED(poptorch::fallback));
}

// Route the torch_scatter operators listed below to poptorch::fallback when
// they are dispatched with the IPU key.
TORCH_LIBRARY_IMPL(torch_scatter, IPU, m) {
  m.impl("scatter_max", PTC_BOXED(poptorch::fallback));
  m.impl("scatter_min", PTC_BOXED(poptorch::fallback));
  m.impl("scatter_mul", PTC_BOXED(poptorch::fallback));
}

// Route the torch_cluster "grid" operator to poptorch::fallback when it is
// dispatched with the IPU key.
TORCH_LIBRARY_IMPL(torch_cluster, IPU, m) {
  m.impl("grid", PTC_BOXED(poptorch::fallback));
}

// Route the torch_spline_conv operators listed below to poptorch::fallback
// when they are dispatched with the IPU key.
TORCH_LIBRARY_IMPL(torch_spline_conv, IPU, m) {
  m.impl("spline_basis", PTC_BOXED(poptorch::fallback));
  m.impl("spline_weighting", PTC_BOXED(poptorch::fallback));
}


================================================
FILE: poptorch/source/dispatch_tracer/Tensor.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include "Tensor.hpp"

#include <ATen/ATen.h>
#include <ATen/OpaqueTensorImpl.h>
#include <c10/core/ScalarType.h>

#include <algorithm>
#include <functional>
#include <iterator>
#include <memory>
#include <numeric>
#include <string>
#include <variant>
#include <vector>

#include "CommonHelperFunctions.hpp"
#include "ValueMapper.hpp"

#include "poptorch/DispatchTracer.hpp"

#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"
#include "pytorch_bridge/CompilerTypes.hpp"
#include "pytorch_bridge/IpuSession.hpp"

namespace poptorch {

// File-local aliases, helpers and forward declarations.
namespace {

// Shorthand for shared ownership of a Buffer / an ITensorView.
using BufferPtr = std::shared_ptr<Buffer>;
using TensorViewPtr = std::shared_ptr<ITensorView>;

// Combines several callables into a single object exposing all of their
// operator() overloads; the deduction guide lets it be built directly from a
// list of lambdas (presumably for use with std::visit -- <variant> is
// included above).
template <class... Ts> struct Overloaded : Ts... { using Ts::operator()...; };
template <class... Ts> Overloaded(Ts...) -> Overloaded<Ts...>;

// Forward declaration: defined further down, after IpuTensorImpl.
std::shared_ptr<IpuTensorDetails>
getTensorDetails(const at::TensorImpl &ipu_tensor);

// This is just a useful helper since sometimes we need to pass both keys in.
c10::DispatchKeySet dispatch_key_set{c10::DispatchKey::IPU,
                                     c10::DispatchKey::AutogradIPU};

} // namespace

// Map a PyTorch scalar type onto the matching poptorch_ir::Type.
//
// Double is mapped onto FLOAT and Long onto INT (those get converted);
// any scalar type that is not listed below raises an error.
poptorch_ir::Type toCompilerType(const at::ScalarType &elem_type) {
  switch (elem_type) {
  // Floating point types.
  case at::ScalarType::Half:
    return poptorch_ir::Type::HALF;
  case at::ScalarType::Double: // We will convert this.
  case at::ScalarType::Float:
    return poptorch_ir::Type::FLOAT;

  // Integral types.
  case at::ScalarType::Long: // We will convert this.
  case at::ScalarType::Int:
    return poptorch_ir::Type::INT;
  case at::ScalarType::Short:
    return poptorch_ir::Type::SHORT;
  case at::ScalarType::Char:
    return poptorch_ir::Type::CHAR;
  case at::ScalarType::Byte:
    return poptorch_ir::Type::UNSIGNED_CHAR;

  case at::ScalarType::Bool:
    return poptorch_ir::Type::BOOL;

  default:
    ERROR("Unsupported tensor input type from pytorch: " << elem_type);
  }
}

// Return the poptorch_ir::Type corresponding to the element type of `tensor`.
poptorch_ir::Type toCompilerElementType(const at::Tensor &tensor) {
  const at::ScalarType scalar_type = tensor.dtype().toScalarType();
  return toCompilerType(scalar_type);
}

// Return the data size in bytes of a tensor (i.e num_elems * elem_size)
//
// The number of elements is the product of all the dimensions in
// impl.sizes() (1 for a tensor without dimensions).
uint64_t tensorImplDataSize(const at::TensorImpl &impl) {
  const auto shape = impl.sizes();
  // std::accumulate accumulates in the type of its initial value: seed it
  // with a 64-bit one so the product is not computed (and truncated) in int.
  const std::int64_t nelems =
      std::accumulate(shape.begin(), shape.end(), std::int64_t{1},
                      std::multiplies<std::int64_t>());
  const auto elem_size = impl.itemsize();
  return nelems * elem_size;
}

// This is our own TensorImpl: this is stored in every at::Tensor of type IPU.
//
// This implementation is inspired by VulkanOpaqueTensorImpl / OpaqueTensorImpl:
// they seem to have similar needs to us.
//
// Besides the usual tensor metadata (dtype, device, sizes, strides) it only
// carries a shared pointer to the IpuTensorDetails of the tensor. Storage
// access is configured to throw, and sizes / strides / storage offset cannot
// be modified after construction.
struct IpuTensorImpl : public at::TensorImpl {
  // Shallow copy constructor (Both instances will share the same host buffer if
  // it exists). Shouldn't be called directly: use shallow_copy_and_detach()
  // instead.
  IpuTensorImpl(const IpuTensorImpl &src)
      : IpuTensorImpl(src.dtype(), src.device(),
                      src.sizes_and_strides_.sizes_arrayref(),
                      src.sizes_and_strides_.strides_arrayref(), src.details) {}

  // Drop our reference to the shared tensor details.
  // NOTE(review): at::TensorImpl::release_resources() is not called from
  // here -- confirm that skipping the base implementation is intentional.
  void release_resources() override { details.reset(); }

  // Build an IPU tensor implementation with the given metadata, sharing
  // ownership of `details_`.
  IpuTensorImpl(const caffe2::TypeMeta data_type, c10::Device device,
                c10::IntArrayRef sizes, c10::IntArrayRef strides,
                const std::shared_ptr<IpuTensorDetails> &details_)
      : at::TensorImpl(dispatch_key_set, data_type, device), details(details_) {
    // set_sizes must be called before stride_at because it resizes the
    // array that stores both sizes and strides.
    sizes_and_strides_.set_sizes(sizes);
    for (size_t dim = 0; dim < strides.size(); ++dim) {
      sizes_and_strides_.stride_at(dim) = strides.at(dim);
    }

    set_storage_access_should_throw();
    set_custom_sizes_strides(at::TensorImpl::SizesStridesPolicy::Default);
    is_non_overlapping_and_dense_ = false;
    refresh_numel();
  }

  // Create a new IpuTensorImpl sharing the same details as this one and copy
  // the tensor metadata across, using the provided version counter.
  c10::intrusive_ptr<TensorImpl>
  shallow_copy_and_detach(const c10::VariableVersion &version_counter,
                          bool allow_tensor_metadata_change) const override {
    auto impl = c10::make_intrusive<IpuTensorImpl>(*this);
    copy_tensor_metadata(
        /*src_impl=*/this,
        /*dest_impl=*/impl.get(),
        /*version_counter=*/version_counter,
        /*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
    impl->refresh_numel();

    return impl;
  }

  // Same as above, but takes ownership of the version counter.
  c10::intrusive_ptr<TensorImpl>
  shallow_copy_and_detach(c10::VariableVersion &&version_counter,
                          bool allow_tensor_metadata_change) const override {
    auto impl = c10::make_intrusive<IpuTensorImpl>(*this);
    // A named rvalue reference is an lvalue: it has to be moved explicitly
    // for the rvalue overload of copy_tensor_metadata to be selected instead
    // of the copying one.
    copy_tensor_metadata(
        /*src_impl=*/this,
        /*dest_impl=*/impl.get(),
        /*version_counter=*/std::move(version_counter),
        /*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
    impl->refresh_numel();

    return impl;
  }

  // The shape of an IPU tensor cannot be changed in place: always errors.
  void set_size(int64_t dim, int64_t new_size) override {
    UNUSED(dim);
    UNUSED(new_size);
    AT_ERROR("IPU tensors do not have set_size");
  }

  // The strides of an IPU tensor cannot be changed in place: always errors.
  void set_stride(int64_t dim, int64_t new_stride) override {
    UNUSED(dim);
    UNUSED(new_stride);
    AT_ERROR("IPU tensors do not have set_stride");
  }

  // IPU tensors have no storage offset to set: always errors.
  void set_storage_offset(int64_t storage_offset) override {
    UNUSED(storage_offset);
    AT_ERROR("IPU tensors do not have set_storage_offset");
  }

  // Details shared between all the shallow copies of this tensor.
  std::shared_ptr<IpuTensorDetails> details;

private:
  const char *tensorimpl_type_name() const override { return "IpuTensorImpl"; }
};

namespace {

// Return the IpuTensorImpl backing `tensor`, or nullptr if the tensor's
// implementation is of any other type.
IpuTensorImpl *tryIpuTensorImpl(const at::Tensor &tensor) {
  at::TensorImpl *const raw_impl = tensor.unsafeGetTensorImpl();
  return dynamic_cast<IpuTensorImpl *>(raw_impl);
}

// Return the IpuTensorImpl backing `tensor`; raise an error reporting the
// implementation's address and device type if it is not an IPU tensor.
IpuTensorImpl *toIpuTensorImpl(const at::Tensor &tensor) {
  IpuTensorImpl *const ipu_impl = tryIpuTensorImpl(tensor);
  const bool is_ipu = ipu_impl != nullptr;
  ERROR_ON_MSG(!is_ipu, "Expected an IPU tensor but "
                            << tensor.unsafeGetTensorImpl() << " is "
                            << tensor.unsafeGetTensorImpl()->device_type());
  return ipu_impl;
}

// Downcast a generic tensor implementation to an IpuTensorImpl; raise an
// error reporting its address and device type if the cast fails.
const IpuTensorImpl *toIpuTensorImpl(const at::TensorImpl &tensor) {
  const IpuTensorImpl *const ipu_impl =
      dynamic_cast<const IpuTensorImpl *>(&tensor);
  const bool is_ipu = ipu_impl != nullptr;
  ERROR_ON_MSG(!is_ipu, "Expected an IPU tensor but "
                            << &tensor << " is " << tensor.device_type());
  return ipu_impl;
}

// Return (a shared reference to) the details attached to an IPU tensor
// implementation. Errors if `ipu_tensor` is not an IpuTensorImpl.
std::shared_ptr<IpuTensorDetails>
getTensorDetails(const at::TensorImpl &ipu_tensor) {
  const IpuTensorImpl *const impl = toIpuTensorImpl(ipu_tensor);
  return impl->details;
}

// TODO(T61601) Create a proper implementation of GuardImpl
//
// Minimal device guard for the IPU device type: it only keeps track of the
// "current" device, always hands out the default stream and reports a single
// device.
struct GuardImpl : public c10::impl::DeviceGuardImplInterface {
  at::DeviceType type() const override { return at::DeviceType::IPU; }

  // Make `device` the current device and return the previous one.
  c10::Device exchangeDevice(c10::Device device) const override {
    logging::trace("exchangeDevice: current {} new {}", _current_device,
                   device);
    const c10::Device old = _current_device;
    _current_device = device;
    return old;
  }

  // Called by the dispatcher every time the user passes a device type without
  // an ID to a "to()" method For example: my_tensor.to(torch.device("ipu"))
  c10::Device getDevice() const override { return _current_device; }

  // Make `device` the current device.
  void setDevice(c10::Device device) const override {
    logging::trace("setDevice: current {} new {}", _current_device, device);
    _current_device = device;
  }

  // Same as setDevice(), for the noexcept entry point of the interface.
  void uncheckedSetDevice(c10::Device device) const noexcept override {
    logging::trace("uncheckedSetDevice: current {} new {}", _current_device,
                   device);
    _current_device = device;
  }

  // Used by the autograd.
  // Streams are essentially command queues: if kernels A & B are added to the
  // same stream, A is guaranteed to have completed before B starts.
  // For A & B to be run in parallel they need to be added to different
  // streams.
  c10::Stream getStream(c10::Device device) const noexcept override {
    return c10::Stream(c10::Stream::DEFAULT, device);
  }

  // The requested stream is ignored: the default stream of the current device
  // is returned.
  c10::Stream exchangeStream(c10::Stream s) const noexcept override {
    UNUSED(s);
    return c10::Stream(c10::Stream::DEFAULT, getDevice());
  }

  // Used by torch::autograd::Engine::initialize_device_threads_pool
  c10::DeviceIndex deviceCount() const noexcept override { return 1; }

private:
  // The interface only exposes const methods, yet the current device has to
  // be updated from them: declare it mutable rather than casting constness
  // away at every write.
  mutable c10::Device _current_device{at::DeviceType::IPU, 0};
};

C10_REGISTER_GUARD_IMPL(IPU, GuardImpl)

// Build a poptorch_ir::TensorType from a shape and a PyTorch scalar type
// (converted with toCompilerType()).
poptorch_ir::TensorType getTensorType(const at::ScalarType &scalar_type,
                                      std::vector<std::int64_t> sizes) {
  const poptorch_ir::Type element_type = toCompilerType(scalar_type);
  return {std::move(sizes), element_type};
}
} // namespace

// Build the poptorch_ir::TensorType describing `tensor` from its sizes and
// scalar type.
poptorch_ir::TensorType getTensorType(const at::Tensor &tensor) {
  const at::ScalarType scalar_type = tensor.scalar_type();
  return getTensorType(scalar_type, tensor.sizes().vec());
}

// Return the tensor_id stored in the details of an IPU tensor. Errors if
// `tensor` is not an IPU tensor.
uint64_t ipuTensorId(const at::Tensor &tensor) {
  const at::TensorImpl &impl = *tensor.unsafeGetTensorImpl();
  const auto details = getTensorDetails(impl);
  return details->tensor_id;
}

// Return the tensor_id stored in the details of an IPU tensor
// implementation. Errors if `tensor` is not an IpuTensorImpl.
uint64_t ipuTensorId(const at::TensorImpl &tensor) {
  const IpuTensorImpl *const impl = toIpuTensorImpl(tensor);
  return impl->details->tensor_id;
}

// True if `tensor` is backed by an IpuTensorImpl.
bool isIpuTensor(const at::Tensor &tensor) {
  const IpuTensorImpl *const impl = tryIpuTensorImpl(tensor);
  return impl != nullptr;
}

// Build a short human readable description of `tensor`: the address of its
// implementation followed, for defined tensors, by the device type, the IPU
// tensor ID (IPU tensors only), the sizes and the dtype.
std::string str(const at::Tensor &tensor) {
  at::TensorImpl *const impl = tensor.unsafeGetTensorImpl();

  std::stringstream out;
  out << "impl_ " << reinterpret_cast<void *>(impl);

  if (!tensor.defined()) {
    out << " type: <undefined>";
    return out.str();
  }

  const auto device_type = impl->device_type();
  out << " type " << device_type;
  if (device_type == at::DeviceType::IPU) {
    out << " ID " << toIpuTensorImpl(tensor)->details->tensor_id;
  }
  out << " sizes " << impl->sizes();
  out << " dtype " << impl->dtype();
  return out.str();
}

// Return the data size in bytes of `tensor` (see tensorImplDataSize()).
uint64_t tensorDataSize(const at::Tensor &tensor) {
  const at::TensorImpl &impl = *tensor.unsafeGetTensorImpl();
  return tensorImplDataSize(impl);
}

// Return the buffer attached to the details of an IPU tensor. Errors if
// `ipu_tensor` is not an IPU tensor.
Buffer &getHostBuffer(const at::Tensor &ipu_tensor) {
  IpuTensorImpl *const impl = toIpuTensorImpl(ipu_tensor);
  return getHostBuffer(*impl);
}

// Return the buffer attached to the details of an IPU tensor implementation
// (as given by IpuTensorDetails::getBuffer()). Errors if `ipu_tensor` is not
// an IpuTensorImpl.
Buffer &getHostBuffer(const at::TensorImpl &ipu_tensor) {
  const std::shared_ptr<IpuTensorDetails> tensor_details =
      toIpuTensorImpl(ipu_tensor)->details;
  Buffer &buffer = tensor_details->getBuffer();
  return buffer;
}

// Return whether the details of the IPU tensor report having data (see
// IpuTensorDetails::hasData()). Errors if `ipu_tensor` is not an IPU tensor.
bool hasData(const at::Tensor &ipu_tensor) {
  // One checked cast is enough: the result is already an IpuTensorImpl, so
  // there is no need to cast it a second time.
  const auto &details = *toIpuTensorImpl(ipu_tensor)->details;
  return details.hasData();
}

// Raise an error, quoting the full shape, if any dimension of `tensor` is 0.
void errorOnZeroSizedTensor(const at::Tensor &tensor) {
  const auto shape = tensor.sizes();
  const bool has_zero_dim =
      std::find(shape.begin(), shape.end(), 0) != shape.end();
  if (!has_zero_dim) {
    return;
  }

  std::stringstream msg;
  msg << "Zero-sized tensors are unsupported (Got shape [";
  // Emit the separator before every element except the first one.
  const char *separator = "";
  for (const auto dim : shape) {
    msg << separator << dim;
    separator = ", ";
  }
  msg << "]).";
  ERROR(msg.str());
}

// Construct the store with a session obtained from
// poptorch_ir::createStaticSession().
TensorStore::TensorStore() : _ipu_session(poptorch_ir::createStaticSession()) {}

// Validate the requested shape and create the details object (fresh tensor ID,
// compiler tensor type, optional view information) for a new IPU tensor.
std::shared_ptr<IpuTensorDetails>
TensorStore::allocateTensorDetails(c10::IntArrayRef size,
                                   at::ScalarType coerced_scalar_type,
                                   std::shared_ptr<ITensorView> view_info) {
  // Reject shapes containing a negative dimension.
  const size_t rank = size.size();
  size_t dim = 0;
  while (dim < rank) {
    ERROR_ON_MSG(size.at(dim) < 0, "Invalid tensor shape: dimension "
                                       << dim << " is negative ("
                                       << size.at(dim) << ")");
    ++dim;
  }

  // Every call consumes one tensor ID.
  return std::make_shared<IpuTensorDetails>(
      _next_tensor_id++, getTensorType(coerced_scalar_type, size.vec()),
      std::move(view_info));
}

// Create a new IPU tensor of the given shape.
//
// `dtype` is resolved through scalarTypeOrDefault() and then coerced with
// coerceToSupportedType(); a warning is logged when the coercion changes the
// type. `view_info` is forwarded to the tensor details, and `device` is
// resolved through deviceOrDefaultIpu().
//
// Shape validation (no negative dimension) is performed by
// allocateTensorDetails(), so it is not repeated here.
at::Tensor TensorStore::allocateTensor(c10::IntArrayRef size,
                                       c10::optional<at::ScalarType> dtype,
                                       std::shared_ptr<ITensorView> view_info,
                                       c10::optional<at::Device> device) {
  const at::ScalarType scalar_type = scalarTypeOrDefault(dtype);
  auto coerced_scalar_type = coerceToSupportedType(scalar_type);
  auto details =
      allocateTensorDetails(size, coerced_scalar_type, std::move(view_info));
  auto strides = at::detail::defaultStrides(size);

  at::Tensor output = at::detail::make_tensor<IpuTensorImpl>(
      c10::scalarTypeToTypeMeta(coerced_scalar_type),
      deviceOrDefaultIpu(device), size, strides, std::move(details));

  ERROR_ON(output.device().type() != c10::DeviceType::IPU);

  logging::trace(
      "Created IPU tensor: id {} impl_ {} size {} strides {} dtype {}",
      ipuTensorId(output),
      reinterpret_cast<void *>(output.unsafeGetTensorImpl()), size, strides,
      coerced_scalar_type);

  if (scalar_type != coerced_scalar_type) {
    logging::warn("[DISPATCHER] Type coerced from {} to {} for tensor id {}",
                  scalar_type, coerced_scalar_type, ipuTensorId(output));
  }

  return output;
}

// Ask the session for a buffer matching the tensor type and store it in the
// details. Returns a reference to the stored buffer.
Buffer &TensorStore::allocateBuffer(IpuTensorDetails &details) {
  // Allocate first, then look up the destination slot.
  auto fresh = _ipu_session->allocate(details.type);
  Buffer &slot = details.getBuffer();
  slot = std::move(fresh);
  return slot;
}

// Allocate a buffer for the details attached to the given IPU tensor.
void TensorStore::allocateBuffer(const at::Tensor &ipu_tensor) {
  const auto details = getTensorDetails(ipu_tensor);
  allocateBuffer(*details);
}

// Copy the contents of `ipu_src` into a freshly allocated buffer for
// `ipu_dest`, and propagate the requires_grad flag of the source.
//
// Raises an error if the two tensors do not share the same dtype.
void TensorStore::copyOnIpu(const at::Tensor &ipu_dest,
                            const at::Tensor &ipu_src) {
  ERROR_ON_MSG(ipu_dest.dtype() != ipu_src.dtype(),
               "Copy operations cannot cast outside of the dispatcher.");
  const auto &src_details = getTensorDetails(ipu_src);

  const auto &dest_details = getTensorDetails(ipu_dest);
  // Bind by reference (as copyFromCpu does) so the copy targets the buffer
  // stored in the destination details rather than a local copy of it.
  auto &dest_buf = allocateBuffer(*dest_details);
  _ipu_session->copyDataOnDevice(dest_buf, src_details->getBuffer());

  ipu_dest.set_requires_grad(ipu_src.requires_grad());
}

// Copy the data of a CPU tensor into a freshly allocated buffer for
// `ipu_dest`, and propagate the requires_grad flag of the source.
//
// Raises an error if dtypes or sizes differ between the two tensors.
void TensorStore::copyFromCpu(const at::Tensor &ipu_dest,
                              const at::Tensor &cpu_src) {
  void *src_data = cpu_src.data_ptr();
  logging::trace("[DISPATCHER] Copying from CPU tensor {} with data_ptr {}",
                 static_cast<void *>(cpu_src.unsafeGetTensorImpl()), src_data);

  ERROR_ON(cpu_src.dtype() != ipu_dest.dtype());
  ERROR_ON(cpu_src.sizes() != ipu_dest.sizes());

  const auto dest_details = getTensorDetails(ipu_dest);
  auto &dest_buffer = allocateBuffer(*dest_details);
  _ipu_session->copyDataFromCpuSource(dest_buffer,
                                      static_cast<const char *>(src_data));

  ipu_dest.set_requires_grad(cpu_src.requires_grad());
}

// Copy the buffer of an IPU tensor into the storage of a CPU tensor.
//
// Raises an error if dtypes or sizes differ between the two tensors.
void TensorStore::copyToCpu(const at::Tensor &cpu_dest,
                            const at::Tensor &ipu_src) {
  void *dest_data = cpu_dest.data_ptr();
  logging::trace("[DISPATCHER] Copying to CPU tensor {} with data_ptr {}",
                 static_cast<void *>(cpu_dest.unsafeGetTensorImpl()),
                 dest_data);

  ERROR_ON(ipu_src.dtype() != cpu_dest.dtype());
  ERROR_ON(ipu_src.sizes() != cpu_dest.sizes());

  const auto src_details = getTensorDetails(ipu_src);
  _ipu_session->copyDataToCpu(static_cast<char *>(dest_data),
                              src_details->getBuffer());
}

// Return a reference to the session pointer held by this store. The pointer is
// null after reset() has been called.
const std::shared_ptr<poptorch_ir::IIpuSession> &
TensorStore::getIpuSession() const {
  return _ipu_session;
}

// Drop this store's reference to the session.
void TensorStore::reset() { _ipu_session.reset(); }

// Return the details attached to the given IPU tensor by delegating to the
// TensorImpl overload.
std::shared_ptr<IpuTensorDetails>
getTensorDetails(const at::Tensor &ipu_tensor) {
  auto &impl = *ipu_tensor.unsafeGetTensorImpl();
  return getTensorDetails(impl);
}

// Return the details of every tensor in `ipu_tensors`, in the same order.
std::vector<std::shared_ptr<IpuTensorDetails>>
getTensorDetails(const std::vector<at::Tensor> &ipu_tensors) {
  std::vector<std::shared_ptr<IpuTensorDetails>> all_details;
  all_details.reserve(ipu_tensors.size());
  for (const at::Tensor &ipu_tensor : ipu_tensors) {
    all_details.push_back(getTensorDetails(ipu_tensor));
  }
  return all_details;
}

// Replace the details attached to an IPU tensor and resize the tensor
// implementation to the shape recorded in the new details.
//
// Raises an error if the tensor implementation is not an IpuTensorImpl.
void setTensorDetails(const at::Tensor &ipu_tensor,
                      std::shared_ptr<IpuTensorDetails> details) {
  auto *raw_impl = ipu_tensor.unsafeGetTensorImpl();
  auto *impl = dynamic_cast<IpuTensorImpl *>(raw_impl);
  ERROR_ON(impl == nullptr);

  // Read the shape before ownership of `details` moves into the impl.
  const auto &new_shape = details->type.shape;
  impl->set_sizes_contiguous(new_shape);
  impl->details = std::move(details);
}

namespace {

// Select the storage for new tensor details: the supplied view when there is
// one, otherwise a newly created Buffer.
IpuTensorDetails::Data getBufferOrView(std::shared_ptr<ITensorView> view_info) {
  if (!view_info) {
    return std::make_shared<Buffer>();
  }
  return view_info;
}

} // namespace

// Construct the details of a tensor. `data` holds `view_info` when it is
// non-null, otherwise a newly created Buffer (see getBufferOrView()).
IpuTensorDetails::IpuTensorDetails(IpuTensorId tensor_id_,
                                   poptorch_ir::TensorType type_,
                                   std::shared_ptr<ITensorView> view_info)
    : tensor_id(tensor_id_), type(std::move(type_)),
      data(getBufferOrView(std::move(view_info))) {}

// Return the buffer owned by these details. Raises an error when the details
// describe a view, because a view has no buffer of its own.
Buffer &IpuTensorDetails::getBuffer() {
  if (auto *owned = std::get_if<std::shared_ptr<Buffer>>(&data)) {
    return **owned;
  }
  ERROR("Cannot get the buffer of a view tensor.");
}
std::shared_ptr<Buffer> IpuTensorDetails::getOwningBuffer() const {
  return std::visit(
      Overloaded{[](const BufferPtr &buffer) -> BufferPtr { return buffer; },
                 [](const TensorViewPtr &view) -> BufferPtr {
                   UNUSED(view);
                   return nullptr;
                 }},
      data);
}

bool IpuTensorDetails::hasData() const {
  return std::visit(
      Overloaded{[](const BufferPtr &buffer) { return buffer->hasData(); },
                 [](const TensorViewPtr &view) { return view != nullptr; }},
      data);
}

bool IpuTensorDetails::isView() const {
  return std::holds_alternative<TensorViewPtr>(data);
}

} // namespace poptorch


================================================
FILE: poptorch/source/dispatch_tracer/Tensor.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#ifndef POPTORCH_DISPATCH_TENSOR_HPP_
#define POPTORCH_DISPATCH_TENSOR_HPP_

#include <torch/csrc/jit/ir/ir.h>

#include <algorithm>
#include <iterator>
#include <memory>
#include <string>
#include <variant>
#include <vector>

#include "pytorch_bridge/CompilerTypes.hpp"
#include "pytorch_bridge/DebugInfo.hpp"
#include "pytorch_bridge/IpuSession.hpp"

namespace poptorch_ir {
class IIpuSession;
}

namespace poptorch {

using Buffer = poptorch_ir::Buffer;
using IpuTensorId = uint64_t;

class IDispatch;
struct IpuTensorImpl;
class ValueMapper;

// Interface describing a tensor that is a view of another tensor. Instances
// are held polymorphically through std::shared_ptr<ITensorView>.
class ITensorView {
public:
  // Virtual destructor so implementations are destroyed correctly when deleted
  // through a pointer to this interface.
  virtual ~ITensorView() = default;

  // Add the view to the graph being built by `dispatcher` and return the
  // resulting tensor ID.
  virtual poptorch_ir::TensorId addViewToGraph(IDispatch &dispatcher) = 0;
};

// The IPU tensor details track the data and meta information associated with
// the IpuTensorImpl. This information cannot be directly stored in the IPU
// tensor impl because the lifetime of that is too short when views are
// involved. We need the lifetime of the data to outlive any views of the
// data.
struct IpuTensorDetails {
  // `view_info` may be null, in which case the details own a new Buffer.
  IpuTensorDetails(IpuTensorId tensor_id_, poptorch_ir::TensorType type_,
                   std::shared_ptr<ITensorView> view_info);

  // The tensor details either owns its own storage or is a view of other tensor
  // details.
  //
  // For inputs that are temporaries we need the buffer to live until the
  // function is run and we don't want to extend the lifetime of the
  // IpuTensorDetails unnecessarily. This means we need to share ownership of
  // the buffer.
  using Data =
      std::variant<std::shared_ptr<Buffer>, std::shared_ptr<ITensorView>>;

  // Identifier of the tensor; fixed at construction.
  const IpuTensorId tensor_id;
  // Compiler-side type of the tensor; fixed at construction.
  const poptorch_ir::TensorType type;

  // Either the owned buffer or the view information.
  Data data;

  poptorch_ir::TensorDebugInfo debug_info;

  // Return the owned buffer; raises an error if these details are a view.
  Buffer &getBuffer();
  // Return a shared owner of the buffer, or nullptr for a view.
  std::shared_ptr<Buffer> getOwningBuffer() const;

  // For a buffer: whether the buffer reports having data. For a view: whether
  // the view pointer is non-null.
  bool hasData() const;
  // True when `data` holds view information rather than a buffer.
  bool isView() const;
};

// Conversions from PyTorch types to compiler types.
// NOTE(review): toCompilerType / toCompilerElementType are defined outside
// this excerpt; presumably they map the scalar (element) type to its compiler
// equivalent - confirm against the implementation.
poptorch_ir::Type toCompilerType(const at::ScalarType &elem_type);
poptorch_ir::Type toCompilerElementType(const at::Tensor &tensor);
// Return the compiler tensor type built from the tensor's scalar type and
// sizes.
poptorch_ir::TensorType getTensorType(const at::Tensor &tensor);

// NOTE(review): presumably the data size in bytes of the given tensor
// implementation (tensorDataSize() forwards to it) - confirm.
uint64_t tensorImplDataSize(const at::TensorImpl &impl);

// Return the data size in bytes of the given at::Tensor.
uint64_t tensorDataSize(const at::Tensor &tensor);

// Return the tensor ID of the given IPU tensor.
IpuTensorId ipuTensorId(const at::Tensor &tensor);

// Return the tensor ID of the given IPU tensor implementation.
IpuTensorId ipuTensorId(const at::TensorImpl &tensor);

// Return true if the given at::Tensor is an IPU tensor.
bool isIpuTensor(const at::Tensor &tensor);

// Return a string containing the given tensor's metadata (device, shape, etc).
std::string str(const at::Tensor &tensor);

// Returns a reference to the CPU buffer of the given IPU tensor.
Buffer &getHostBuffer(const at::Tensor &ipu_tensor);

// Returns a reference to the CPU buffer of the given IPU tensor implementation.
Buffer &getHostBuffer(const at::TensorImpl &ipu_tensor);

// Return whether the details of the given IPU tensor report having data.
bool hasData(const at::Tensor &ipu_tensor);

// Return the details attached to the given IPU tensor.
std::shared_ptr<IpuTensorDetails>
getTensorDetails(const at::Tensor &ipu_tensor);

// Return the details attached to each of the given IPU tensors, in order.
std::vector<std::shared_ptr<IpuTensorDetails>>
getTensorDetails(const std::vector<at::Tensor> &ipu_tensors);

// Attach `details` to the given IPU tensor and resize the tensor to the shape
// recorded in the details.
void setTensorDetails(const at::Tensor &ipu_tensor,
                      std::shared_ptr<IpuTensorDetails> details);

// Raise an error if any dimension of the tensor is zero.
void errorOnZeroSizedTensor(const at::Tensor &tensor);

/** Host-side storage for `ipu` tensors.
 *
 *  This allows the user to convert tensors and modules to `ipu` using
 *  `t.to("ipu")` even when the dispatcher is off, and even outside eager mode.
 *
 *  We simply copy the tensor in to our ownership, then when we go to load and
 *  execute an executable, we can upload these tensors to the device. We'll
 *  also retrieve them from the device when the user copies a tensor back to the
 *  CPU (`t.to("cpu")`).
 */
class TensorStore {
public:
  TensorStore();
  // The store is neither copyable nor movable.
  TensorStore(const TensorStore &) = delete;
  TensorStore(TensorStore &&) = delete;
  TensorStore &operator=(const TensorStore &) = delete;
  TensorStore &operator=(TensorStore &&) = delete;

  // Validate `size` (no negative dimension) and create the details for a new
  // tensor, consuming one tensor ID.
  std::shared_ptr<IpuTensorDetails>
  allocateTensorDetails(c10::IntArrayRef size,
                        at::ScalarType coerced_scalar_type,
                        std::shared_ptr<ITensorView> view_info);
  // Create a new IPU tensor.
  at::Tensor allocateTensor(c10::IntArrayRef sizes,
                            c10::optional<at::ScalarType> dtype = c10::nullopt,
                            std::shared_ptr<ITensorView> view_info = nullptr,
                            c10::optional<at::Device> device = c10::nullopt);

  // Allocate a buffer for the given IPU tensor.
  void allocateBuffer(const at::Tensor &ipu_tensor);

  // Copy helpers. Each allocates a fresh buffer for an IPU destination;
  // copyFromCpu and copyToCpu require matching dtypes and sizes, copyOnIpu
  // requires matching dtypes.
  void copyOnIpu(const at::Tensor &ipu_dest, const at::Tensor &ipu_src);
  void copyFromCpu(const at::Tensor &ipu_dest, const at::Tensor &cpu_src);
  void copyToCpu(const at::Tensor &cpu_dest, const at::Tensor &ipu_src);

  // Return the session used for allocations and copies (null after reset()).
  const std::shared_ptr<poptorch_ir::IIpuSession> &getIpuSession() const;

  // Drop this store's reference to the session.
  void reset();

private:
  // Allocate a buffer matching `details.type` and store it in `details`.
  Buffer &allocateBuffer(IpuTensorDetails &details);

  // ID handed to the next tensor created through allocateTensorDetails().
  poptorch_ir::TensorId _next_tensor_id{1};
  std::shared_ptr<poptorch_ir::IIpuSession> _ipu_session =
      poptorch_ir::createStaticSession();
};

} // namespace poptorch

#endif // POPTORCH_DISPATCH_TENSOR_HPP_


================================================
FILE: poptorch/source/dispatch_tracer/TypeInferenceHandler.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include <ATen/MetaFunctions.h>
#include <algorithm>

#include "poptorch_logging/Logging.hpp"

#include "CommonHelperFunctions.hpp"
#include "TypeInferenceHandler.hpp"
#include <c10/util/intrusive_ptr.h>

namespace poptorch {

// Dispatch keys passed to redispatchBoxed() so that type inference runs the
// operator with the Meta / AutogradMeta keys.
constexpr c10::DispatchKeySet meta_keys{c10::DispatchKey::Meta,
                                        c10::DispatchKey::AutogradMeta};

namespace {

// Return a stack holding copies of the tensor and tensor-list entries of
// `meta_stack`, in their original order.
c10::Stack copyTensorsFrom(const c10::Stack &meta_stack) {
  c10::Stack kept;
  kept.reserve(meta_stack.size());
  for (const auto &entry : meta_stack) {
    if (entry.isTensor() || entry.isTensorList()) {
      kept.push_back(entry);
    }
  }
  return kept;
}

} // namespace

// Infer the output types of `op` by running it on the Meta backend.
//
// The inputs on `ipu_stack` are converted to meta equivalents, the operator is
// redispatched with the meta keys, and `ipu_stack` is then cleared and
// refilled with newly allocated IPU tensors matching the meta outputs.
//
// Raises an error if the operator has no kernel for the Meta dispatch key.
void TypeInferenceHandler::inferOutputTypes(const c10::OperatorHandle &op,
                                            c10::Stack *ipu_stack) {
  const auto schema_key = getSchemaKey(op.schema());
  ERROR_ON_MSG(!op.hasComputedKernelForDispatchKey(c10::DispatchKey::Meta),
               "Type inference failed for "
                   << schema_key
                   << " because the operator "
                      "doesn't have an implementation for the Meta backend.");

  // aten::prelu with 1D inputs is broken with the Meta backend:
  // https://github.com/pytorch/pytorch/issues/89560
  // As a workaround, a dummy channel dim is added to the input and removed
  // again from the output.
  const bool is_prelu =
      (schema_key == "aten::prelu") || (schema_key == "aten::_prelu_kernel");

  // Operand stack where IPU tensors have been replaced by meta tensors.
  c10::Stack meta_stack = createMetaStack(*ipu_stack, schema_key, is_prelu);

  // redispatchBoxed drops all function inputs from the stack, and the meta
  // stack is the only owner of the inputs created by createMetaStack. If the
  // function returns a reference to one of its inputs, that input must
  // outlive the call, so the lifetime of the inputs is extended by holding
  // copies here.
  const c10::Stack input_tensor_lifetime_extender =
      copyTensorsFrom(meta_stack);

  logging::trace("[DISPATCHER] Using meta type inference for {}", schema_key);

  op.redispatchBoxed(meta_keys, &meta_stack);

  ipu_stack->clear();
  repopulateIpuStack(*ipu_stack, meta_stack, is_prelu);
}

// Return the workaround registered for `schema_key`, or std::nullopt when
// there is none.
std::optional<TypeInferenceHandler::Workaround>
TypeInferenceHandler::workaroundLookup(const std::string &schema_key) {
  const auto found = schema_to_workaround.find(schema_key);
  if (found == schema_to_workaround.cend()) {
    return std::nullopt;
  }
  return found->second;
}

// Return the transformed value when the workaround's predicate accepts the
// (index, value, stack) triple; otherwise return `value` unchanged.
c10::IValue TypeInferenceHandler::applyWorkaround(
    const TypeInferenceHandler::Workaround &workaround, std::size_t value_index,
    const c10::IValue &value, const c10::Stack &stack) {
  const bool applies = workaround.predicate_fn(value_index, value, stack);
  if (!applies) {
    return value;
  }
  return workaround.transform_fn(value, stack);
}

namespace {

// Convert every element of a list of (optional) tensors with
// TypeInferenceHandler::toMeta(). Empty optionals are kept as c10::nullopt.
template <typename T>
c10::List<T> createMetaTensorList(const c10::List<T> &ipu_tensor_list,
                                  bool should_upcast_to_long) {
  const auto to_meta = [should_upcast_to_long](const T &element) -> T {
    if constexpr (std::is_same_v<c10::optional<at::Tensor>, T>) {
      if (!element.has_value()) {
        return c10::nullopt;
      }
      return TypeInferenceHandler::toMeta(element.value(),
                                          should_upcast_to_long);
    } else {
      return TypeInferenceHandler::toMeta(element, should_upcast_to_long);
    }
  };

  c10::List<T> meta_tensor_list;
  std::transform(ipu_tensor_list.begin(), ipu_tensor_list.end(),
                 std::back_inserter(meta_tensor_list), to_meta);
  return meta_tensor_list;
}

// Map an IPU device to the meta device; any other device is returned as is.
c10::Device createMetaDevice(const c10::Device &device) {
  if (device.is_ipu()) {
    return c10::Device{at::kMeta};
  }
  return device;
}

// True when `input_idx` is the argument index that indexArgToUpcast() reports
// for `schema_key`.
bool isUpcastRequired(const std::string &schema_key,
                      const std::size_t input_idx) {
  const auto upcast_idx = TypeInferenceHandler::indexArgToUpcast(schema_key);
  return upcast_idx.has_value() && upcast_idx.value() == input_idx;
}

} // namespace

// Build the operand stack used for meta type inference.
//
// Each entry of `ipu_stack` is optionally rewritten by the workaround
// registered for `schema_key`, then IPU tensors, tensor lists, optional tensor
// lists and devices are converted to their meta equivalents. Any other value
// is copied unchanged. When `is_prelu` is set, the first input gets the prelu
// dummy-dimension treatment (see toMeta()).
c10::Stack TypeInferenceHandler::createMetaStack(const c10::Stack &ipu_stack,
                                                 const std::string &schema_key,
                                                 bool is_prelu) {
  c10::Stack meta_stack;
  meta_stack.reserve(ipu_stack.size());
  const auto maybe_workaround = workaroundLookup(schema_key);

  // Convert the value at position `input_idx`. The position is passed in
  // explicitly rather than tracked in mutable lambda state, because
  // std::transform does not guarantee in-order application of its functor.
  const auto to_meta_value = [&](std::size_t input_idx,
                                 const c10::IValue &value) -> c10::IValue {
    // For various reasons, sometimes we have to transform the value before
    // pushing it on the meta stack to workaround validation issues which
    // are not a problem for the PopArt backend.
    const c10::IValue v = maybe_workaround
                              ? applyWorkaround(maybe_workaround.value(),
                                                input_idx, value, ipu_stack)
                              : value;

    // We coerce index tensor types from Long to Int during dispatch, but
    // these need to be converted back to Long before running with the Meta
    // backend otherwise they'll emit type errors
    const bool should_upcast_to_long = isUpcastRequired(schema_key, input_idx);
    const bool is_first_input = input_idx == 0;

    // Convert any IPU tensors to meta tensors
    if (v.isTensor()) {
      return toMeta(v.toTensor(), should_upcast_to_long,
                    is_prelu && is_first_input);
    }
    if (v.isTensorList()) {
      return createMetaTensorList(v.toTensorList(), should_upcast_to_long);
    }
    if (v.isOptionalTensorList()) {
      return createMetaTensorList(v.toOptionalTensorList(),
                                  should_upcast_to_long);
    }
    if (v.isDevice()) {
      return createMetaDevice(v.toDevice());
    }

    return v;
  };

  for (std::size_t input_idx = 0; input_idx < ipu_stack.size(); ++input_idx) {
    meta_stack.push_back(to_meta_value(input_idx, ipu_stack[input_idx]));
  }

  return meta_stack;
}

// Allocate an IPU tensor with the shape and dtype of a meta output tensor.
//
// For prelu, the meta input may have been given a trailing dummy dimension of
// size 1 (see getMetaTensorSize()); a 2D output whose second dimension is 1
// has that trailing dimension removed again here.
at::Tensor TypeInferenceHandler::allocateTensor(const at::Tensor &meta_tensor,
                                                bool is_prelu) {
  auto sizes = meta_tensor.sizes();
  if (is_prelu && sizes.size() == 2 && sizes[1] == 1) {
    // Keep the leading dimension and drop the trailing dummy one.
    // slice(1) would do the opposite: chop off the first element and keep
    // only the dummy dimension.
    sizes = sizes.slice(0, 1);
  }

  return _tensor_store->allocateTensor(sizes, meta_tensor.scalar_type());
}

// Allocate one IPU tensor per meta tensor in the list, copying each tensor's
// sizes and scalar type.
c10::List<at::Tensor> TypeInferenceHandler::allocateTensorList(
    const c10::List<at::Tensor> &meta_tensor_list) {
  const auto allocate_like = [this](const at::Tensor &meta_tensor) {
    return _tensor_store->allocateTensor(meta_tensor.sizes(),
                                         meta_tensor.scalar_type());
  };

  c10::List<at::Tensor> ipu_tensors;
  std::transform(meta_tensor_list.begin(), meta_tensor_list.end(),
                 std::back_inserter(ipu_tensors), allocate_like);
  return ipu_tensors;
}

// Fill the (empty) IPU stack from the meta output stack: tensors and tensor
// lists are replaced by newly allocated IPU tensors of the same type, any
// other value is copied as is.
//
// Raises an error if `ipu_stack` is not empty on entry.
void TypeInferenceHandler::repopulateIpuStack(c10::Stack &ipu_stack,
                                              const c10::Stack &meta_stack,
                                              bool is_prelu) {
  ERROR_ON(!ipu_stack.empty());
  ipu_stack.reserve(meta_stack.size());

  for (const c10::IValue &meta_value : meta_stack) {
    if (meta_value.isTensor()) {
      ipu_stack.emplace_back(allocateTensor(meta_value.toTensor(), is_prelu));
    } else if (meta_value.isTensorList()) {
      ipu_stack.emplace_back(allocateTensorList(meta_value.toTensorList()));
    } else {
      ipu_stack.push_back(meta_value);
    }
  }
}

namespace {

// Return the sizes to use for the meta tensor. For prelu, a 1D shape gets a
// trailing dimension of size 1 appended.
std::vector<int64_t> getMetaTensorSize(const at::Tensor &tensor,
                                       bool is_prelu) {
  auto meta_sizes = tensor.sizes().vec();
  const bool needs_dummy_dim = is_prelu && meta_sizes.size() == 1;
  if (needs_dummy_dim) {
    meta_sizes.emplace_back(1);
  }
  return meta_sizes;
}

// Return the dtype to use for the meta tensor: Int becomes Long when an upcast
// is requested, every other case keeps the tensor's own scalar type.
c10::ScalarType getMetaTensorDtype(const at::Tensor &tensor,
                                   bool should_upcast_to_long) {
  const auto original_dtype = tensor.scalar_type();
  const bool upcast =
      should_upcast_to_long && original_dtype == c10::ScalarType::Int;
  return upcast ? c10::ScalarType::Long : original_dtype;
}

// Create an empty meta tensor mirroring `tensor`: same sizes (plus the prelu
// dummy dimension when applicable), same dtype (optionally upcast from Int to
// Long), and the same wrapped-number flag.
at::Tensor createEmptyMetaTensor(const at::Tensor &tensor,
                                 bool should_upcast_to_long, bool is_prelu) {
  const auto dtype = getMetaTensorDtype(tensor, should_upcast_to_long);
  // Use the exact element type returned by getMetaTensorSize(): `long` is not
  // the same type as int64_t on every platform.
  const std::vector<int64_t> sizes = getMetaTensorSize(tensor, is_prelu);

  auto out = at::meta::empty(sizes, dtype);

  // Propagate the wrapped-number flag from the source tensor.
  if (tensor.unsafeGetTensorImpl()->is_wrapped_number()) {
    out.unsafeGetTensorImpl()->set_wrapped_number(true);
  }

  return out;
}

} // namespace

// Convert a tensor to its meta equivalent.
//
// Undefined tensors and non-IPU wrapped numbers are returned unchanged. IPU
// tensors are replaced by an empty meta tensor of matching type. Any other
// tensor raises an error asking for an explicit move to the IPU.
at::Tensor TypeInferenceHandler::toMeta(const at::Tensor &tensor,
                                        bool should_upcast_to_long,
                                        bool is_prelu) {
  if (!tensor.defined()) {
    return tensor;
  }
  if (isIpuTensor(tensor)) {
    return createEmptyMetaTensor(tensor, should_upcast_to_long, is_prelu);
  }
  if (tensor.unsafeGetTensorImpl()->is_wrapped_number()) {
    return tensor;
  }
  ERROR("Expected an IPU tensor but got tensor(device="
        << tensor.device() << ", shape=" << tensor.sizes()
        << ", dtype=" << tensor.scalar_type()
        << ").\nConstant tensors should be moved explicitly "
           "to the IPU, via cpu_tensor.to(\"ipu\").");
}

// Return the position of the argument that must be upcast to Long for the
// given operator schema, or c10::nullopt when the schema has none.
c10::optional<std::size_t>
TypeInferenceHandler::indexArgToUpcast(const std::string &schema_key) {
  static const std::unordered_map<std::string, std::size_t> upcast_index = {
      {"aten::argmax.out", 3},
      {"aten::argmin.out", 3},
      {"aten::gather", 2},
      {"aten::scatter.src", 2},
      {"aten::scatter_.src", 2},
      {"aten::scatter.value", 2},
      {"aten::scatter.value_reduce", 2},
      {"aten::scatter_.value", 2},
      {"aten::scatter_.value_reduce", 2},
      {"aten::scatter_add", 2},
      {"aten::scatter_add_", 2},
      {"aten::scatter_reduce.two", 2},
      {"aten::scatter_reduce_.two", 2},
      {"torch_scatter::scatter_max", 2},
      {"torch_scatter::scatter_min", 2},
      {"torch_scatter::scatter_mul", 2},
      {"torch_spline_conv::spline_basis", 2},
      {"aten::index.Tensor", 1},
      {"aten::nll_loss_forward", 1},
      {"aten::take_along_dim", 1},
      {"aten::take_along_dim.out", 1},
      {"aten::sort.values_stable", 5}};

  const auto found = upcast_index.find(schema_key);
  if (found == upcast_index.end()) {
    return c10::nullopt;
  }
  return found->second;
}

// True when the value being visited is the dtype argument, that argument is
// None, and the stack entry at `out_index` is not None.
static bool reductionWorkaroundPredicate(const std::size_t value_index,
                                         const c10::IValue &value,
                                         const c10::Stack &ipu_stack,
                                         const std::size_t dtype_index,
                                         const std::size_t out_index) {
  if (value_index != dtype_index) {
    return false;
  }
  if (!value.isNone()) {
    return false;
  }
  // The stack is only inspected once the two cheaper checks have passed.
  return !ipu_stack.at(out_index).isNone();
}

// Return the scalar type of the tensor found at `out_index` on the stack. When
// that entry is not a tensor, `transformed_value` is returned unchanged.
static c10::IValue reductionTransform(const c10::IValue &transformed_value,
                                      const c10::Stack &ipu_stack,
                                      const std::size_t out_index) {
  const auto &out_value = ipu_stack.at(out_index);
  const bool is_out_tensor = !out_value.isNone() && out_value.isTensor();
  if (!is_out_tensor) {
    return transformed_value;
  }
  const auto out_tensor = out_value.toTensor();
  return c10::IValue(c10::typeMetaToScalarType(out_tensor.dtype()));
}

// Build the workaround for reduction ops with an output argument.
//
// In case dtype is None, PyTorch meta backend assumes that it is int64_t for
// all integral tensors, causing validation issues when the output tensor has
// int32_t dtype. The workaround replaces a None dtype with the dtype of the
// output tensor.
static auto makeReductionWorkaround(const std::size_t dtype_index,
                                    const std::size_t out_index) {
  auto should_apply = [dtype_index, out_index](const std::size_t value_index,
                                               const c10::IValue &value,
                                               const c10::Stack &ipu_stack) {
    return reductionWorkaroundPredicate(value_index, value, ipu_stack,
                                        dtype_index, out_index);
  };

  auto use_out_dtype = [out_index](const c10::IValue &transformed_value,
                                   const c10::Stack &ipu_stack) {
    return reductionTransform(transformed_value, ipu_stack, out_index);
  };

  return TypeInferenceHandler::Workaround{should_apply, use_out_dtype};
}
// Workarounds applied to operator inputs before meta type inference, keyed by
// schema. Each entry gives the stack positions of the dtype argument and of
// the output tensor whose dtype replaces a None dtype.
// NOTE(review): the indices are taken as given here; verify them against the
// corresponding operator schemas.
const std::unordered_map<std::string, TypeInferenceHandler::Workaround>
    TypeInferenceHandler::schema_to_workaround = {
        {"aten::sum.IntList_out",
         makeReductionWorkaround(3 /*dtype_index*/, 4 /*out_index*/)},
        {"aten::cumsum.out",
         makeReductionWorkaround(2 /*dtype_index*/, 3 /*out_index*/)},
        {"aten::cumprod.out",
         makeReductionWorkaround(2 /*dtype_index*/, 3 /*out_index*/)},
        {"aten::sum.out",
         makeReductionWorkaround(4 /*dtype_index*/, 0 /*out_index*/)},
        {"aten::prod.out",
         makeReductionWorkaround(4 /*dtype_index*/, 0 /*out_index*/)}};
} // namespace poptorch


================================================
FILE: poptorch/source/dispatch_tracer/TypeInferenceHandler.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#ifndef POPTORCH_DISPATCH_TYPE_INFERENCE_HANDLER_HPP_
#define POPTORCH_DISPATCH_TYPE_INFERENCE_HANDLER_HPP_

#include <functional>
#include <optional>
#include <unordered_map>

#include <ATen/Tensor.h>
#include <ATen/core/boxing/KernelFunction.h>

#include "Tensor.hpp"

namespace poptorch {

// Infers the output types of dispatched operators by running them on the
// PyTorch Meta backend, then allocates matching IPU tensors from the tensor
// store.
class TypeInferenceHandler {
public:
  // `tensor_store` is stored as a raw pointer and is not owned by the handler.
  explicit TypeInferenceHandler(TensorStore *tensor_store)
      : _tensor_store(tensor_store) {}

  // Replace the inputs on `ipu_stack` with newly allocated IPU tensors that
  // have the output types inferred for `op`.
  void inferOutputTypes(const c10::OperatorHandle &op, c10::Stack *ipu_stack);

  // An input rewrite applied before meta inference: `transform_fn` is applied
  // to a value when `predicate_fn` (value index, value, stack) returns true.
  struct Workaround {
    std::function<bool(std::size_t, const c10::IValue &, const c10::Stack &)>
        predicate_fn;
    std::function<c10::IValue(const c10::IValue &, const c10::Stack &)>
        transform_fn;
  };

  // Create a meta tensor with the same type as the input
  static at::Tensor toMeta(const at::Tensor &tensor, bool upcast_to_long,
                           bool is_prelu = false);
  // Return the index of the argument that must be upcast from Int to Long for
  // the given schema, if any.
  static c10::optional<std::size_t>
  indexArgToUpcast(const std::string &schema_key);

private:
  // Create a stack of meta tensors that matches the inputs in
  // ipu_stack
  static c10::Stack createMetaStack(const c10::Stack &ipu_stack,
                                    const std::string &schema_key,
                                    bool is_prelu);

  // Using the computed meta output stack, repopulate the ipu stack
  // with tensors of the correct inferred output types
  void repopulateIpuStack(c10::Stack &ipu_stack, const c10::Stack &meta_stack,
                          bool is_prelu);

  // Allocate IPU tensors matching the sizes and scalar types of meta tensors.
  at::Tensor allocateTensor(const at::Tensor &meta_tensor, bool is_prelu);
  c10::List<at::Tensor>
  allocateTensorList(const c10::List<at::Tensor> &meta_tensor_list);

  // Return the workaround registered for the schema, if any.
  static std::optional<Workaround>
  workaroundLookup(const std::string &schema_key);
  // Apply `workaround` to `value` when its predicate accepts it; otherwise
  // return `value` unchanged.
  static c10::IValue applyWorkaround(const Workaround &workaround,
                                     std::size_t value_index,
                                     const c10::IValue &value,
                                     const c10::Stack &stack);
  // Workarounds keyed by operator schema.
  static const std::unordered_map<std::string, Workaround> schema_to_workaround;

  // Store used to allocate the output tensors (not owned).
  TensorStore *_tensor_store;
};
} // namespace poptorch

#endif // POPTORCH_DISPATCH_TYPE_INFERENCE_HANDLER_HPP_


================================================
FILE: poptorch/source/dispatch_tracer/ValueMapper.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#include "ValueMapper.hpp"

#include <memory>
#include <utility>
#include <variant>

#include "Tensor.hpp"
#include "poptorch/DispatchTracer.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"
#include "pytorch_bridge/IpuSession.hpp"

namespace poptorch {

// Move operations and the destructor are defaulted here, in the source file,
// rather than in the class definition.
ValueMapper::ValueMapper(ValueMapper &&other) noexcept = default;
ValueMapper &ValueMapper::operator=(ValueMapper &&other) noexcept = default;
ValueMapper::~ValueMapper() = default;

// Track the given tensor details. `buffer` is initialised from
// getOwningBuffer(), which is nullptr when the details describe a view.
ValueMapper::TrackedTensor::TrackedTensor(
    const std::shared_ptr<IpuTensorDetails> &details)
    : tensor_details(details), buffer(details->getOwningBuffer()) {}

// True when the tensor is tracked by this mapper and flagged as a parameter.
bool ValueMapper::isParameter(const at::Tensor &t) const {
  const auto *record = find(t);
  return record != nullptr && record->is_parameter;
}

// Associate `name` with the parameter tensor `t`.
//
// Logs a warning and returns when the tensor is not tracked, or when it is a
// non floating point tensor that is not flagged as a parameter. Raises an
// error when the tensor is otherwise not a parameter, or when the name or the
// tensor ID is already bound to a different counterpart. Re-registering an
// identical (name, id) pair is a no-op.
void ValueMapper::setParameterName(const at::Tensor &t,
                                   const std::string &name) {
  const IpuTensorId id = ipuTensorId(t);

  const auto itr = _tensors.find(id);
  if (itr == _tensors.end()) {
    logging::warn("Parameter {} cannot be named because it was not added to "
                  "the value mapper.",
                  name);
    return;
  }

  const bool is_floating_point = t.is_floating_point();
  if (!itr->second.is_parameter && !is_floating_point) {
    logging::warn("Parameter {}: {} was downgraded to constant because PopART "
                  "doesn't support non floating point parameters",
                  name, str(t));
    return;
  }

  ERROR_ON_MSG(!itr->second.is_parameter,
               "Not a parameter or a buffer: " << str(t));

  // The name must not already refer to another tensor.
  const auto name_it = _name_ids_map.find(name);
  if (name_it != _name_ids_map.end()) {
    ERROR_ON_MSG(name_it->second != id,
                 "Name " << name << " can't be associated to " << id
                         << " because it is already associated to "
                         << name_it->second);
    return;
  }

  // The tensor must not already carry another name.
  const auto id_it = _ids_name_map.find(id);
  if (id_it != _ids_name_map.end()) {
    ERROR_ON_MSG(id_it->second != name, "Name for tensor id "
                                            << id << " can't be set to " << name
                                            << " because it is already set to "
                                            << id_it->second);
    return;
  }

  // Record the association in both directions.
  _ids_name_map.insert({id, name});
  _name_ids_map.insert({name, id});
}

// Return the parameter name recorded for the tensor mapped to the given JIT
// value, or an empty string when the value is not tracked or has no name.
std::string ValueMapper::getParameterName(torch::jit::Value *value) const {
  const auto tracked = _values_map.find(value);
  if (tracked == _values_map.end()) {
    logging::trace("JIT value not tracked {}", reinterpret_cast<void *>(value));
    return "";
  }

  const auto named = _ids_name_map.find(tracked->second);
  return named == _ids_name_map.end() ? "" : named->second;
}

// Records per-replica settings for the parameter called `param_name`.
//
// The name must have been registered beforehand (see setParameterName);
// otherwise a warning is logged and nothing is stored. `tensor` must be
// contiguous: its bytes are copied into a freshly allocated host buffer that
// is stored alongside the settings, together with `tensor.size(0)`.
void ValueMapper::setParameterPerReplica(const std::string &param_name,
                                         const at::Tensor &tensor,
                                         int comm_group_type, int shards,
                                         int variable_retrieval_mode) {
  const auto named_param = _name_ids_map.find(param_name);
  if (named_param == _name_ids_map.end()) {
    logging::warn("Parameter name {} was not found", param_name);
    return;
  }

  const auto num_bytes = tensorDataSize(tensor);
  ERROR_ON_MSG(!tensor.is_contiguous(),
               "Data source must be contiguous: " << str(tensor));

  // The settings own a private copy of the tensor data so that they do not
  // depend on the lifetime of `tensor`.
  const PerReplicaSettings replica_settings = {
      comm_group_type, shards, variable_retrieval_mode, tensor.size(0),
      std::make_shared<std::vector<char>>(num_bytes)};
  memcpy(replica_settings.host_buffer->data(), tensor.data_ptr(), num_bytes);

  _per_replica_map[named_param->second] = replica_settings;
}
// Returns the per-replica settings stored for the tensor backing the JIT
// `value`, or std::nullopt when the value is not tracked (a trace message is
// logged in that case) or has no per-replica settings.
std::optional<PerReplicaSettings>
ValueMapper::getParameterPerReplica(torch::jit::Value *value) const {
  const auto tracked = _values_map.find(value);
  if (tracked == _values_map.end()) {
    logging::trace("JIT value not tracked {}", reinterpret_cast<void *>(value));
    return std::nullopt;
  }

  const auto settings_it = _per_replica_map.find(tracked->second);
  if (settings_it != _per_replica_map.end()) {
    return settings_it->second;
  }
  return std::nullopt;
}

// Add a tensor to the IR.
void ValueMapper::addTensor(const std::shared_ptr<IpuTensorDetails> &details,
                            poptorch_ir::TensorId mlir_id, bool is_param) {
  logging::trace("Adding {} to value mapper {}, MLIR id: {}",
                 details->tensor_id, static_cast<void *>(this), mlir_id);

  auto tensor_id = details->tensor_id;
  auto &record =
      _tensors.insert({tensor_id, TrackedTensor{details}}).first->second;
  record.mlir = mlir_id;
  record.is_parameter |= is_param;

  _mlir_id_tensors_map.emplace(mlir_id, tensor_id);
}

// Convenience overload: resolves the details of `t` and forwards to the
// details-based overload to bind the tensor to `mlir_id`.
void ValueMapper::addTensor(const at::Tensor &t, poptorch_ir::TensorId mlir_id,
                            bool is_param) {
  const auto &details = getTensorDetails(t);
  addTensor(details, mlir_id, is_param);
}

// Bind the tensor `t` to the JIT value `val` without validating `val` or the
// tensor's shape / type against it (see addTensor for the checked variant).
//
// If `t` is already tracked, its record is updated to point at `val`;
// otherwise a new record is created. The parameter flag can only be raised.
// The JIT value -> tensor id lookup is only populated if `val` was not
// already present in it.
void ValueMapper::addTensorUnchecked(const at::Tensor &t,
                                     torch::jit::Value *val, bool is_param) {
  logging::trace("Adding {} to value mapper {}, JIT ir: {}",
                 static_cast<void *>(t.unsafeGetTensorImpl()),
                 static_cast<void *>(this), val->debugName());

  const auto &details = getTensorDetails(t);
  const auto tensor_id = details->tensor_id;

  // insert() leaves an existing record untouched and hands it back to us.
  const auto inserted = _tensors.insert({tensor_id, TrackedTensor{details}});
  auto &record = inserted.first->second;
  record.jit = val;
  record.is_parameter = record.is_parameter || is_param;

  // Keep the reverse lookup from torch::jit value to PyTorch tensor.
  _values_map.emplace(val, tensor_id);
}
// Checked variant of addTensorUnchecked(): binds the tensor `t` to the JIT
// value `val`.
//
// Raises an error if `val` is null, then runs validateTensorShapeAndType() on
// the pair before recording the association.
void ValueMapper::addTensor(const at::Tensor &t, torch::jit::Value *val,
                            bool is_param) {
  ERROR_ON_MSG(val == nullptr, "torch::jit::Value* cannot be null");
  validateTensorShapeAndType(val, t);

  addTensorUnchecked(t, val, is_param);
}

// Returns the mutable tracking record for `t`, or nullptr if `t` is not
// tracked by this mapper.
ValueMapper::TrackedTensor *ValueMapper::rawTensorRecord(const at::Tensor &t) {
  auto *record = find(t);
  return record;
}

// Returns the mutable tracking record of the tensor bound to the JIT value
// `val`, or nullptr if either the value or the tensor it maps to is unknown.
ValueMapper::TrackedTensor *
ValueMapper::rawTensorRecord(torch::jit::Value *val) {
  const auto id_it = _values_map.find(val);
  if (id_it != _values_map.end()) {
    const auto record_it = _tensors.find(id_it->second);
    if (record_it != _tensors.end()) {
      return &record_it->second;
    }
  }
  return nullptr;
}

// Returns the JIT value currently bound to `t`.
// The result is nullptr when `t` is not an IPU tensor, when it is not tracked
// by this mapper, or when its record has no JIT value.
torch::jit::Value *ValueMapper::getValueForTensor(const at::Tensor &t) {
  // Only IPU tensors are looked up.
  auto *record = isIpuTensor(t) ? find(t) : nullptr;
  if (record == nullptr) {
    return nullptr;
  }
  return record->jit;
}

// Returns the MLIR id recorded for the tensor `tensor_id`, or
// poptorch_ir::tensor_error_id if that tensor is not tracked.
poptorch_ir::TensorId
ValueMapper::getMLIRForTensorId(IpuTensorId tensor_id) const {
  const auto record_it = _tensors.find(tensor_id);
  if (record_it == _tensors.end()) {
    return poptorch_ir::tensor_error_id;
  }
  return record_it->second.mlir;
}
// Returns the MLIR id recorded for the tensor described by `details`, or
// poptorch_ir::tensor_error_id if that tensor is not tracked.
poptorch_ir::TensorId
ValueMapper::getMLIRForTensor(const IpuTensorDetails &details) const {
  const auto *record = find(details);
  if (record == nullptr) {
    return poptorch_ir::tensor_error_id;
  }
  return record->mlir;
}

// Returns the MLIR id recorded for `t`. Non-IPU tensors and untracked tensors
// both yield poptorch_ir::tensor_error_id.
poptorch_ir::TensorId ValueMapper::getMLIRForTensor(const at::Tensor &t) const {
  if (isIpuTensor(t)) {
    const auto &details = getTensorDetails(t);
    return getMLIRForTensor(*details);
  }
  return poptorch_ir::tensor_error_id;
}

// Returns true if `t` has a tracking record in this mapper.
bool ValueMapper::hasMapping(const at::Tensor &t) const {
  const auto *record = find(t);
  return record != nullptr;
}

// Remembers that the list of JIT values `list` is represented by the JIT
// value `val`. An entry already registered for the same list is left as is.
void ValueMapper::addTensorList(const TensorList &list,
                                torch::jit::Value *val) {
  logging::trace("Adding tensor list to value mapper, JIT ir: {}",
                 val->debugName());
  _tensor_lists.emplace(list, val);
}

// Returns the JIT value registered for `list` through addTensorList(), or
// nullptr if that exact list of values was never registered.
torch::jit::Value *ValueMapper::getValueForTensorList(const TensorList &list) {
  const auto entry = _tensor_lists.find(list);
  return entry == _tensor_lists.end() ? nullptr : entry->second;
}

// Re-points every tracked tensor whose JIT value is `v_old` to `v_new`.
// Only the tensor records are updated: the JIT value -> tensor id lookup
// (_values_map) is not modified by this function.
void ValueMapper::replaceValue(torch::jit::Value *v_old,
                               torch::jit::Value *v_new) {
  for (auto it = _tensors.begin(); it != _tensors.end(); ++it) {
    TrackedTensor &record = it->second;
    if (record.jit != v_old) {
      continue;
    }
    record.jit = v_new;
  }
}

// Returns the details of the tracked tensor `id`.
// The result is null if the id is unknown, or if the details have already
// been destroyed (the mapper only holds a weak reference to them).
std::shared_ptr<IpuTensorDetails>
ValueMapper::getTensorDetailsForId(IpuTensorId id) const {
  const auto record_it = _tensors.find(id);
  if (record_it != _tensors.end()) {
    return record_it->second.tensor_details.lock();
  }
  return nullptr;
}

// Returns the details of the tensor bound to the MLIR tensor `id`, or null
// if no tensor was registered with that MLIR id (or its details are gone).
std::shared_ptr<IpuTensorDetails>
ValueMapper::getTensorDetailsForMlirId(poptorch_ir::TensorId id) const {
  const auto entry = _mlir_id_tensors_map.find(id);
  if (entry != _mlir_id_tensors_map.end()) {
    return getTensorDetailsForId(entry->second);
  }
  return nullptr;
}

// Returns a copy of the Buffer attached to the tracked tensor `id`.
//
// A default-constructed Buffer is returned when the tensor is not tracked or
// when its record carries no buffer, so callers never trigger a null
// dereference of the record's shared_ptr.
Buffer ValueMapper::getBufferForId(IpuTensorId id) const {
  const auto it = _tensors.find(id);
  if (it == _tensors.end() || it->second.buffer == nullptr) {
    return Buffer();
  }

  return *it->second.buffer;
}

// Returns the CPU data of the buffer attached to the tensor backing the JIT
// `value`. The result is nullptr when the value is not tracked or when the
// tensor's buffer reports that it has no data.
poptorch_ir::CpuBuffer
ValueMapper::getBufferForValue(torch::jit::Value *value) const {
  const auto id_it = _values_map.find(value);
  if (id_it != _values_map.end()) {
    auto buffer = getBufferForId(id_it->second);
    if (buffer.hasData()) {
      return buffer.getCpuData();
    }
  }
  return nullptr;
}

// Looks up the tracking record by the tensor id stored in `details`.
// Returns nullptr when the tensor is not tracked.
ValueMapper::TrackedTensor *ValueMapper::find(const IpuTensorDetails &details) {
  const auto record_it = _tensors.find(details.tensor_id);
  return record_it != _tensors.end() ? &record_it->second : nullptr;
}
// Const flavour of find(): reuses the non-const lookup and hands the record
// back as a pointer to const.
const ValueMapper::TrackedTensor *
ValueMapper::find(const IpuTensorDetails &details) const {
  auto *mutable_self = const_cast<ValueMapper *>(this);
  return mutable_self->find(details);
}

// Looks up the tracking record of `t` through its tensor details.
// Returns nullptr when the tensor is not tracked.
ValueMapper::TrackedTensor *ValueMapper::find(const at::Tensor &t) {
  const auto &details = getTensorDetails(t);
  return find(*details);
}
// Const flavour of the tensor-based lookup.
// Returns nullptr when the tensor is not tracked.
const ValueMapper::TrackedTensor *ValueMapper::find(const at::Tensor &t) const {
  const auto &details = getTensorDetails(t);
  return find(*details);
}
} // namespace poptorch


================================================
FILE: poptorch/source/dispatch_tracer/ValueMapper.hpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#ifndef POPTORCH_DISPATCH_VALUE_MAPPER_HPP_
#define POPTORCH_DISPATCH_VALUE_MAPPER_HPP_

#include <torch/csrc/jit/ir/ir.h>

#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "Tensor.hpp"
#include "poptorch/DispatchTracer.hpp"
#include "pytorch_bridge/CompilerTypes.hpp"

namespace poptorch {

/*
 * The value mapper is the core of the tracer functionality. It provides the
 * system by which we map an incoming at::Tensor onto the compiler IRs. We take
 * a tensor and disambiguate it into a torch::jit::Value or poptorch compiler
 * TensorID corresponding to the values we are tracking for that tensor in the
 * JIT/MLIR graphs respectively.
 */
class ValueMapper {
private:
  // A list of JIT values, used as the key identifying a tensor list.
  using TensorList = std::vector<torch::jit::Value *>;

  // Hash combine for mapping a vector of jit values (inputs of a
  // prim::ListConstruct) to the output jit value. This allows us to use an
  // unordered_map from TensorList to the output values and thus track the
  // incoming tensor lists. Performance and collisions are not very critical
  // in this scenario as we don't expect models with unreasonably
  // large number of lists.
  struct TensorListHash {
    // Order-sensitive polynomial combination (seed 11, multiplier 31) of the
    // hashes of the individual value pointers.
    size_t operator()(const TensorList &list) const {
      const std::hash<const torch::jit::Value *> hash_func;
      size_t hash = 11;
      for (const auto *value : list) {
        const size_t hash_next = hash_func(value);
        hash = hash * 31 + hash_next;
      }
      return hash;
    }
  };

public:
  ValueMapper() = default;

  // Move-only type: copying is explicitly disabled.
  ValueMapper(ValueMapper &&) noexcept;
  ValueMapper &operator=(ValueMapper &&) noexcept;
  ValueMapper(const ValueMapper &) = delete;
  ValueMapper &operator=(const ValueMapper &) = delete;

  ~ValueMapper();

  // Each tensor we are tracking has a short record containing a pointer to the
  // tensor and its corresponding values in the two IRs.
  struct TrackedTensor {
    explicit TrackedTensor(const std::shared_ptr<IpuTensorDetails> &details);

    // The underlying tensor information. Note that we don't participate in
    // ownership here. We want to tie the lifetime of the tensor details to
    // when the tensor is accessible from pytorch. Note that it isn't sufficient
    // to check whether the tensor is directly accessible from pytorch because
    // the tensor details might be kept alive at the end of a chain of view
    // tensors
    std::weak_ptr<IpuTensorDetails> tensor_details;

    // Whether the tensor was registered as a parameter. Once set by one of the
    // addTensor() overloads it is never cleared by them.
    bool is_parameter = false;

    // We want to track the lifetime of the tensor_details and the buffer
    // separately. This is so we can get the data from inputs to the graph that
    // are temporaries without extending their lifetime
    std::shared_ptr<Buffer> buffer;

    // The value in JIT IR
    torch::jit::Value *jit = nullptr;

    // The value in our mlir backend.
    poptorch_ir::TensorId mlir = poptorch_ir::tensor_error_id;
  };

  // Mutable access to the record of a tensor, looked up either from the
  // tensor itself or from the JIT value it is bound to. Both return nullptr
  // when no record is found.
  TrackedTensor *rawTensorRecord(const at::Tensor &t);

  TrackedTensor *rawTensorRecord(torch::jit::Value *val);

  // JIT value bound to `t`; nullptr if `t` is not an IPU tensor or is not
  // tracked.
  torch::jit::Value *getValueForTensor(const at::Tensor &t);

  // MLIR id bound to a tensor; poptorch_ir::tensor_error_id when the tensor is
  // not tracked (or, for the at::Tensor overload, is not an IPU tensor).
  poptorch_ir::TensorId getMLIRForTensorId(IpuTensorId tensor_id) const;
  poptorch_ir::TensorId getMLIRForTensor(const IpuTensorDetails &details) const;
  poptorch_ir::TensorId getMLIRForTensor(const at::Tensor &t) const;

  // Bind a tensor to a JIT value. addTensor() rejects a null `val` and
  // validates the tensor's shape and type against it before delegating to
  // addTensorUnchecked(), which performs neither check.
  void addTensorUnchecked(const at::Tensor &t, torch::jit::Value *val,
                          bool is_param);
  void addTensor(const at::Tensor &t, torch::jit::Value *val, bool is_param);

  // Bind a tensor to an MLIR tensor id.
  void addTensor(const std::shared_ptr<IpuTensorDetails> &details,
                 poptorch_ir::TensorId mlir_id, bool is_param);
  void addTensor(const at::Tensor &t, poptorch_ir::TensorId mlir_id,
                 bool is_param);

  // Register / look up the JIT value standing for a list of JIT values.
  void addTensorList(const TensorList &list, torch::jit::Value *val);

  torch::jit::Value *getValueForTensorList(const TensorList &list);

  // True if `t` is tracked and flagged as a parameter.
  bool isParameter(const at::Tensor &t) const;

  // Associate a name with a tracked parameter / buffer, and retrieve the name
  // from the JIT value (empty string if there is none).
  void setParameterName(const at::Tensor &t, const std::string &name);
  std::string getParameterName(torch::jit::Value *value) const;

  // Store / retrieve the per-replica settings of a named parameter. The
  // setter copies the data of `tensor` into a host buffer owned by the
  // settings.
  void setParameterPerReplica(const std::string &param_name,
                              const at::Tensor &tensor, int comm_group_type,
                              int shards, int variable_retrieval_mode);
  std::optional<PerReplicaSettings>
  getParameterPerReplica(torch::jit::Value *value) const;

  // Re-point every record whose JIT value is `v_old` to `v_new`.
  void replaceValue(torch::jit::Value *v_old, torch::jit::Value *v_new);

  // Tensor details for a tensor id / MLIR id; null if unknown or if the
  // details no longer exist.
  std::shared_ptr<IpuTensorDetails> getTensorDetailsForId(IpuTensorId id) const;
  std::shared_ptr<IpuTensorDetails>
  getTensorDetailsForMlirId(poptorch_ir::TensorId mlir_id) const;

  // Buffer attached to a tracked tensor, looked up by tensor id or JIT value.
  Buffer getBufferForId(IpuTensorId id) const;
  poptorch_ir::CpuBuffer getBufferForValue(torch::jit::Value *value) const;

  // True if `t` has a record in this mapper.
  bool hasMapping(const at::Tensor &t) const;

private:
  // We map each PyTorch tensor to a record of all the metadata we are tracking
  // about that tensor in the tensor map.
  std::unordered_map<IpuTensorId, TrackedTensor> _tensors;

  // Mapping between parameter / buffer names and tensor IDs
  std::unordered_map<std::string, IpuTensorId> _name_ids_map;
  std::unordered_map<IpuTensorId, std::string> _ids_name_map;

  // Per-replica settings, keyed by the tensor id of the parameter.
  std::unordered_map<IpuTensorId, PerReplicaSettings> _per_replica_map;

  // We also need to map the values to the mlir so we can query the mlir for a
  // given value.
  std::unordered_map<torch::jit::Value *, IpuTensorId> _values_map;

  // Map each prim::ListConstruct to a corresponding jit output value.
  std::unordered_map<TensorList, torch::jit::Value *, TensorListHash>
      _tensor_lists;

  // For resolving aliases, it's useful to find a TrackedTensor from its id.
  std::unordered_map<poptorch_ir::TensorId, IpuTensorId> _mlir_id_tensors_map;

  // Record lookup helpers: return nullptr when the tensor is not tracked.
  TrackedTensor *find(const IpuTensorDetails &details);
  const TrackedTensor *find(const IpuTensorDetails &details) const;

  TrackedTensor *find(const at::Tensor &t);
  const TrackedTensor *find(const at::Tensor &t) const;
};

} // namespace poptorch

#endif // POPTORCH_DISPATCH_VALUE_MAPPER_HPP_


================================================
FILE: poptorch/source/dispatch_tracer/dispatchers/IDispatch.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include "IDispatch.hpp"

#include <memory>

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "../CommonHelperFunctions.hpp"
#include "../Tensor.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"

namespace poptorch {

// Builds a dispatcher bound to `tensor_store`.
// Raises an error if `tensor_store` is null; the pointer is stored as is.
IDispatch::IDispatch(TensorStore *tensor_store) {
  ERROR_ON(tensor_store == nullptr);
  _tensor_store = tensor_store;
}

// Converts the Python call `stack` into a source range and makes it the
// dispatcher's current code location.
void IDispatch::setPythonStack(
    const std::vector<torch::jit::StackEntry> &stack) {
  const auto source_range = getPythonInterpreterSourceRange(stack);
  setCurrentCodeLocation(source_range);
}

// Returns a pointer to the CPU data backing the JIT `value`, or nullptr (with
// a trace message) when the mapper has no buffer for it.
void *IDispatch::getDataSource(torch::jit::Value *value) {
  auto cpu_buffer = _mapper.getBufferForValue(value);
  if (cpu_buffer != nullptr) {
    return cpu_buffer->data();
  }
  logging::trace("JIT value not tracked {}", reinterpret_cast<void *>(value));
  return nullptr;
}

// Returns true if the value mapper tracks `t` as a parameter.
bool IDispatch::isParameter(const at::Tensor &t) const {
  const bool tracked_as_parameter = _mapper.isParameter(t);
  return tracked_as_parameter;
}

// Returns whether the tensor bound to the JIT `value` is flagged as a
// parameter. Unlike the at::Tensor overload, an untracked value is an error
// rather than "not a parameter".
bool IDispatch::isParameter(torch::jit::Value *value) {
  auto *record = _mapper.rawTensorRecord(value);
  ERROR_ON_MSG(record == nullptr,
               "JIT value not tracked " << reinterpret_cast<void *>(value));
  return record->is_parameter;
}

// Forwards to the value mapper to associate `name` with `tensor`.
void IDispatch::setParameterName(const at::Tensor &tensor,
                                 const std::string &name) {
  _mapper.setParameterName(tensor, name);
}

// Returns the name the value mapper holds for the tensor bound to `value`
// (empty if there is none).
std::string IDispatch::getParameterName(torch::jit::Value *value) const {
  std::string parameter_name = _mapper.getParameterName(value);
  return parameter_name;
}

// Forwards the per-replica settings of the parameter `param_name` to the
// value mapper, unchanged.
void IDispatch::setParameterPerReplica(const std::string &param_name,
                                       const at::Tensor &tensor,
                                       int comm_group_type, int shards,
                                       int variable_retrieval_mode) {
  _mapper.setParameterPerReplica(param_name, tensor, comm_group_type, shards,
                                 variable_retrieval_mode);
}

// Fetches the per-replica settings attached to the tensor bound to `value`.
// On success `settings` is overwritten and true is returned; otherwise
// `settings` is left untouched and false is returned.
bool IDispatch::getParameterPerReplica(torch::jit::Value *value,
                                       PerReplicaSettings &settings) const {
  auto maybe_settings = _mapper.getParameterPerReplica(value);
  const bool found = maybe_settings.has_value();
  if (found) {
    settings = std::move(*maybe_settings);
  }
  return found;
}

// Forwards to the value mapper so that records pointing at `v_old` point at
// `v_new` instead.
void IDispatch::replaceValue(torch::jit::Value *v_old,
                             torch::jit::Value *v_new) {
  _mapper.replaceValue(v_old, v_new);
}

// adapted from torch/csrc/jit/python/python_tracer.cpp because the header file
// had too many dependencies
// Builds a torch::jit::SourceRange describing the Python call stack `cs`.
//
// The text of the returned source is the whole stack, one
// "<file>(<line>): <function>\n" entry per frame that has file information.
// The filename / line attached to the source are those of the first frame
// whose file name doesn't contain any of the strings returned by
// getSourceLocationExcludes(); if there is no such frame the filename is left
// unset and the line is 0.
torch::jit::SourceRange IDispatch::getPythonInterpreterSourceRange(
    const std::vector<torch::jit::StackEntry> &cs) const {

  // Bind by reference: getSourceLocationExcludes() returns a reference and
  // this function runs for every dispatched op, so copying the list (and each
  // of its entries below) would be pure overhead.
  const auto &excludes = getSourceLocationExcludes();
  const auto is_filename_excluded = [&excludes](std::string_view filename) {
    const auto excludes_filename =
        [&filename](const std::vector<char> &exclude) {
          return filename.find(std::string_view(
                     exclude.data(), exclude.size())) != std::string_view::npos;
        };
    return std::any_of(excludes.begin(), excludes.end(), excludes_filename);
  };

  // Append in place rather than threading the string by value through an
  // accumulator, which would copy the growing trace once per frame.
  std::string stack_trace;
  for (const torch::jit::StackEntry &entry : cs) {
    auto file_line_col = entry.range.file_line_col();
    if (file_line_col) {
      const auto &[file, line, col] = *file_line_col;
      UNUSED(col);
      stack_trace +=
          file + "(" + std::to_string(line) + "): " + entry.filename + "\n";
    }
  }

  // First frame with file information that isn't filtered out.
  auto val = std::find_if(
      cs.begin(), cs.end(),
      [&is_filename_excluded](const torch::jit::StackEntry &entry) {
        auto file_line_col = entry.range.file_line_col();
        if (file_line_col) {
          return !is_filename_excluded(std::get<0>(*file_line_col));
        }
        return false;
      });

  c10::optional<std::string> source_filename;
  std::size_t source_line = 0;
  if (val != cs.end()) {
    std::size_t col = 0;
    std::tie(source_filename, source_line, col) = *val->range.file_line_col();
  }

  auto source = std::make_shared<torch::jit::Source>(
      stack_trace, source_filename, source_line);
  logging::trace("Setting op source to: {}:{}",
                 source_filename.value_or("<unknown>"), source_line);
  return torch::jit::SourceRange(source, 0, stack_trace.size());
}

// Resets the current source location when the dispatcher is destroyed.
IDispatch::~IDispatch() { resetCurrentSourceLocation(); }

} // namespace poptorch


================================================
FILE: poptorch/source/dispatch_tracer/dispatchers/IDispatch.hpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#ifndef POPTORCH_IDISPATCH_H_
#define POPTORCH_IDISPATCH_H_

#include <ATen/Tensor.h>
#include <ATen/core/boxing/KernelFunction.h>
#include <c10/util/Optional.h>
#include <torch/csrc/jit/frontend/source_range.h>
#include <torch/csrc/jit/ir/ir.h>

#include <string>
#include <vector>

#include "../ValueMapper.hpp"

namespace poptorch {

// Base class of the dispatchers. It owns the ValueMapper used to relate
// at::Tensors to values in the graph being built and implements the bookkeeping
// shared by all dispatchers (parameter names, per-replica settings, source
// locations); graph construction itself is left to the pure virtual methods.
class IDispatch {
public:
  // `tensor_store` must not be null.
  explicit IDispatch(TensorStore *tensor_store);

  // Move-only type: copying is explicitly disabled.
  IDispatch(IDispatch &&other) noexcept = default;
  IDispatch &operator=(IDispatch &&other) noexcept = default;
  IDispatch(const IDispatch &other) noexcept = delete;
  IDispatch &operator=(const IDispatch &other) noexcept = delete;

  virtual ~IDispatch();

  // Register a graph input: cpu_tensor is the CPU tensor providing the data,
  // ipu_tensor is the IPU tensor it is bound to.
  virtual void addInput(const at::Tensor &cpu_tensor,
                        const at::Tensor &ipu_tensor) = 0;
  // Register a constant: cpu_tensor is the CPU tensor providing the data,
  // ipu_tensor is the IPU tensor it is bound to.
  virtual void addConstant(const at::Tensor &cpu_tensor,
                           const at::Tensor &ipu_tensor) = 0;
  // Register a parameter: cpu_tensor is the CPU tensor providing the data,
  // ipu_tensor is the IPU tensor it is bound to.
  virtual void addParameter(const at::Tensor &cpu_tensor,
                            const at::Tensor &ipu_tensor) = 0;
  // Source tensor is an IPU tensor, destination is a CPU tensor.
  virtual void addOutput(const at::Tensor &ipu_src,
                         const at::Tensor &cpu_dest) = 0;
  virtual void finalizeGraph() = 0;

  // Derive the current code location from the Python call stack.
  void setPythonStack(const std::vector<torch::jit::StackEntry> &stack);

  // The "catch-all" fallback kernel.
  virtual void fallback(const c10::OperatorHandle &op, c10::Stack *stack) = 0;

  virtual void detach(const c10::OperatorHandle &op, c10::Stack *stack,
                      bool moving_parameters) = 0;

  // Rather than have each empty overload requiring a specialised kernel we
  // simply ask the dispatchers to acknowledge the created empty tensor and we
  // create it manually in the base function registration.
  virtual void registerEmptyTensor(const at::Tensor &empty, bool is_param) = 0;

  // The tensor overload returns false for untracked tensors whereas the JIT
  // value overload raises an error if the value is not tracked.
  bool isParameter(const at::Tensor &t) const;
  // Pointer to the CPU data backing `val`, or nullptr if there is none.
  void *getDataSource(torch::jit::Value *val);
  bool isParameter(torch::jit::Value *val);

  void replaceValue(torch::jit::Value *v_old, torch::jit::Value *v_new);

  void setParameterName(const at::Tensor &tensor, const std::string &name);
  std::string getParameterName(torch::jit::Value *val) const;

  void setParameterPerReplica(const std::string &param_name,
                              const at::Tensor &tensor, int comm_group_type,
                              int shards, int variable_retrieval_mode);
  // Returns false and leaves `settings` untouched if `value` has no
  // per-replica settings.
  bool getParameterPerReplica(torch::jit::Value *value,
                              PerReplicaSettings &settings) const;

protected:
  // We use the value mapper to map between incoming at::Tensors and JIT/MLIR
  // types.
  ValueMapper _mapper;

  // Used to create and manage tensors. This is a raw pointer to ensure this is
  // trivially copyable, but must never be nullptr.
  TensorStore *_tensor_store;

  // Substrings of file names which must not be reported as the location of an
  // op (see getPythonInterpreterSourceRange).
  virtual const std::vector<std::vector<char>> &
  getSourceLocationExcludes() const = 0;
  virtual void
  setCurrentCodeLocation(const torch::jit::SourceRange &source_location) = 0;

private:
  // Turns a Python call stack into a source range whose text is the stack
  // trace and whose file / line point at the first non-excluded frame.
  torch::jit::SourceRange getPythonInterpreterSourceRange(
      const std::vector<torch::jit::StackEntry> &cs) const;
};

} // namespace poptorch

#endif // POPTORCH_IDISPATCH_H_


================================================
FILE: poptorch/source/dispatch_tracer/dispatchers/JitDispatch.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#include "JitDispatch.hpp"

#include <memory>
#include <string>
#include <unordered_set>
#include <utility>

#include "../../PoptorchSymbols.hpp"
#include "../../popart_canonicalization/PopartCanonicalizationUtils.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/TypeAndConstantCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "pytorch_bridge/CompilerOptions.hpp"

#include "../CommonHelperFunctions.hpp"
#include "../Tensor.hpp"

namespace poptorch {

// Scope guard for the current metadata string: the constructor sets it to
// `metadata` and the destructor resets it to the empty string.
//
// Note that the previous metadata is not restored on destruction, so guards
// should not be nested.
class WithMetadata {
public:
  explicit WithMetadata(const std::string &metadata) {
    setCurrentMetadata(metadata);
  }
  // Non-copyable: destroying a copy would clear the metadata while the
  // original guard is still in scope.
  WithMetadata(const WithMetadata &) = delete;
  WithMetadata &operator=(const WithMetadata &) = delete;

  ~WithMetadata() { setCurrentMetadata(""); }
};

// Returns the textual form of `graph`, limited to its trailing lines.
//
// The number of trailing line breaks to keep is read once from the
// POPTORCH_MAX_GRAPH_LEN environment variable (default: 10). A value <= 0
// disables truncation. When the graph is truncated the result is prefixed
// with "[...truncated...]".
std::string truncateGraphString(torch::jit::Graph &graph) {
  static const int num_lines_max = []() {
    if (const char *graph_len = std::getenv("POPTORCH_MAX_GRAPH_LEN")) {
      const int n = std::stoi(graph_len);
      logging::trace("POPTORCH_MAX_GRAPH_LEN={}", n);
      return n;
    }
    const int n = 10;
    logging::trace("POPTORCH_MAX_GRAPH_LEN not set, defaulting to {}", n);
    return n;
  }();

  std::string s = graph.toString();
  if (num_lines_max <= 0 || s.empty()) {
    return s;
  }
  // Walk backwards from the end, one line break at a time.
  size_t start = s.size();
  for (int i = 0; i < num_lines_max; i++) {
    if (start == 0) {
      // The previous line break was the very first character: there is
      // nothing left in front of it. Without this check `start - 1` would
      // wrap around to npos and restart the search from the end of the string.
      return s;
    }
    start = s.rfind('\n', start - 1);
    if (start == std::string::npos) {
      // Didn't find another new line: print everything.
      return s;
    }
  }
  // Keep everything from the last line break found onwards.
  return "[...truncated...]" + s.substr(start);
}

// Creates a dispatcher building a fresh, empty JIT graph.
// `tensor_store` is handed to the IDispatch base (which rejects null) and to
// the type inference handler; `options` is kept in _opts.
JITDispatch::JITDispatch(const CompilerOptions &options,
                         TensorStore *tensor_store)
    : IDispatch(tensor_store), graph(std::make_shared<torch::jit::Graph>()),
      _opts(options), _type_inference_handler(tensor_store) {}

// Inserts the content of `cpu_tensor`, converted to the scalar type of
// `ipu_tensor`, as a constant in the graph and binds `ipu_tensor` to it (as a
// non-parameter). `cpu_tensor` must live on the CPU.
void JITDispatch::addConstant(const at::Tensor &cpu_tensor,
                              const at::Tensor &ipu_tensor) {
  ERROR_ON(!cpu_tensor.unsafeGetTensorImpl()->is_cpu());

  const auto typed_source = cpu_tensor.to(ipu_tensor.scalar_type());

  // Nodes created from here on are tagged as "constant".
  const WithMetadata metadata("constant");
  auto *constant_value = insertConstant(graph.get(), typed_source);

  logging::trace("[DISPATCHER] Adding constant: Value {} with cpu ptr {}",
                 static_cast<void *>(constant_value), cpu_tensor.data_ptr());

  _mapper.addTensor(ipu_tensor, constant_value, false);
}

// Shared implementation of addInput() / addParameter().
//
// Copies the content of `cpu_tensor` (converted to the dtype of `ipu_tensor`)
// into the tensor store, adds a new graph input typed after `ipu_tensor`
// (keeping the requires_grad flag of `cpu_tensor`) and binds `ipu_tensor` to
// it. `cpu_tensor` must live on the CPU and pass errorOnZeroSizedTensor().
void JITDispatch::addTensor(const at::Tensor &cpu_tensor,
                            const at::Tensor &ipu_tensor, bool is_parameter) {
  ERROR_ON(!cpu_tensor.unsafeGetTensorImpl()->is_cpu());
  errorOnZeroSizedTensor(cpu_tensor);

  const auto host_data = cpu_tensor.to(ipu_tensor.dtype());
  _tensor_store->copyFromCpu(ipu_tensor, host_data);

  torch::jit::Value *graph_input = graph->addInput(cpu_tensor.name());
  setSourceRangeToCurrentLocation(graph_input->node());
  const auto input_type = c10::TensorType::create(ipu_tensor)
                              ->withRequiresGrad(cpu_tensor.requires_grad());
  graph_input->setType(input_type);

  logging::trace("[DISPATCHER] Adding {}: Value {} with cpu ptr {}",
                 is_parameter ? "parameter" : "input",
                 static_cast<void *>(graph_input), host_data.data_ptr());
  _inplace_tracker.addTensor(graph_input);

  _mapper.addTensor(ipu_tensor, graph_input, is_parameter);
}

// Registers `cpu_tensor` / `ipu_tensor` as a (non-parameter) graph input.
// The work is tagged with the "input" metadata.
void JITDispatch::addInput(const at::Tensor &cpu_tensor,
                           const at::Tensor &ipu_tensor) {
  const WithMetadata metadata("input");
  addTensor(cpu_tensor, ipu_tensor, /* is_parameter= */ false);
}

// Registers `cpu_tensor` / `ipu_tensor` as a parameter of the graph.
// Floating point tensors become real parameters; anything else is added as a
// constant instead. The work is tagged with the "parameter" metadata.
void JITDispatch::addParameter(const at::Tensor &cpu_tensor,
                               const at::Tensor &ipu_tensor) {
  const WithMetadata metadata("parameter");
  if (at::isFloatingType(cpu_tensor.scalar_type())) {
    addTensor(cpu_tensor, ipu_tensor, /* is_parameter= */ true);
  } else {
    // PopART doesn't allow non-floating point variables so add them as
    // constants instead. These will be deleted from parameters and buffers
    // in python before passed to lowering.
    addConstant(cpu_tensor, ipu_tensor);
  }
}

// Registers the JIT value bound to `ipu_src` as an output of the graph.
//
// `ipu_src` must be tracked by the value mapper (error otherwise). If the
// value is itself a graph input, a clone of it is registered instead.
// `cpu_dest` is ignored by this dispatcher.
void JITDispatch::addOutput(const at::Tensor &ipu_src,
                            const at::Tensor &cpu_dest) {
  const WithMetadata metadata("output");
  // The PopART backend will allocate its own buffers: ignore cpu_dest.
  UNUSED(cpu_dest);
  auto *record = _mapper.rawTensorRecord(ipu_src);
  ERROR_ON_MSG(record == nullptr,
               "Internal: graph output tensor not present in value mapper "
                   << static_cast<void *>(&_mapper) << " for "
                   << static_cast<void *>(ipu_src.unsafeGetTensorImpl()));

  torch::jit::Value *output = record->jit;

  // If the output is an input: add an identity op to make sure the graph
  // is not empty.
  for (torch::jit::Value *input : graph->inputs()) {
    if (input == output) {
      // aten::clone is given a None constant as its second input.
      auto *none = graph->createNone();
      insertNodeInGraph(graph.get(), none);
      output = createAndInsertNode(graph.get(), c10::aten::clone,
                                   {output, none->output()}, ImplicitCast::None,
                                   OutputType::AsFirstInput)
                   ->output();
      break;
    }
  }

  logging::trace(
      "[DISPATCHER][JIT] Graph output: Tensor ptr {}, jit ir %{} "
      "(scalar type {})",
      reinterpret_cast<void *>(ipu_src.unsafeGetTensorImpl()),
      output->debugNameBase(),
      output->type()->expect<c10::TensorType>()->scalarType().value_or(
          at::ScalarType::Undefined));

  graph->registerOutput(output);
}

// Called once the graph is complete: the only thing done here is resetting
// the current Python code location to a default-constructed one.
void JITDispatch::finalizeGraph() {
  // Clear the code location
  setCurrentPythonCodeLocation({});
}

// Acknowledges an already created empty `tensor`: inserts a matching
// aten::empty node in the graph and binds `tensor` to its output, flagged as
// a parameter if `is_param` is set.
//
// Raises an internal error if the scalar type of `tensor` is not left
// unchanged by coerceToSupportedType().
void JITDispatch::registerEmptyTensor(const at::Tensor &tensor, bool is_param) {
  const WithMetadata metadata("empty");
  // Do not call copyAndCoerceType from this method:
  // the source tensor hasn't been added to the mapper yet.

  // The tensor shouldn't need converting anyway: it should be created with a
  // valid type.
  const auto coerced_scalar_type = coerceToSupportedType(tensor.scalar_type());
  ERROR_ON_MSG(
      coerced_scalar_type != tensor.scalar_type(),
      "[Internal error] The empty tensor should have a valid compiler type");
  // aten::empty.memory_format(int[] size, *, ScalarType? dtype=None,
  //                           Layout? layout=None, Device? device=None,
  //                           bool? pin_memory=None,
  //                           MemoryFormat? memory_format=None) -> Tensor
  auto *g = graph.get();
  // pin_memory and memory_format are passed as None.
  auto *const pin_memory = g->createNone();
  auto *const memory_format = g->createNone();
  insertNodeInGraph(g, pin_memory);
  insertNodeInGraph(g, memory_format);
  // The remaining arguments (size, dtype, layout, device) are constants taken
  // from the tensor itself.
  torch::jit::Node *n = createAndInsertNode(
      g, c10::aten::empty,
      {insertConstant(g, tensor.sizes()),
       insertConstant(g, tensor.scalar_type()),
       insertConstant(g, tensor.layout()), insertConstant(g, tensor.device()),
       pin_memory->output(), memory_format->output()});
  n->output()->inferTypeFrom(tensor);
  setSourceRangeToCurrentLocation(n);
  _mapper.addTensor(tensor, n->output(), is_param);
}

// aten::detach(Tensor(a) self) -> (Tensor(a))
//
// When `moving_parameters` is false the op is handled by fallback().
// Otherwise no node is added to the graph: the single tensor argument is
// replaced on the stack by a detached shallow copy which is registered in the
// value mapper, as a parameter, against the same JIT value as the source.
void JITDispatch::detach(const c10::OperatorHandle &op, c10::Stack *stack,
                         bool moving_parameters) {
  // We only handle the special case when we're moving parameters here. If we're
  // not moving parameters, we'll defer to the fallback and actually create a
  // dispatch op on the PopART graph.
  if (!moving_parameters) {
    fallback(op, stack);
    return;
  }

  const c10::FunctionSchema &schema = op.schema();
  const auto num_arguments = schema.arguments().size();
  const auto arguments = torch::jit::last(stack, num_arguments);

  ERROR_ON(arguments.size() != 1);
  const at::Tensor in = arguments.front().toTensor();

  // The copy is created with the version counter of the source tensor.
  const at::Tensor out(in.unsafeGetTensorImpl()->shallow_copy_and_detach(
      /*version_counter=*/in.unsafeGetTensorImpl()->version_counter(),
      /*allow_tensor_metadata_change=*/true));

  // The new tensor points at the same JIT value as the source.
  _mapper.addTensor(out, _mapper.getValueForTensor(in), true);

  // Replace the op's arguments on the stack with its result.
  torch::jit::drop(stack, num_arguments);
  torch::jit::push(stack, out);
}

// Returns the source location excludes configured in the dispatcher section
// of the compiler options.
const std::vector<std::vector<char>> &
JITDispatch::getSourceLocationExcludes() const {
  const auto &excludes = _opts.dispatcher.source_location_excludes;
  return excludes;
}

// Makes `source_location` the current Python code location.
void JITDispatch::setCurrentCodeLocation(
    const torch::jit::SourceRange &source_location) {
  setCurrentPythonCodeLocation(source_location);
}

// Convert the operation into our normal IR style operation.
//
// Walks the values in `stack` and pairs the i-th one with the i-th output of
// `node`, adding outputs to the node as needed:
//  - a tensor is bound to the output in the value mapper and the output's
//    type is inferred from it;
//  - a tensor list gets a list unpack node inserted after it and each element
//    is bound to the matching output of the unpack node;
//  - any other value is skipped (its output slot is still consumed).
void JITDispatch::fixOutput(c10::Stack &stack, torch::jit::Node *node) {
  // Fix up the outputs.
  std::uint32_t output_index = 0;
  for (const c10::IValue &value : stack) {
    // Add any missing outputs. They frequently return scalars which we just
    // ignore here as our canonicalisation only returns tensors.
    while (output_index >= node->outputs().size()) {
      node->addOutput();
    }

    // Start tracking the output tensors, i.e. add them to the value mapper.
    torch::jit::Value *val = node->output(output_index);

    if (value.isTensor()) {
      const at::Tensor tensor = value.toTensor();

      val->inferTypeFrom(tensor);

      _mapper.addTensor(tensor, val, false);

      logging::trace(
          "[DISPATCHER][JIT] Output: Tensor ptr {}, jit ir %{} (scalar type "
          "{})",
          reinterpret_cast<void *>(tensor.unsafeGetTensorImpl()),
          val->debugNameBase(),
          val->type()->expect<c10::TensorType>()->scalarType().value_or(
              at::ScalarType::Undefined));
    } else if (value.isTensorList()) {
      logging::trace("[DISPATCHER][JIT] Output tensor list: jit ir %{}",
                     val->debugName());
      val->setType(value.type()->expect<c10::ListType>());
      const auto tensor_list = value.toTensorVector();
      // Always insert list unpack if output value is a list.
      auto *const unpack = graph->createListUnpack(val, tensor_list.size());
      insertNodeInGraph(graph.get(), unpack);

      for (size_t i = 0; i < tensor_list.size(); ++i) {
        const at::Tensor &tensor = tensor_list.at(i);
        // From here on `val` designates the i-th output of the unpack node.
        val = unpack->output(i);
        // Unlike the single tensor case, the type is inferred from the result
        // of copyAndCoerceType() rather than from the tensor itself.
        val->inferTypeFrom(copyAndCoerceType(tensor));
        _mapper.addTensor(tensor, val, false);
        logging::trace("[DISPATCHER][JIT] Output tensor list element: Tensor "
                       "ptr {}, jit ir %{} {}",
                       reinterpret_cast<void *>(tensor.unsafeGetTensorImpl()),
                       val->debugNameBase(), toString(tensor));
      }
    }

    output_index++;
  }
}

// Generic handler for an intercepted operator.
//
// `op` provides the schema of the operator and `stack` holds its arguments on
// entry. The steps are, in order:
//  1. Collect the inplace / 'out' arguments and erase their current alias in
//     the inplace tracker (the erased values are remembered).
//  2. Lower the call to a JIT node using the schema.
//  3. For inplace ops, insert casts so that castable tensor inputs match the
//     scalar type of the inplace tensor.
//  4. Sanity check the values on the stack, then let the type inference
//     handler compute the outputs.
//  5. Fix up the node outputs (see fixOutput()).
//  6. Re-point every inplace tensor to the JIT value of the matching output.
void JITDispatch::fallback(const c10::OperatorHandle &op, c10::Stack *stack) {
  const c10::FunctionSchema &schema = op.schema();
  // Run through the schema to find out if one of the operators is supposed to
  // be inplace, this could be the 'out' argument of a non-inplace op.
  const std::vector<at::Tensor> inplace_tensors =
      getInplaceArguments(*stack, schema);
  const std::size_t num_inplace_tensors = inplace_tensors.size();

  // aliased_inputs[k] is whatever eraseCurrentAlias() returned for the k-th
  // inplace tensor (possibly nullptr); it is re-registered at the end.
  std::vector<torch::jit::Value *> aliased_inputs(num_inplace_tensors, nullptr);
  std::transform(inplace_tensors.cbegin(), inplace_tensors.cend(),
                 aliased_inputs.begin(), [&](const auto &inplace_tensor) {
                   return _inplace_tracker.eraseCurrentAlias(
                       _mapper.getValueForTensor(inplace_tensor));
                 });

  // Tag all the nodes created by the handler with the initial schema string
  // representation so that they can be traced back to top level ops in the
  // profiler.
  const WithMetadata metadata(c10::toString(schema));

  // Create a fake IR node for us to target using the schema.
  torch::jit::Node *node = lowerFromSchema(schema, stack, *graph, _mapper);
  logging::trace("[DISPATCHER][JIT] Node from schema {}", *node);

  // For inplace ops, cast all input tensors to the same type as the output
  // tensor. (No-op when there is no inplace tensor.)
  for (std::size_t output_tensor_id = 0; output_tensor_id < num_inplace_tensors;
       ++output_tensor_id) {
    const auto output_type = inplace_tensors.at(output_tensor_id).scalar_type();
    const bool output_float = c10::isFloatingType(output_type);
    for (size_t i = 0; i < stack->size(); i++) {
      const c10::IValue &sv = (*stack).at(i);
      if (!sv.isTensor()) {
        continue;
      }
      const at::Tensor &tensor = sv.toTensor();
      const auto input_type = tensor.scalar_type();
      const bool input_float = c10::isFloatingType(input_type);
      // Only cast defined tensors of a different type, within the same
      // float / non-float family, and only if the cast is allowed.
      if (input_type == at::ScalarType::Undefined ||
          input_type == output_type || input_float != output_float ||
          !canCast(input_type, output_type)) {
        continue;
      }

      torch::jit::Value *jv = node->input(i);
      {
        // Insert the cast right before `node`. The guard restores the previous
        // insertion point when leaving the scope, including when creating the
        // cast throws.
        const torch::jit::WithInsertPoint insert_point_guard(node);
        torch::jit::Node *cast =
            createAndInsertCastOp(graph.get(), jv, output_type);
        node->replaceInputWith(jv, cast->output());
      }
    }
  }

  // Sanity check the values on the stack before running the type inference:
  // lists are walked recursively, a tensor of type 'Undefined' must be empty,
  // and tuples / dictionaries are rejected as they are not supported yet.
  const std::function<void(const c10::IValue &value)> process_value =
      [&](const c10::IValue &value) {
        if (value.isList()) {
          for (const auto &v : value.toList()) {
            process_value(v);
          }
        } else if (value.isTensor()) {
          const at::Tensor &tensor = value.toTensor();
          // Sometimes Undefined is used to mark an optional tensor as not set.
          if (tensor.scalar_type() == at::ScalarType::Undefined) {
            ERROR_ON_MSG(
                tensor.numel() != 0,
                "[Internal error] Non-empty tensor of type 'Undefined'");
            // No need to register the tensor if it's undefined.
            return;
          }
        } else {
          // If this assertion is hit then we need to add support for this kind
          // of value by going through the container and identifying all the
          // tensors.
          ERROR_ON_MSG(value.isTuple() || value.isGenericDict(),
                       "[Internal] Support for container "
                           << value.tagKind() << " not implemented");
        }
      };
  for (const c10::IValue &value : *stack) {
    process_value(value);
  }
  _type_inference_handler.inferOutputTypes(op, stack);

  // Fix the fake tensor so it can still work with our canonicalisation
  // functions which check the output.
  fixOutput(*stack, node);

  logging::trace("[DISPATCHER][JIT] Pre canonicalisation {}", *node);

  // Trace every value left on the stack (bound by reference: no need to copy
  // the IValue or the tensor just to log them).
  std::size_t i = 0;
  for (const c10::IValue &value : *stack) {
    if (value.isTensor()) {
      const at::Tensor &tensor = value.toTensor();
      logging::trace(
          "[DISPATCHER][JIT] Node tensor output at index {} size: ={}", i++,
          tensor.sizes());
    } else {
      logging::trace("[DISPATCHER][JIT] Node scalar output at index {}", i++);
    }
  }

  // Switcheroo the output so the inplace tensor reference is now pointing to
  // the output. The k-th stack entry is used as the output of the k-th
  // inplace tensor.
  for (std::size_t output_tensor_id = 0; output_tensor_id < num_inplace_tensors;
       ++output_tensor_id) {
    const at::Tensor output = stack->at(output_tensor_id).toTensor();

    // Get the jit value we are tracking for the output.
    torch::jit::Value *const value = _mapper.getValueForTensor(output);
    // If the modified inplace tensor was an alias for an input then
    // register the new alias.
    torch::jit::Value *const aliased_input =
        aliased_inputs.at(output_tensor_id);
    if (aliased_input != nullptr) {
      _inplace_tracker.registerAlias(aliased_input, value);
    }

    // Overwrite the inplace tensor with that jit. Now a reference to the
    // inplace tensor correctly points to this outplace value.
    const auto &inplace_tensor = inplace_tensors.at(output_tensor_id);
    ValueMapper::TrackedTensor *const record =
        _mapper.rawTensorRecord(inplace_tensor);

    ERROR_ON_MSG(
        !record,
        "[DISPATCHER][JIT] Inplace op is not tracking inplace argument");

    // Ensure the value and torch tensor shapes match
    const JitTensorInfo value_info(value);
    inplace_tensor.unsafeGetTensorImpl()->set_sizes_contiguous(value_info.dims);

    // Validate to make sure the data type also matches.
    validateTensorShapeAndType(value, inplace_tensor);
    record->jit = value;
  }
}

// Ask the inplace tracker to finalize the graph being built and hand back the
// resulting description. Both arguments are forwarded unchanged.
InplaceGraphInfo
JITDispatch::finalizeInplaceGraphInfo(size_t num_anchors,
                                      bool replicas_needing_broadcast) {
  torch::jit::Graph &jit_graph = *graph;
  return _inplace_tracker.finalizeGraph(jit_graph, num_anchors,
                                        replicas_needing_broadcast);
}

} // namespace poptorch


================================================
FILE: poptorch/source/dispatch_tracer/dispatchers/JitDispatch.hpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#ifndef POPTORCH_DISPATCH_JIT_DISPATCH_HPP_
#define POPTORCH_DISPATCH_JIT_DISPATCH_HPP_

#include <torch/csrc/jit/ir/ir.h>

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "pytorch_bridge/CompilerOptions.hpp"

#include "../TypeInferenceHandler.hpp"
#include "../ValueMapper.hpp"
#include "IDispatch.hpp"

#include "poptorch/InplaceOps.hpp"

namespace poptorch {
struct CompilerOptions;

// IDispatch implementation which records the dispatched operations as nodes
// of a torch::jit::Graph (see `graph`).
class JITDispatch final : public IDispatch {
public:
  JITDispatch(const CompilerOptions &options, TensorStore *tensor_store);

  // The JIT graph we are building up.
  std::shared_ptr<torch::jit::Graph> graph;

  // IDispatch overrides used to declare the tensors and to finish the graph.
  // NOTE(review): presumably each cpu/ipu pair is the host tensor and its IPU
  // counterpart — confirm against IDispatch.hpp.
  void addConstant(const at::Tensor &cpu_tensor,
                   const at::Tensor &ipu_tensor) final;
  void addInput(const at::Tensor &cpu_tensor,
                const at::Tensor &ipu_tensor) final;
  void addParameter(const at::Tensor &cpu_tensor,
                    const at::Tensor &ipu_tensor) final;
  void addOutput(const at::Tensor &ipu_src, const at::Tensor &cpu_dest) final;
  void finalizeGraph() final;

  // Generic handler: lowers the operator described by `op`, with its
  // arguments on `stack`, to a node of `graph`.
  void fallback(const c10::OperatorHandle &op, c10::Stack *stack) override;

  void detach(const c10::OperatorHandle &op, c10::Stack *stack,
              bool moving_parameters) final;

  void registerEmptyTensor(const at::Tensor &tensor, bool is_param) final;

  // Adds outputs to `node` until there is one per entry of `stack`, then
  // starts tracking the tensor outputs in the value mapper (tensor lists are
  // unpacked first).
  // Node will be updated to the new target post canonicalisation.
  void fixOutput(c10::Stack &stack, torch::jit::Node *node);

  // Forwards to the inplace tracker's finalizeGraph() for `graph`.
  InplaceGraphInfo finalizeInplaceGraphInfo(size_t num_anchors,
                                            bool replicas_needing_broadcast);

private:
  // NOTE(review): looks like the shared implementation behind addInput() /
  // addParameter() — confirm in JitDispatch.cpp.
  void addTensor(const at::Tensor &cpu_tensor, const at::Tensor &ipu_tensor,
                 bool is_parameter);

  const std::vector<std::vector<char>> &getSourceLocationExcludes() const final;
  void
  setCurrentCodeLocation(const torch::jit::SourceRange &source_location) final;

  // Copy of the options passed at construction (presumably; the constructor
  // body is not in this header).
  CompilerOptions _opts;
  // Used by fallback() to compute the outputs of an operator.
  TypeInferenceHandler _type_inference_handler;
  // Keeps track of the aliases created by inplace operations.
  InplaceInputsTracker _inplace_tracker;
};

} // namespace poptorch

#endif // POPTORCH_DISPATCH_JIT_DISPATCH_HPP_


================================================
FILE: poptorch/source/include/poptorch/AliasProcessing.hpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_ALIAS_PROCESSING_H
#define INCLUDE_POPTORCH_ALIAS_PROCESSING_H

namespace torch {
namespace jit {
struct Graph;
} // namespace jit
} // namespace torch

namespace poptorch {

// Remove instances of aten::alias in `graph` by replacing the outputs with
// the original (aliased) output.
//
// The known source of aliases is an operation taking place on a wrapped
// buffer: the returned tensor is aliased and then set to be a member of the
// original (wrapper) subclass.
void resolveAliases(torch::jit::Graph *graph);

} // namespace poptorch

#endif // INCLUDE_POPTORCH_ALIAS_PROCESSING_H


================================================
FILE: poptorch/source/include/poptorch/CompilerOps.inc.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
// Auto generated file, do not modify
// Run `python3 scripts/PopParse.py` to regenerate
// clang-format off

torch::jit::Node* createCopyvarupdate(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createBatchnormalization(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,unsigned int num_outputs,float epsilon,float momentum, unsigned int num_node_outputs);
torch::jit::Node* createBucketize(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args, bool right);
torch::jit::Node* createGroupnormalization(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t num_groups,float epsilon);
torch::jit::Node* createSubsample(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::vector<int64_t> & strides);
torch::jit::Node* createPrinttensor(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t print_gradient,const std::string & title,const int summariseThreshold,const int edgeItems,const int maxLineWidth,const int digits,const int floatFormat,const char separator,const char openBracket,const char closeBracket);
torch::jit::Node* createNop(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createScale(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,float scale);
torch::jit::Node* createScaledadd(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,float scale0,float scale1);
torch::jit::Node* createLstm(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t outputFullSequence);
torch::jit::Node* createGelu(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createGeluErf(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createDetach(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createDepthtospace(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t blocksize,const std::string & mode);
torch::jit::Node* createRound(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createNearbyInt(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createDynamicslice(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::vector<int64_t> axes,std::vector<int64_t> sizes,std::int32_t noOverlap);
torch::jit::Node* createDynamicupdate(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::vector<int64_t> axes,std::vector<int64_t> sizes,std::int32_t noOverlap);
torch::jit::Node* createDynamiczero(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::vector<int64_t> axes,std::vector<int64_t> sizes);
torch::jit::Node* createDynamicadd(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::vector<int64_t> axes,std::vector<int64_t> sizes);
torch::jit::Node* createSequenceslice(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::int32_t zeroUnused);
torch::jit::Node* createL1loss(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const float lambda,std::int32_t reduction);
torch::jit::Node* createNllloss(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::int32_t reduction,std::int32_t ignoreIndex,bool inputIsLogProbability);
torch::jit::Node* createIdentityloss(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::int32_t reduction);
torch::jit::Node* create_ctcloss(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::int32_t reduction,const unsigned int blank,const std::string & outDataType,const bool zeroInfinity);
torch::jit::Node* createCtcbeamsearchdecoder(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,unsigned int blank,unsigned int beamWidth,unsigned int topPaths);
torch::jit::Node* createShapeddropout(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::vector<int64_t> & shape,float ratio);
torch::jit::Node* createAtan2(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createExpm1(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createLog1p(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createFmod(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createRemainder(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createReverse(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::vector<int64_t> & dimensions);
torch::jit::Node* createSlice(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::vector<int64_t> & ends,const std::vector<int64_t> & starts,const std::vector<int64_t> & axes);
torch::jit::Node* createBitwisenot(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createBitwiseand(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createBitwiseor(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createBitwisexor(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createBitwisexnor(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createReducemedian(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::vector<int64_t> axes,int64_t keepdims);
torch::jit::Node* createScatterreduce(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::int32_t axis_size,std::int32_t axis,bool enable_index_broadcast, std::int32_t reduction);
torch::jit::Node* createGroupedscatterreduce(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::int32_t axis_size,std::int32_t axis,std::int32_t group_size, bool enable_index_broadcast, std::int32_t reduction);
torch::jit::Node* createSwish(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createAveragepool(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::vector<int64_t> & kernel_shape,int64_t ceil_mode,int64_t count_include_pad,const std::vector<int64_t> & pads,const std::vector<int64_t> & strides);
torch::jit::Node* createConvinteger(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::vector<int64_t> & dilations,int64_t group,const std::vector<int64_t> & kernel_shape,const std::vector<int64_t> & pads,const std::vector<int64_t> & strides);
torch::jit::Node* createDequantizelinear(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createDropout(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,unsigned int num_outputs,float ratio);
torch::jit::Node* createIsinf(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t detect_negative,int64_t detect_positive);
torch::jit::Node* createMatmulinteger(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createMaxpool(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,unsigned int num_outputs,const std::vector<int64_t> & kernel_shape,int64_t ceil_mode,const std::vector<int64_t> & dilations,const std::vector<int64_t> & pads,int64_t storage_order,const std::vector<int64_t> & strides);
torch::jit::Node* createMod(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t fmod);
torch::jit::Node* createNonmaxsuppression(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t center_point_box);
torch::jit::Node* createQlinearconv(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::vector<int64_t> & dilations,int64_t group,const std::vector<int64_t> & kernel_shape,const std::vector<int64_t> & pads,const std::vector<int64_t> & strides);
torch::jit::Node* createQlinearmatmul(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createQuantizelinear(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createResize(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::string &coordinate_transformation_mode,float cubic_coeff_a,int64_t exclude_outside,float extrapolation_value,const std::string & mode,const std::string &nearest_mode);
torch::jit::Node* createReversesequence(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t batch_axis,int64_t time_axis);
torch::jit::Node* createRoialign(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::string & mode,int64_t output_height,int64_t output_width,int64_t sampling_ratio,float spatial_scale);
torch::jit::Node* createThresholdedrelu(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,float alpha);
torch::jit::Node* createTopk(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t axis, bool largest, bool sorted);
torch::jit::Node* createSort(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t axis, bool descending, bool stable);
torch::jit::Node* createUpsample(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::string & mode);
torch::jit::Node* createAcosh(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createAsinh(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createAtanh(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createCast(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::string & to);
torch::jit::Node* createCompress(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::int32_t axis);
torch::jit::Node* createCosh(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createErf(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createEyelike(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::int32_t dtype,int64_t k);
torch::jit::Node* createFlatten(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t axis);
torch::jit::Node* createGemm(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,float alpha,float beta,int64_t transA,int64_t transB);
torch::jit::Node* createGreater(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createIsnan(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createLess(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createMatmul(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createMaxunpool(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::vector<int64_t> & kernel_shape,const std::vector<int64_t> & pads,const std::vector<int64_t> & strides);
torch::jit::Node* createMeanvariancenormalization(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::vector<int64_t> & axes);
torch::jit::Node* createNonzero(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createOnehot(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t axis);
torch::jit::Node* createScatter(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t axis);
torch::jit::Node* createScatterElements(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t axis);
torch::jit::Node* createShrink(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,float bias,float lambd);
torch::jit::Node* createSign(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createSinh(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createTfidfvectorizer(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t max_gram_length,int64_t max_skip_count,int64_t min_gram_length,const std::string & mode,const std::vector<int64_t> & ngram_counts,const std::vector<int64_t> & ngram_indexes,const std::vector<int64_t> & pool_int64s,const std::vector<std::string> & pool_strings,std::vector<double> weights);
torch::jit::Node* createWhere(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createExpand(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createMax(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createMean(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createMin(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createSum(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createAcos(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createAdd(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createLogical_and(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createAsin(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createAtan(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createCos(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createDiv(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createEqual(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createMul(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createMultinomial(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t dtype,int64_t sample_size,float seed);
torch::jit::Node* createLogical_or(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createPow(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createSin(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createSub(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createTan(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createLogical_xor(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createAbs(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createArgmax(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t axis,int64_t keepdims);
torch::jit::Node* createArgmin(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t axis,int64_t keepdims);
torch::jit::Node* createCeil(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createClip(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createConcat(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t axis);
torch::jit::Node* createConv(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::vector<int64_t> & dilations,int64_t group,const std::vector<int64_t> & kernel_shape,const std::vector<int64_t> & pads,const std::vector<int64_t> & strides);
torch::jit::Node* createConvtranspose(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::vector<int64_t> & dilations,int64_t group,const std::vector<int64_t> & kernel_shape,const std::vector<int64_t> & output_padding,const std::vector<int64_t> & output_shape,const std::vector<int64_t> & pads,const std::vector<int64_t> & strides);
torch::jit::Node* createElu(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,float alpha);
torch::jit::Node* createExp(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createFloor(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createGather(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t axis);
torch::jit::Node* createGroupedgather(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t axis,int64_t group_size);
torch::jit::Node* createGlobalaveragepool(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createGloballppool(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t p);
torch::jit::Node* createGlobalmaxpool(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createHardsigmoid(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,float alpha,float beta);
torch::jit::Node* createHardmax(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t axis);
torch::jit::Node* createIdentity(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createInstancenormalization(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,float epsilon);
torch::jit::Node* createLrn(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t size,float alpha,float beta,float bias);
torch::jit::Node* createLeakyrelu(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,float alpha);
torch::jit::Node* createLog(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createLogsoftmax(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t axis);
torch::jit::Node* createLpnormalization(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t axis,int64_t p);
torch::jit::Node* createLppool(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::vector<int64_t> & kernel_shape,int64_t p,const std::vector<int64_t> & pads,const std::vector<int64_t> & strides);
torch::jit::Node* createMaxroipool(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::vector<int64_t> & pooled_shape,float spatial_scale);
torch::jit::Node* createNeg(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createLogical_not(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createPad(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::string & mode);
torch::jit::Node* createRandomnormallike(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::int32_t dtype,float mean,float scale,float seed);
torch::jit::Node* createRandomuniformlike(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::int32_t dtype,float high,float low,float seed);
torch::jit::Node* createReciprocal(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createReducel1(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::vector<int64_t> axes,int64_t keepdims);
torch::jit::Node* createReducel2(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::vector<int64_t> axes,int64_t keepdims);
torch::jit::Node* createReducelogsum(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::vector<int64_t> axes,int64_t keepdims);
torch::jit::Node* createReducelogsumexp(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::vector<int64_t> axes,int64_t keepdims);
torch::jit::Node* createReducemax(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::vector<int64_t> axes,int64_t keepdims);
torch::jit::Node* createReducemean(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::vector<int64_t> axes,int64_t keepdims);
torch::jit::Node* createReducemin(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::vector<int64_t> axes,int64_t keepdims);
torch::jit::Node* createReduceprod(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::vector<int64_t> axes,int64_t keepdims);
torch::jit::Node* createReducesum(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::vector<int64_t> axes,int64_t keepdims);
torch::jit::Node* createReducesumsquare(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::vector<int64_t> axes,int64_t keepdims);
torch::jit::Node* createRelu(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createSelu(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,float alpha,float gamma);
torch::jit::Node* createShape(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createSigmoid(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createSize(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createSoftmax(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t axis);
torch::jit::Node* createSoftplus(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createSoftsign(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createSpacetodepth(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,int64_t blocksize);
torch::jit::Node* createSplinebasis(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,std::int32_t degree);
torch::jit::Node* createSplineweighting(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createSplit(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,unsigned int num_outputs,int64_t axis,const std::vector<int64_t> & split);
torch::jit::Node* createSqrt(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createSqueeze(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::vector<int64_t> & axes);
torch::jit::Node* createTanh(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createTile(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args);
torch::jit::Node* createTranspose(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::vector<int64_t> & perm);
torch::jit::Node* createUnsqueeze(torch::jit::Graph *graph,  const std::vector<torch::jit::Value *>& args,const std::vector<int64_t> & axes);


================================================
FILE: poptorch/source/include/poptorch/ImplicitCasting.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_IMPLICIT_CASTING_HPP
#define INCLUDE_POPTORCH_IMPLICIT_CASTING_HPP
#include <vector>

namespace c10 {
template <typename T> class ArrayRef;
} // namespace c10

namespace torch {
namespace jit {
template <class T> using ArrayRef = c10::ArrayRef<T>;
struct Graph;
struct Value;
} // namespace jit
} // namespace torch

namespace poptorch {

// Policy passed to implicitCastInputs() (and to the node creation helpers)
// to select which inputs of an op take part in implicit type casting.
// NOTE(review): the meaning of each enumerator is not visible in this header.
// Going by the names, None presumably disables the casting, All applies it to
// every input and the "Except*" values leave the named input(s) out - confirm
// against the implementation before relying on this.
enum class ImplicitCast {
  None,
  All,
  ExceptFirst,
  ExceptSecond,
  ExceptThird,
  ExceptFourthFifth
};

// Policy describing the output type of an op whose inputs were implicitly
// cast.
// NOTE(review): semantics inferred from the enumerator names only (e.g.
// AsPromoted presumably means "same type as the promoted inputs") - confirm.
enum class ImplicitCastOutput { None, AsPromoted, AlwaysBool, AlwaysFloat };

// Apply the implicit casting policy `implicit_cast` to the values referenced
// by `inputs` and return a vector of values.
// NOTE(review): presumably the returned vector holds the (possibly cast)
// replacements for `inputs`, in the same order - confirm in the .cpp file.
std::vector<torch::jit::Value *>
implicitCastInputs(torch::jit::ArrayRef<torch::jit::Value *> *inputs,
                   ImplicitCast implicit_cast);

// Pass removing the dead implicit casts from the graph.
//
// TODO(T55228): remove after we use our own dispatch key.
// With the dispatcher we catch implicit torch casts (intercepted with
// JitDispatch::toCopyInplace) but it seems that in the case of CPU tensors,
// the returned (casted) aten tensors are not reflected in the later ops, i.e.
// we might end up with dead implicit casts in the IR which we clean with this
// pass. The actual poptorch casting is done in our canonicalization handlers
// anyway.
void removeDeadImplicitCasts(torch::jit::Graph *graph);

} // namespace poptorch

#endif // INCLUDE_POPTORCH_IMPLICIT_CASTING_HPP


================================================
FILE: poptorch/source/include/poptorch/InplaceOpsPyTorch.hpp_nolint
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
// Any Modifications to code from PyTorch

// From PyTorch:

// Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
// Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
// Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
// Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
// Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
// Copyright (c) 2011-2013 NYU                      (Clement Farabet)
// Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
// Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
// Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)

// From Caffe2:

// Copyright (c) 2016-present, Facebook Inc. All rights reserved.

// All contributions by Facebook:
// Copyright (c) 2016 Facebook Inc.

// All contributions by Google:
// Copyright (c) 2015 Google Inc.
// All rights reserved.

// All contributions by Yangqing Jia:
// Copyright (c) 2015 Yangqing Jia
// All rights reserved.

// All contributions by Kakao Brain:
// Copyright 2019-2020 Kakao Brain

// All contributions from Caffe:
// Copyright(c) 2013, 2014, 2015, the respective contributors
// All rights reserved.

// All other contributions:
// Copyright(c) 2015, 2016 the respective contributors
// All rights reserved.

// Caffe2 uses a copyright model similar to Caffe: each contributor holds
// copyright over their contributions to Caffe2. The project versioning records
// all such contribution and copyright details. If a contributor wants to further
// mark their specific copyright on a particular contribution, they should
// indicate their copyright solely in the commit message of the change when it is
// committed.

// All rights reserved.

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:

// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.

// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.

// 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
//    and IDIAP Research Institute nor the names of its contributors may be
//    used to endorse or promote products derived from this software without
//    specific prior written permission.

// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.


// From torch/csrc/jit/passes/remove_inplace_ops.cpp which is
// inaccessible from outside the module

namespace torch {
namespace jit {
// Maps an in-place op kind to its out-of-place counterpart.
// Modified from the PyTorch source: only the ops whose out-of-place name
// cannot be formed by simply removing the trailing '_' are listed here.
static const std::unordered_map<NodeKind, NodeKind> inPlaceToOutOfPlace = {
    {aten::zero_, aten::zeros_like},
    {aten::fill_, aten::full_like}};

// Input count associated with each of the in-place ops listed in
// inPlaceToOutOfPlace.
// NOTE(review): presumably the number of inputs the out-of-place replacement
// node is expected to have - confirm against the code using this table.
static const std::unordered_map<NodeKind, int> expectedInputCount = {
    {aten::zero_, 6}, {aten::fill_, 7}};

// Modified from the PyTorch source so that in-place ops which are not in the
// list above are recognised too.
// A node is considered in-place when its qualified kind name ends with exactly
// one underscore: names ending with two underscores (for example
// aten::__and__) are not in-place ops.
bool isInplaceOp(const Node* node) {
  const char *name = node->kind().toQualString();
  const size_t length = strlen(name);

  // The last two characters are inspected below.
  if (length < 2) {
    return false;
  }

  const bool ends_with_underscore = name[length - 1] == '_';
  const bool is_dunder = name[length - 2] == '_';
  return ends_with_underscore && !is_dunder;
}

} // namespace jit
} // namespace torch


================================================
FILE: poptorch/source/include/poptorch/OpBuilder.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_OP_BUILDER_HPP
#define INCLUDE_POPTORCH_OP_BUILDER_HPP
#include <torch/csrc/jit/ir/ir.h>

#include <functional>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>

#include "poptorch/ImplicitCasting.hpp"

#include "poptorch_logging/Error.hpp"

// Represents how the output type of the op is to be determined.
// Passed to createNode() / createAndInsertNode() and setNodeOutputsTypes().
// NOTE(review): the exact rule behind each enumerator lives in the
// implementation; the names suggest "same as input N", "taken from the dtype
// attribute" or a fixed type - confirm before relying on this.
enum class OutputType {
  Unknown,
  AsFirstInput,
  AsThirdInput,
  FirstAsFirstInputSecondAlwaysInt,
  AsImplicitCastPromoted,
  AsDtype,
  AsDtypeOrAsPromoted,
  AlwaysBool,
  AlwaysFloat,
  AlwaysInt,
  AlwaysUint8
};

namespace c10 {
template <class T> class optional;
} // namespace c10

namespace poptorch {

// RAII object to set / clear the current source code location
// and metadata to those attached to the provided node.
// (Useful when creating / replacing nodes in the graph).
// [Important] This is not a stack: the metadata is cleared on
// destruction, it is not restored to whatever was set before the object
// was created.
class WithNodeMetadata {
public:
  // Make the location and metadata attached to `node` the current ones.
  explicit WithNodeMetadata(torch::jit::Node *node);
  // Clear the current location and metadata.
  ~WithNodeMetadata();
};

// Set the current source code location (i.e. all the nodes created
// will appear as having been instantiated from that location).
void setCurrentPythonCodeLocation(
    const torch::jit::SourceRange &source_location);

// Set the current metadata. (All the nodes created
// will have this metadata attached to them).
void setCurrentMetadata(const std::string &metadata);

// NOTE(review): presumably clears the location set by
// setCurrentPythonCodeLocation() (and possibly the metadata too) - confirm
// in the implementation.
void resetCurrentSourceLocation();

// Create a node of the given `kind` with the given `inputs`.
// `implicit_cast` selects which inputs are implicitly cast, `output_type`
// how the type of the `num_outputs` outputs is determined and `dtype` is an
// optional scalar type.
// NOTE(review): unlike createAndInsertNode() this presumably does not insert
// the node in the graph - confirm in the implementation.
torch::jit::Node *createNode(
    torch::jit::Graph *graph, torch::jit::NodeKind kind,
    torch::jit::ArrayRef<torch::jit::Value *> inputs = {},
    ImplicitCast implicit_cast = ImplicitCast::None,
    OutputType output_type = OutputType::Unknown, size_t num_outputs = 1,
    c10::optional<at::ScalarType> dtype = c10::optional<at::ScalarType>());

// Same parameters as createNode(), but the new node is also added to the
// graph (through one of the insert functions declared in this header).
torch::jit::Node *createAndInsertNode(
    torch::jit::Graph *graph, torch::jit::NodeKind kind,
    torch::jit::ArrayRef<torch::jit::Value *> inputs = {},
    ImplicitCast implicit_cast = ImplicitCast::None,
    OutputType output_type = OutputType::Unknown, size_t num_outputs = 1,
    c10::optional<at::ScalarType> dtype = c10::optional<at::ScalarType>());

// All nodes should be added to the jit graph using one of the below `insert`
// functions (or indirectly by using createAndInsertNode()).
// These functions will ensure the new node contains all the required metadata
// before it's added to the graph.

// Insert `new_node` in `graph`.
// NOTE(review): presumably at the graph's current insertion point - confirm.
void insertNodeInGraph(torch::jit::Graph *graph, torch::jit::Node *new_node);

// Insert `new_node` immediately before `insert_point`.
void insertNodeBeforeNode(torch::jit::Node *new_node,
                          torch::jit::Node *insert_point);

// Insert `new_node` immediately after `insert_point`.
void insertNodeAfterNode(torch::jit::Node *new_node,
                         torch::jit::Node *insert_point);

// Insert a constant holding `val` in `graph` and return the resulting value.
torch::jit::Value *insertConstant(torch::jit::Graph *graph,
                                  const torch::jit::IValue &val);

// Set the source range of `node` to the current source code location (see
// setCurrentPythonCodeLocation()).
void setSourceRangeToCurrentLocation(torch::jit::Node *node);

// Set the types of the outputs of `node` according to `implicit_cast` and
// `output_type`.
// Called by createAndInsertNode except in the cases of OutputType::AsDtype and
// OutputType::AsDtypeOrAsPromoted where it should be called manually once the
// dtype attribute is set.
void setNodeOutputsTypes(torch::jit::Node *node, ImplicitCast implicit_cast,
                         OutputType output_type);

// Where a constant is used: only returned to the host, only consumed by
// PopART, or both.
enum class UseOfNode { HostSideOnly, PopARTOnly, HostSideAndPopART };

// Create a poptorch::tensor_constant, poptorch::host_side_tensor_constant
// or poptorch::host_and_ipu_side_tensor_constant node from the given tensors,
// setting the output type accordingly.
// A constant which is simply returned, perhaps as a tuple or list, is labelled
// as a host side constant to prevent it being placed in PopART. A constant
// which is both returned unchanged and used in PopART needs a further pass to
// split it into two constants.
// `constant_use` defaults to UseOfNode::PopARTOnly.
torch::jit::Node *
tensorToConstant(torch::jit::Graph *graph, const at::Tensor &t,
                 UseOfNode constant_use = UseOfNode::PopARTOnly);

// Manually added (as opposed to the autogenerated declarations included at
// the end of this header).

// Create a node reshaping `A` to `new_shape`.
torch::jit::Node *createReshape(torch::jit::Graph *graph, torch::jit::Value *A,
                                const std::vector<int64_t> &new_shape);

// The createConstant* helpers below create a constant node of shape
// `new_shape` from the values in `data`; the suffix names the element type.
// NOTE(review): presumably `data` must contain as many elements as
// `new_shape` describes - confirm in the implementation.
torch::jit::Node *
createConstantLong(torch::jit::Graph *graph,
                   const std::vector<std::int64_t> &data,
                   const std::vector<std::int64_t> &new_shape);

torch::jit::Node *createConstantInt(torch::jit::Graph *graph,
                                    const std::vector<std::int64_t> &data,
                                    const std::vector<std::int64_t> &new_shape);

torch::jit::Node *
createConstantFloat32(torch::jit::Graph *graph, const std::vector<double> &data,
                      const std::vector<std::int64_t> &new_shape);

// Create a constant float that inherits its underlying type (float16/32) from
// tensor t
torch::jit::Node *
createConstantFloatLike(torch::jit::Graph *graph, torch::jit::Value *t,
                        const std::vector<double> &data,
                        const std::vector<std::int64_t> &new_shape);

// Invoke a canonicalization `handler` on a list of values rather than on an
// existing node: the `inputs` are packed in a tuple node created from
// `graph`, which is then handed to the handler in place of the node it would
// normally canonicalize. Returns whatever the handler returns.
template <typename SymbolHandler>
torch::jit::Node *
createHandlerOperation(torch::jit::Graph *graph, SymbolHandler &&handler,
                       torch::jit::ArrayRef<torch::jit::Value *> inputs) {
  return handler(graph, graph->createTuple(inputs));
}

// Create a node calling the custom operation identified by `name`, `domain`
// and `domainVersion`, with `numOutputs` outputs.
// NOTE(review): `attributes_id_str` presumably identifies / encodes the
// attributes of the op (the softplus handler passes a JSON formatted string)
// - confirm in the implementation.
torch::jit::Node *
createCustomOperation(torch::jit::Graph *graph,
                      const std::vector<torch::jit::Value *> &inputs,
                      const std::string &name, const std::string &domain,
                      std::int64_t domainVersion, std::int64_t numOutputs,
                      const std::string &attributes_id_str);

// Create a node casting `A` to the scalar type `scalar`.
torch::jit::Node *createCast(torch::jit::Graph *graph, torch::jit::Value *A,
                             c10::ScalarType scalar);

// Same as createCast() but the destination type is given as a string.
// NOTE(review): the accepted type names are not visible here - confirm.
torch::jit::Node *createInternalCast(torch::jit::Graph *graph,
                                     torch::jit::Value *A,
                                     const std::string &type);

// Create a node padding `A` with the value `constant`.
// NOTE(review): `pad_shape` is presumably in PyTorch's padding layout unless
// `direct_pad_shape_input` is true, in which case it is presumably passed on
// unchanged - confirm in the implementation.
torch::jit::Node *createConstantPad(torch::jit::Graph *graph,
                                    torch::jit::Value *A,
                                    const std::vector<int64_t> &pad_shape,
                                    float constant,
                                    bool direct_pad_shape_input = false);

// Create a node padding `A` using reflection padding.
torch::jit::Node *createReflectionPad(torch::jit::Graph *graph,
                                      torch::jit::Value *A,
                                      const std::vector<int64_t> &pad_shape);

// Create a node padding `A` using edge padding.
torch::jit::Node *createEdgePad(torch::jit::Graph *graph, torch::jit::Value *A,
                                const std::vector<int64_t> &pad_shape);

// Create a node adding `A` and `B`.
// NOTE(review): going by the name the addition is never performed in place -
// confirm in the implementation.
torch::jit::Node *createAddNotInPlace(torch::jit::Graph *graph,
                                      torch::jit::Value *A,
                                      torch::jit::Value *B);

// The helpers below create the marker nodes delimiting control flow
// constructs (for loops and if / else blocks) and their inputs / outputs.
// NOTE(review): the exact contract of each marker is defined by the
// implementation and the passes consuming them - the one-line descriptions
// are derived from the names and parameters only.

// Mark the start of a for loop taking `inputs`.
torch::jit::Node *createStartForLoop(torch::jit::Graph *graph,
                                     torch::jit::Value *inputs);

// Mark the end of a for loop executed `trip_count` times.
torch::jit::Node *createEndForLoop(torch::jit::Graph *graph,
                                   torch::jit::Value *outputs,
                                   torch::jit::Value *inputs,
                                   std::int64_t trip_count);

// Mark the start of the "then" block of an if statement on `condition`.
torch::jit::Node *createStartIfBlock(torch::jit::Graph *graph,
                                     torch::jit::Value *condition);

// Mark the start of the "else" block; `outputs_then` are the outputs of the
// "then" block.
torch::jit::Node *createStartElseBlock(torch::jit::Graph *graph,
                                       torch::jit::Value *outputs_then);

// Mark the end of an if statement; `outputs_else` are the outputs of the
// "else" block.
torch::jit::Node *createEndIfBlock(torch::jit::Graph *graph,
                                   torch::jit::Value *outputs_else,
                                   torch::jit::Value *condition);

// Mark `input` as an (untyped) input tensor.
torch::jit::Node *createAddUntypedInputTensor(torch::jit::Graph *graph,
                                              torch::jit::Value *input);

// Create an add output to mark a node of being an output of a subgraph.
torch::jit::Node *createAddOutputTensor(torch::jit::Graph *graph,
                                        torch::jit::Value *output);

// Wrap the integers in `data` in a constant and return the resulting value.
torch::jit::Value *wrapInConstantVec(torch::jit::Graph *graph,
                                     const std::vector<int64_t> &data);

// Type of the first element of the parameter pack `Elms`.
template <typename... Elms>
using FirstElmType = typename std::tuple_element<0, std::tuple<Elms...>>::type;

// Variadic convenience wrapper around wrapInConstantVec(): the `values` are
// gathered in a std::vector<int64_t> which is then wrapped in a constant.
// Only enabled when the type of the first value is integral (the other
// values are not checked by the enable_if, but they still have to be usable
// to brace-initialise a std::vector<int64_t>).
template <
    typename... Ints,
    std::enable_if_t<std::is_integral<FirstElmType<Ints...>>::value, int> = 0>
torch::jit::Value *wrapInConstant1D(torch::jit::Graph *graph, Ints... values) {
  std::vector<int64_t> data{std::forward<Ints>(values)...};
  return wrapInConstantVec(graph, data);
}

// Functor creating a cast to the PyTorch scalar type matching the C++ type T.
// The primary template is empty: only the specialisations below (float,
// std::int32_t and std::int64_t) can be called.
template <typename T> struct CreateCast {};

// float -> c10::kFloat
template <> struct CreateCast<float> {
  torch::jit::Node *operator()(torch::jit::Graph *graph,
                               torch::jit::Value *value) const {
    return createCast(graph, value, c10::kFloat);
  }
};

// std::int32_t -> c10::kInt
template <> struct CreateCast<std::int32_t> {
  torch::jit::Node *operator()(torch::jit::Graph *graph,
                               torch::jit::Value *value) const {
    return createCast(graph, value, c10::kInt);
  }
};

// std::int64_t -> c10::kLong
template <> struct CreateCast<std::int64_t> {
  torch::jit::Node *operator()(torch::jit::Graph *graph,
                               torch::jit::Value *value) const {
    return createCast(graph, value, c10::kLong);
  }
};

// Create a node casting `value` to the scalar type matching T.
// T must be one of the types CreateCast is specialised for, otherwise this
// does not compile.
template <typename T>
torch::jit::Node *castToType(torch::jit::Graph *graph,
                             torch::jit::Value *value) {
  return CreateCast<T>{}(graph, value);
}

// Create a node associating the parameters in `list_of_params` with the
// optimizer group number `group`.
torch::jit::Node *
createOptimizerGroup(torch::jit::Graph *graph, std::uint64_t group,
                     const std::vector<torch::jit::Value *> &list_of_params);

// Create a recomputation checkpoint node for `value`.
torch::jit::Node *createRecomputationCheckpoint(torch::jit::Graph *graph,
                                                torch::jit::Value *value);

// Create the node(s) implementing an unfold of `value` along `dimension`
// with the given window `size` and `step`.
torch::jit::Node *createUnfold(torch::jit::Graph *graph,
                               torch::jit::Value *value, int64_t dimension,
                               int64_t size, int64_t step);

// Create a node generating a tensor of the given `shape` sampled from a
// normal distribution with the given `mean` and `scale`.
// NOTE(review): `possible_inputs` and `dataType` are presumably used to
// determine the type of the output (Undefined meaning "not specified") -
// confirm in the implementation.
torch::jit::Node *
createRandomNormal(torch::jit::Graph *graph,
                   const std::vector<torch::jit::Value *> &possible_inputs,
                   const std::vector<int64_t> &shape, float mean, float scale,
                   at::ScalarType dataType = at::ScalarType::Undefined);

// Create a node generating a tensor of the given `shape` sampled from a
// uniform distribution.
// Beware of the order of the bounds: `high` comes before `low`.
// NOTE(review): `possible_input` and `dataType` are presumably used to
// determine the type of the output - confirm in the implementation.
torch::jit::Node *
createRandomUniform(torch::jit::Graph *graph, torch::jit::Value *possible_input,
                    const std::vector<int64_t> &shape, float high, float low,
                    at::ScalarType dataType = at::ScalarType::Undefined);

// NOTE(review): the one-line descriptions in this group are derived from the
// names and parameters of the declarations only - the exact behaviour is
// defined by the implementation.

// Create a node printing the tensor `value`, labelled with `title`.
torch::jit::Node *createPrintIpuTensor(torch::jit::Graph *graph,
                                       torch::jit::Value *value,
                                       const std::string &title);

// Create a node calling the CPU op identified by `id` with the inputs
// `value`.
torch::jit::Node *createCallCpuOp(torch::jit::Graph *graph,
                                  const std::vector<torch::jit::Value *> &value,
                                  const std::string &id,
                                  torch::jit::Node *node);

// Create a node setting the available memory `proportion` for the op
// producing `value`.
torch::jit::Node *createSetAvailableMemory(torch::jit::Graph *graph,
                                           torch::jit::Value *value,
                                           float proportion);

// Create a node setting the `key` of `attribute` to `value`.
// By default the node is not inserted after the insertion point
// (`insert_after_insertion_pnt` is false).
torch::jit::Node *createSetAttribute(torch::jit::Graph *graph,
                                     const std::string &attribute,
                                     const std::string &key,
                                     const std::string &value,
                                     bool insert_after_insertion_pnt = false);

// Create a node clearing the `key` of `attribute`.
torch::jit::Node *createClearAttribute(torch::jit::Graph *graph,
                                       const std::string &attribute,
                                       const std::string &key,
                                       bool insert_after_insertion_pnt = false);

// Create a node configuring the serialization (`mode`, `factor`,
// `keep_precision`) of the given `matmul`.
torch::jit::Node *createSetMatMulSerialization(torch::jit::Graph *graph,
                                               torch::jit::Value *matmul,
                                               const std::string &mode,
                                               int64_t factor,
                                               bool keep_precision);

// Create a node marking the beginning of a block assigned to the given
// pipeline `stage`, execution `phase` and `ipu`.
torch::jit::Node *createBeginIpuBlock(torch::jit::Graph *graph,
                                      std::uint64_t stage, std::int64_t phase,
                                      std::int64_t ipu);

// Create a multi-convolution part from the convolution node `conv_node`.
torch::jit::Node *createMultiConvPart(torch::jit::Graph *graph,
                                      torch::jit::Node *conv_node);

// Create a GRU node with the given `hidden_size`.
torch::jit::Node *createGru(torch::jit::Graph *graph,
                            const std::vector<torch::jit::Value *> &args,
                            int64_t hidden_size);

// Create an RNN node using the given `activations`.
torch::jit::Node *createRnn(torch::jit::Graph *graph,
                            const std::vector<torch::jit::Value *> &args,
                            const std::vector<std::string> &activations);

// Create a PReLU node applied to `self` with the given `weight`.
torch::jit::Node *createPrelu(torch::jit::Graph *graph, torch::jit::Value *self,
                              torch::jit::Value *weight);

// Autogenerated.
#include "poptorch/CompilerOps.inc.hpp"

} // namespace poptorch

#endif // INCLUDE_POPTORCH_OP_BUILDER_HPP


================================================
FILE: poptorch/source/include/poptorch/OverlappedIO.hpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_OVERLAPPED_IO_H
#define INCLUDE_POPTORCH_OVERLAPPED_IO_H

namespace torch {
namespace jit {
struct Graph;

} // namespace jit
} // namespace torch

namespace poptorch {

// Turns any set_overlap_for_input nodes applied to inputs into attributes of
// the parameter node. These attributes specify the host IO overlap, if any,
// requested for the input.
void attributiseOverlappedIO(torch::jit::Graph *graph);

} // namespace poptorch

#endif // INCLUDE_POPTORCH_OVERLAPPED_IO_H


================================================
FILE: poptorch/source/include/poptorch/PopartCanonicalization.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_TRANSFORM_ATEN_TO_POPART_HPP_
#define INCLUDE_POPTORCH_TRANSFORM_ATEN_TO_POPART_HPP_

#include <string>
#include <unordered_map>
#include <vector>

namespace torch {
namespace jit {
struct Graph;
struct Node;
} // namespace jit
} // namespace torch

namespace at {
class Tensor;
} // namespace at

namespace poptorch {
/*
   The first canonicalization pass cleans up the PyTorch IR to use PopART
   specific operations and will remove all others. Constants will be folded into
   the attributes of the ops themselves.
*/
void canonicalize(torch::jit::Graph *graph);

/*
 * The second late canonicalization pass will take the PopART code and will
 * enforce any constraints that aren't fixed by PopART itself.
 */
void canonicalizeLate(torch::jit::Graph *graph);

/*
 * Error if any ATen ops remain in the graph after we have run canonicalisation
 * so the user can report exactly what operation we are missing.
 */
void errorOnUnsupportedAten(torch::jit::Graph *graph);

// NOTE(review): presumably annotates the nodes belonging to subgraphs (e.g.
// loop or if bodies), starting from `start_node` - confirm in the
// implementation.
void annotateSubgraphs(torch::jit::Graph *graph, torch::jit::Node *start_node);

// NOTE(review): presumably removes the identity loss nodes which are not
// needed from the graph - confirm in the implementation.
void removeSurplusIdentityLosses(torch::jit::Graph *graph);

// Clean up the graph if it is using CPU offloading.
void cpuOffloadingCleanup(torch::jit::Graph *graph);

// Handle the 'requires_grad=False' flag on tensors.
void addDetachOperations(torch::jit::Graph *graph);

// PopART's scatterreduce op allows a non-expanded index to be passed in. It is
// essentially a fused and more efficient version of the expand + scatterreduce.
// This pass identifies all of the valid optimization cases and removes the
// explicit and sub-optimal index expansion before the scatter_add ops.
void removeScatterAddIndexExpansion(torch::jit::Graph *graph);

// Combine scatter reduce and gather operations, where possible, so that they
// are executed as a grouped version.
void groupScatterReduceAndGatherNodes(torch::jit::Graph *graph);

// PyTorch's `gather` works differently to PopART's (aka. PyTorch's
// `index_select`), but in certain cases when the indices tensor has been
// passed through an `expand`, they're equivalent (if the non-expanded indices
// are used). Swapping out the handling saves some ops, but is also more
// efficient if the expanded indices tensor is just a long series of slices.
void simplifyGatherWithExpandedIndices(torch::jit::Graph *graph);

// Adds the op as the possible true input op to set_available_memory if it is
// of a valid kind.
// Some ops are composed of multiple ops, and their return values might not be
// an op that accepts set_available_memory.
void setAvailableMemoryAddPossibleInputOp(torch::jit::Node *node);

// Ensure that the input to the given set_available_memory op is the one that
// supports set_available_memory; if it's not, move it to the right place.
void moveSetAvailableMemoryIfRequired(torch::jit::Node *node);
} // namespace poptorch

#endif // INCLUDE_POPTORCH_TRANSFORM_ATEN_TO_POPART_HPP_


================================================
FILE: poptorch/source/include/poptorch/RequiresGrad.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_REQUIRES_GRAD_H
#define INCLUDE_POPTORCH_REQUIRES_GRAD_H

namespace torch {
namespace jit {
struct Graph;
} // namespace jit
} // namespace torch

namespace poptorch {

// Autograd sets the requires_grad flag on the ATen tensors
// after we've instantiated the corresponding ATen node in the dispatcher.
// This pass goes through all the nodes in the ATen graph and sets the
// requires_grad flag on a node's outputs if any of its inputs has
// requires_grad set.
void fixRequiresGradFromDispatch(torch::jit::Graph *graph);

} // namespace poptorch

#endif // INCLUDE_POPTORCH_REQUIRES_GRAD_H


================================================
FILE: poptorch/source/include/poptorch/TypeAndConstantCanonicalization.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_TYPE_AND_CONSTANT_CANONICALIZATION_H
#define INCLUDE_POPTORCH_TYPE_AND_CONSTANT_CANONICALIZATION_H

#include <sstream>
#include <string>
#include <vector>

namespace at {
class Tensor;
} // namespace at

namespace c10 {
struct Symbol;
} // namespace c10

namespace torch {
namespace jit {
struct Graph;
struct Node;
} // namespace jit
} // namespace torch

namespace poptorch {
namespace type_and_constant_canonicalization {

// Add the number of elements of the list to the type by replacing it with
// ListTypeWithNumElements instances. The PyTorch ListType does not contain
// the number of elements. If revert is "true", reverts all such types to the
// original ListType.
void addListNumElements(torch::jit::Graph *graph, bool revert = false);

// NOTE(review): presumably evaluates the constant expressions of the graph
// ahead of time and replaces them with constants - confirm in the
// implementation.
void evaluateConstexprs(torch::jit::Graph *graph);

// Turn non-floating point parameters into constants as these are not supported
// in popart. The pass also removes the affected graph inputs and modifies
// 'parameter_names' and 'traced_parameter_tensors' accordingly (both are
// in/out arguments).
void makeConstantIntParams(torch::jit::Graph *graph,
                           std::vector<std::string> &parameter_names,
                           std::vector<at::Tensor> &traced_parameter_tensors);

// Change the graph to add a poptorch::host_side_cast node after every graph
// input whose type is unsupported (Long, Double, BFloat16) to reflect the
// casting which would happen on the host and the correct types as they
// would be on the graph.
void castUnsupportedInputs(torch::jit::Graph *graph);

// Change any unsupported output types to the appropriate equivalent (e.g.
// double to float) and warn; error on any totally unsupported types e.g. 8 bit.
void checkAndChangeOutputTypes(torch::jit::Graph *graph);

// Changes all constants used in implicit casting operations into tensor
// constants (poptorch::tensor_constant) of the correct type.
void canonicaliseConstants(torch::jit::Graph *graph);

} // namespace type_and_constant_canonicalization
} // namespace poptorch

#endif // INCLUDE_POPTORCH_TYPE_AND_CONSTANT_CANONICALIZATION_H


================================================
FILE: poptorch/source/popart_canonicalization/ActivationOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <spdlog/fmt/fmt.h>
#include <spdlog/fmt/ostr.h>

#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace {

// Canonicalize aten::glu: the input is split in two halves along `dim` and the
// first half is multiplied by the sigmoid of the second half.
// Raises an error if `dim` is out of range or if the size of the dimension to
// halve is odd.
torch::jit::Node *gluHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  // "aten::glu(Tensor self, int dim) -> Tensor"
  // The input IR before canonicalization:
  // %3 : Float(2:96, 4:24, 6:4, 4:1) = aten::glu(%input, %4)

  // The output IR after canonicalization. It takes 3 steps.
  // 1. split the input into two halves
  // %5 : FloatTensor, %6 : FloatTensor = popart::split[num_outputs=2, axis=3,
  // split=[4, 4]](%input)
  // 2. sigmoid the 2nd half
  // %7 : FloatTensor = popart::sigmoid(%6)
  // 3. multiply the 1st half and the sigmoid result
  // %8 : Float(2:96, 4:24, 6:4, 4:1) = popart::mul(%5, %7)

  // Input
  torch::jit::Value *input = node->input(0);
  std::int64_t axis = constantToLong(node->input(1)->node());
  const std::vector<std::int64_t> shape_input = shapeFromTensor(input);
  const std::int64_t size = shape_input.size();

  // handle python's negative indices
  if (axis < 0) {
    axis += size;
  }

  ERROR_ON_MSG(axis < 0 || axis >= size,
               "The second input argument of glu is not in the legal range");

  // The separating spaces are needed for the message to be readable once the
  // axis has been streamed in.
  ERROR_ON_MSG(shape_input[axis] % 2,
               "Halving dimension " << axis << " must be even");

  // Keep the 64-bit width of the shape: the split sizes are int64_t, so
  // there is no reason to go through a narrower unsigned type.
  const std::int64_t half_size = shape_input[axis] / 2;

  const std::vector<std::int64_t> split_sizes = {half_size, half_size};

  torch::jit::Node *split = createSplit(graph, {input}, 2, axis, split_sizes);
  torch::jit::Node *sigmoid = createSigmoid(graph, {split->output(1)});

  return createMul(graph, {split->output(0), sigmoid->output()});
}

// Canonicalize aten::rrelu and aten::rrelu_with_noise (the noise tensor of
// the latter is ignored, with a warning).
torch::jit::Node *rreluHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  // clang-format off
  // aten::rrelu(Tensor self, Scalar lower=0.125,
  //             Scalar upper=0.3333333333333333,
  //             bool training=False, Generator? generator=None) -> Tensor
  // aten::rrelu_with_noise(Tensor self, Tensor noise,
  //                        Scalar lower, Scalar upper,
  //                        bool training, Generator? generator) -> Tensor
  //
  // training: rrelu(x)  = x if x >= 0
  //                     = a * x if x < 0, where a uniformly random value
  //                                       from [lower, upper]
  // inference: rrelu(x) = x if x >= 0
  //                     = x * ((lower + upper) / 2)
  // clang-format on
  torch::jit::Value *self = node->input(0);

  // The scalar arguments come right after `self`, or after the noise tensor
  // when there is one.
  const bool has_noise = node->kind() == c10::aten::rrelu_with_noise;
  if (has_noise) {
    logging::warn("Noise parameter not supported for aten::rrelu_with_noise");
  }
  const int64_t lower_idx = has_noise ? 2 : 1;

  const float lower = constantToFloat(node->input(lower_idx)->node());
  const float upper = constantToFloat(node->input(lower_idx + 1)->node());
  const bool is_training = constantToBool(node->input(lower_idx + 2)->node());

  // Slope applied to the negative part of the input.
  torch::jit::Value *slope = nullptr;
  if (is_training) {
    slope =
        createRandomUniform(graph, self, shapeFromTensor(self), upper, lower)
            ->output();
  } else {
    slope = createConstantFloatLike(graph, self, {(lower + upper) / 2}, {})
                ->output();
  }

  torch::jit::Value *zero =
      createConstantFloatLike(graph, self, {0}, {})->output();
  torch::jit::Value *is_negative = createLess(graph, {self, zero})->output();
  torch::jit::Value *scaled = createMul(graph, {self, slope})->output();
  // where(x < 0, slope * x, x)
  return createWhere(graph, {is_negative, scaled, self});
}

// Canonicalize aten::softplus to the "TorchSoftplus" custom operation.
// beta (input 1) and threshold (input 2) are passed to the custom op in a
// JSON formatted string and the output is given the scalar type of the
// input.
torch::jit::Node *softplusHandler(torch::jit::Graph *graph,
                                  torch::jit::Node *node) {
  torch::jit::Value *self = node->input(0);
  const auto scalar_type = getNodeScalarType(self);
  const auto beta = constantToFloat(node->input(1)->node());
  const auto threshold = constantToFloat(node->input(2)->node());

  const auto attributes =
      fmt::format("{{\"beta\":{},\"threshold\":{}}}", beta, threshold);

  torch::jit::Node *softplus = createCustomOperation(
      graph, {self}, "TorchSoftplus", "poptorch.custom_ops", 1, 1, attributes);

  // Only the scalar type of the output is specified.
  const auto output_type = c10::TensorType::create(
      scalar_type, c10::nullopt, c10::nullopt, c10::nullopt);
  softplus->output(0)->setType(output_type);
  return softplus;
}

// Canonicalize aten::hardsigmoid to a hardsigmoid node called with the
// constants 1/6 and 0.5, i.e. hardsigmoid(x, 1/6, 0.5).
torch::jit::Node *hardsigmoidHandler(torch::jit::Graph *graph,
                                     torch::jit::Node *node) {
  return createHardsigmoid(graph, {node->input(0)}, 1.0 / 6.0, 0.5);
}

// Canonicalize aten::hardswish as:
//   where(greater(abs(x), 3), max(x, 0), (x + 3) * x / 6.0)
torch::jit::Node *hardswishHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  torch::jit::Value *self = node->input(0);

  // Saturated part, used where |x| > 3: max(x, 0)
  torch::jit::Value *zero =
      createConstantFloatLike(graph, self, {0.0}, {})->output();
  torch::jit::Value *saturated = createMax(graph, {self, zero})->output();

  // Condition: |x| > 3
  torch::jit::Value *magnitude = createAbs(graph, {self})->output();
  torch::jit::Value *three =
      createConstantFloatLike(graph, self, {3.0}, {})->output();
  torch::jit::Value *is_saturated =
      createGreater(graph, {magnitude, three})->output();

  // Quadratic part, used everywhere else: x * (x + 3) / 6
  torch::jit::Value *shifted = createAdd(graph, {self, three})->output();
  torch::jit::Value *product = createMul(graph, {self, shifted})->output();
  torch::jit::Value *six =
      createConstantFloatLike(graph, self, {6.0}, {})->output();
  torch::jit::Value *quadratic = createDiv(graph, {product, six})->output();

  return createWhere(graph, {is_saturated, saturated, quadratic});
}

// Canonicalize aten::prelu / aten::_prelu_kernel: input 0 is the tensor the
// activation is applied to and input 1 the weight.
torch::jit::Node *preluHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  return createPrelu(graph, node->input(0), node->input(1));
}

// Canonicalize aten::gelu. Input 1 is the name of the approximation: only
// "tanh" and "none" are accepted, anything else is an error.
torch::jit::Node *geluHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *self = node->input(0);
  const auto approximate = constantToString(node->input(1)->node());

  const bool is_tanh = approximate == "tanh";
  const bool is_none = approximate == "none";
  if (!is_tanh && !is_none) {
    ERROR("Unknown GELU approximate '" << approximate << "'");
  }

  // Both approximations currently map to the same op.
  // TODO: use createGeluErf for "none" when it will reach sufficient
  // performance
  return createGelu(graph, {self});
}

// Canonicalize aten::mish.
//
// mish(x) = x * tanh(softplus(x)), which is computed here as
//   x * (1 - s^2) / (1 + s^2)   with s = sigmoid(-x)
// (the two forms are equal since s = 1 / (1 + exp(x))).
torch::jit::Node *mishHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  auto *src = node->input(0);
  auto *const neg_src = createNeg(graph, {src});
  auto *const sigm = createSigmoid(graph, {neg_src->output()});
  // s^2
  auto *const mul = createMul(graph, {sigm->output(), sigm->output()});

  // Constant tensor of ones with the same shape as s^2.
  const auto shape = shapeFromTensor(mul->output());
  // The initial value fixes the type std::accumulate uses for the running
  // product: it has to be a size_t (a plain `1` would make it an int, and
  // the element count of a large tensor would be truncated).
  const size_t size = std::accumulate(shape.cbegin(), shape.cend(), size_t{1},
                                      std::multiplies<size_t>());
  const std::vector<double> ones_vec(size, 1.0);
  auto *const one = createConstantFloat32(graph, ones_vec, shape);

  auto *const one_minus_mul = createSub(graph, {one->output(), mul->output()});
  auto *const one_plus_mul = createAdd(graph, {one->output(), mul->output()});
  auto *const div =
      createDiv(graph, {one_minus_mul->output(), one_plus_mul->output()});
  return createMul(graph, {src, div->output()});
}

} // namespace

// Register the activation handlers defined in this file for the ATen symbols
// they canonicalize.
// Declared as a constructor function with priority HANDLER_INIT_PRIORITY, so
// it runs automatically when the library is loaded; it is not meant to be
// called explicitly.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::glu, gluHandler);
  // Both rrelu variants share the same handler.
  registerHandler(c10::aten::rrelu, rreluHandler);
  registerHandler(c10::aten::rrelu_with_noise, rreluHandler);
  registerHandler(c10::aten::softplus, softplusHandler);
  registerHandler(c10::aten::hardsigmoid, hardsigmoidHandler);
  registerHandler(c10::aten::hardswish, hardswishHandler);
  // Both prelu symbols share the same handler.
  registerHandler(c10::aten::prelu, preluHandler);
  registerHandler(c10::aten::_prelu_kernel, preluHandler);
  registerHandler(c10::aten::gelu, geluHandler);
  registerHandler(c10::aten::mish, mishHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/ArithmeticOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <ATen/core/jit_type.h>
#include <c10/core/ScalarType.h>

#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "../PoptorchSymbols.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace {

// Handles addition with an optional scaling of the second operand:
//   aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor
//   aten::add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) ->
//   (Tensor(a!))
torch::jit::Node *addHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const lhs = node->input(0);
  torch::jit::Value *const rhs = node->input(1);
  torch::jit::Value *const alpha = node->input(2);

  if (allInputsBool(node, 2)) {
    // Boolean operands: the sum is emitted as a logical OR, which is only
    // accepted when alpha has unity value.
    ERROR_ON(!hasUnityValue(alpha));
    return createLogical_or(graph, {lhs, rhs});
  }

  // Regular addition: skip the multiplication when alpha is one.
  if (hasUnityValue(alpha)) {
    return createAdd(graph, {lhs, rhs});
  }
  torch::jit::Node *const scaled_rhs = createMul(graph, {alpha, rhs});
  return createAdd(graph, {lhs, scaled_rhs->output()});
}

// trunc(x): discard the fractional part by casting to int and then back to
// the scalar type of the input.
torch::jit::Node *truncHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const value = node->input();
  torch::jit::Node *const as_int = createCast(graph, value, c10::kInt);

  const auto original_type =
      *value->type()->expect<c10::TensorType>()->scalarType();
  return createCast(graph, as_int->output(), original_type);
}

// frac(x) = x - trunc(x), where trunc is implemented as a round trip through
// an int cast (which discards the fractional part).
torch::jit::Node *fracHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const value = node->input(0);

  torch::jit::Node *const as_int = createCast(graph, value, c10::kInt);
  const auto original_type =
      *value->type()->expect<c10::TensorType>()->scalarType();
  torch::jit::Node *const truncated =
      createCast(graph, as_int->output(), original_type);

  return createSub(graph, {value, truncated->output()});
}

// aten::floor_divide(Tensor x, Tensor y) -> Tensor
// Emitted as floor(x / y), with floor rounding towards -inf.
torch::jit::Node *floorDivideHandler(torch::jit::Graph *graph,
                                     torch::jit::Node *node) {
  torch::jit::Value *const numerator = node->input(0);
  torch::jit::Value *const denominator = node->input(1);

  torch::jit::Node *const ratio = createDiv(graph, {numerator, denominator});
  return createFloor(graph, {ratio->output()});
}

// aten::mul(Tensor self, Tensor other) -> Tensor
// Boolean operands are multiplied with a logical AND; everything else uses
// a regular multiplication.
torch::jit::Node *mulHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const lhs = node->input(0);
  torch::jit::Value *const rhs = node->input(1);

  return allInputsBool(node) ? createLogical_and(graph, {lhs, rhs})
                             : createMul(graph, {lhs, rhs});
}

// aten::true_divide(Tensor x, Tensor y) -> Tensor
// Both operands are cast to float before dividing:
// true_divide(x, y) = (float)x / (float)y
torch::jit::Node *trueDivideHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  torch::jit::Node *const numerator =
      createCast(graph, node->input(0), c10::kFloat);
  torch::jit::Node *const denominator =
      createCast(graph, node->input(1), c10::kFloat);

  return createDiv(graph, {numerator->output(), denominator->output()});
}

// clamp(x, min, max) built as min(max(x, min), max).
// PopART clip is not used because it doesn't support integers. Either bound
// may be None, in which case the matching step is skipped.
torch::jit::Node *clampHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const input = node->input(0);
  torch::jit::Value *lower = node->input(1);
  torch::jit::Value *upper = node->input(2);

  const c10::ScalarType input_type = getNodeScalarType(input);
  const bool has_lower = !isNone(lower->node());
  const bool has_upper = !isNone(upper->node());

  // For non-int inputs, bring the provided bounds to the input's type.
  if (input_type != c10::kInt) {
    if (has_lower && getNodeScalarType(lower) != input_type) {
      lower = createCast(graph, lower, input_type)->output();
    }
    if (has_upper && getNodeScalarType(upper) != input_type) {
      upper = createCast(graph, upper, input_type)->output();
    }
  }

  // Start from the node producing the input and apply each available bound.
  torch::jit::Node *result = input->node();
  if (has_lower) {
    result = createMax(graph, {input, lower});
  }
  if (has_upper) {
    result = createMin(graph, {result->output(), upper});
  }
  return result;
}

// clamp_min(x, min): forwarded to the aten::clamp handler with a None upper
// bound.
torch::jit::Node *clampMinHandler(torch::jit::Graph *graph,
                                  torch::jit::Node *node) {
  torch::jit::Value *const no_upper_bound = graph->createNone()->output();
  torch::jit::Value *const value = node->input(0);
  torch::jit::Value *const lower_bound = node->input(1);

  auto clamp = getHandler(c10::aten::clamp);
  return createHandlerOperation(graph, clamp,
                                {value, lower_bound, no_upper_bound});
}

// clamp_max(x, max): forwarded to the aten::clamp handler with a None lower
// bound.
torch::jit::Node *clampMaxHandler(torch::jit::Graph *graph,
                                  torch::jit::Node *node) {
  torch::jit::Value *const no_lower_bound = graph->createNone()->output();
  torch::jit::Value *const value = node->input(0);
  torch::jit::Value *const upper_bound = node->input(1);

  auto clamp = getHandler(c10::aten::clamp);
  return createHandlerOperation(graph, clamp,
                                {value, no_lower_bound, upper_bound});
}

// aten::addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar
// value=1, Tensor(a!) out) -> Tensor(a!)
// Emitted as self + value * (tensor1 / tensor2), with value read as a
// constant.
torch::jit::Node *addCDivHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  torch::jit::Node *const quotient =
      createDiv(graph, {node->input(1), node->input(2)});
  const auto value = constantToFloat(node->input(3)->node());
  torch::jit::Node *const scaled_quotient =
      createScale(graph, {quotient->output()}, value);
  return createAdd(graph, {node->input(0), scaled_quotient->output()});
}

// aten::addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar
// value=1) -> Tensor
// Emitted as self + value * (tensor1 * tensor2), with value read as a
// constant.
torch::jit::Node *addCMulHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  torch::jit::Node *const product =
      createMul(graph, {node->input(1), node->input(2)});
  const auto value = constantToFloat(node->input(3)->node());
  torch::jit::Node *const scaled_product =
      createScale(graph, {product->output()}, value);
  return createAdd(graph, {node->input(0), scaled_product->output()});
}

// Cross product of two tensors of identical shape along an axis of size 3.
//
// Inputs: (x, y, optional axis). When the axis is None, the first dimension
// of size 3 is used. A negative axis counts from the last dimension.
// Raises an error if the shapes differ, if the axis is out of range, or if
// the selected axis does not have dimension 3.
torch::jit::Node *crossHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *y = node->input(1);
  auto *opt_axis = node->input(2)->node();

  auto x_shape = shapeFromTensor(x);
  auto y_shape = shapeFromTensor(y);

  ERROR_ON_MSG(x_shape.size() != y_shape.size(),
               "Cross product tensors must have same rank");
  for (unsigned i = 0; i < x_shape.size(); ++i) {
    ERROR_ON_MSG(x_shape[i] != y_shape[i],
                 "Cross product tensors must have same shape");
  }

  // Keep the axis signed so that a negative value can be normalised instead
  // of wrapping around to a huge unsigned index.
  const auto rank = static_cast<std::int64_t>(x_shape.size());
  std::int64_t axis = 0;
  if (isNone(opt_axis)) {
    // if unspecified, the axis is the first to have dimension 3
    for (std::int64_t i = 0; i < rank; ++i) {
      if (x_shape[i] == 3) {
        axis = i;
        break;
      }
    }
  } else {
    axis = constantToInt(opt_axis);
    if (axis < 0) {
      axis += rank;
    }
  }
  // Guards the shape lookup below (also covers rank-0 inputs).
  ERROR_ON_MSG(axis < 0 || axis >= rank,
               "Cross product axis is out of range for the input tensors");
  ERROR_ON_MSG(x_shape[axis] != 3,
               "Cross product product axis must have dimension 3");

  auto *indices = createConstantInt(graph, {2, 0, 1}, {3})->output();

  // circular permutation right by 1 along the axis
  auto *x_roll = createGather(graph, {x, indices}, axis)->output();
  auto *y_roll = createGather(graph, {y, indices}, axis)->output();

  // products of one straight input with the other input permuted
  auto *mul_x_y_roll = createMul(graph, {x, y_roll})->output();
  auto *mul_y_x_roll = createMul(graph, {y, x_roll})->output();

  // subtraction produces result permuted one position left
  auto *result_roll = createSub(graph, {mul_y_x_roll, mul_x_y_roll})->output();

  // permute to compute final result
  return createGather(graph, {result_roll, indices}, axis);
}

// Builds the ops computing the variance and the mean of inputs[0].
//
// Accepted argument lists (inputs):
//   (Tensor input, bool unbiased)                         reduce over all dims
//   (Tensor input, int[] dim, bool unbiased, bool keepdim)
// In the 4-argument form, a dim that is not a prim::ListConstruct means
// "reduce over all dims".
//
// op_name is only used to build the error message raised for any other
// number of arguments.
//
// Returns {variance, mean}. Both honour keepdim: the reduced dimensions are
// kept as singleton dimensions only when keepdim is true.
std::pair<torch::jit::Value *, torch::jit::Value *>
calculateVarMean(torch::jit::Graph *graph,
                 const c10::ArrayRef<torch::jit::Value *> &inputs,
                 const std::string &op_name) {
  auto *x = inputs[0];
  auto shape = shapeFromTensor(x);
  std::vector<int64_t> dims;
  // If true, bessel's correction is applied
  bool unbiased = false;
  bool keepdim = false;
  switch (inputs.size()) {
  case 2: {
    // aten::var(Tensor input, bool unbiased)
    dims.resize(shape.size());
    // dims are unspecified so reduce over all
    std::iota(dims.begin(), dims.end(), 0);
    unbiased = constantToBool(inputs[1]->node());
  } break;
  case 4: {
    // aten::var(Tensor input, int[] dim, bool unbiased, bool keepdim)
    // or torch.var.correction(Tensor input, int[]? dim, *, bool unbiased, bool
    // keepdim) from the compiler
    if (inputs[1]->node()->kind() == c10::prim::ListConstruct) {
      dims = constantToLongVec(inputs[1]->node());
    } else {
      dims.resize(shape.size());
      // dims are unspecified so reduce over all
      std::iota(dims.begin(), dims.end(), 0);
    }
    unbiased = constantToBool(inputs[2]->node());
    keepdim = constantToBool(inputs[3]->node());
  } break;
  default:
    ERROR("Invalid number of arguments to aten::" << op_name);
  }

  // Keep the reduced dims so we can broadcast for the subtraction
  auto *mean_keepdim = createReducemean(graph, {x}, dims, 1)->output();
  // The mean handed back to the caller must follow keepdim like the variance
  // does: only drop the singleton dims when keepdim is false.
  auto *mean = keepdim
                   ? mean_keepdim
                   : createSqueeze(graph, {mean_keepdim}, dims)->output();
  auto *x_minus_mean = createSub(graph, {x, mean_keepdim})->output();
  auto *x_minus_mean_sqr =
      createMul(graph, {x_minus_mean, x_minus_mean})->output();

  auto *var = createReducemean(graph, {x_minus_mean_sqr}, dims,
                               static_cast<int64_t>(keepdim))
                  ->output();
  if (unbiased) {
    // Apply bessel's correction by multiplying the biased variance by
    // n / (n - 1), where n is the sample size
    std::int64_t numel_reduced = 1;
    for (auto dim : dims) {
      // Negative dims count from the last dimension.
      if (dim < 0) {
        dim += shape.size();
      }
      numel_reduced *= shape[dim];
    }
    const double n = static_cast<double>(numel_reduced);
    auto *unbiased_factor =
        createConstantFloatLike(graph, x, {n / (n - 1)}, {});
    var = createMul(graph, {var, unbiased_factor->output()})->output();
  }
  return {var, mean};
}

// aten::var(Tensor input, bool unbiased)
// aten::var(Tensor input, int[] dim, bool unbiased, bool keepdim)
// Returns the node producing the variance computed by calculateVarMean.
torch::jit::Node *varHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  const auto var_and_mean = calculateVarMean(graph, node->inputs(), "var");
  torch::jit::Value *const variance = var_and_mean.first;
  return variance->node();
}

// aten::var_mean(Tensor input, bool unbiased) -> (Tensor, Tensor)
// aten::var_mean(Tensor input, int[] dim, bool unbiased, bool keepdim)
// -> (Tensor, Tensor)
// Rewires output 0 to the variance and output 1 to the mean, then marks the
// original node for deletion. Returns nullptr.
torch::jit::Node *varMeanHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  const auto result = calculateVarMean(graph, node->inputs(), "var_mean");
  torch::jit::Value *const variance = result.first;
  torch::jit::Value *const mean = result.second;

  replaceOutputUse(node->output(0), variance);
  replaceOutputUse(node->output(1), mean);
  markNodeForDeletion(node);

  return nullptr;
}

// aten::std(Tensor input, bool unbiased)
// aten::std(Tensor input, int[] dim, bool unbiased, bool keepdim)
// Standard deviation as the square root of the variance from
// calculateVarMean.
torch::jit::Node *stdHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const variance =
      calculateVarMean(graph, node->inputs(), "std").first;
  torch::jit::Node *const variance_node = variance->node();
  return createSqrt(graph, {variance_node->output()});
}

// aten::std_mean(Tensor input, bool unbiased) -> (Tensor, Tensor)
// aten::std_mean(Tensor input, int[] dim, bool unbiased, bool keepdim)
// -> (Tensor, Tensor)
// Rewires output 0 to sqrt(variance) and output 1 to the mean, then marks
// the original node for deletion. Returns nullptr.
torch::jit::Node *stdMeanHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  const auto result = calculateVarMean(graph, node->inputs(), "std_mean");
  torch::jit::Value *const variance = result.first;
  torch::jit::Value *const mean = result.second;
  torch::jit::Node *const std_dev = createSqrt(graph, {variance});

  replaceOutputUse(node->output(0), std_dev->output());
  replaceOutputUse(node->output(1), mean);
  markNodeForDeletion(node);

  return nullptr;
}

} // namespace

// Registers the arithmetic handlers defined in this file against their aten
// symbols. The constructor attribute makes this run as a static initialiser
// with priority HANDLER_INIT_PRIORITY.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::add, addHandler);
  registerHandler(c10::aten::trunc, truncHandler);
  registerHandler(c10::aten::frac, fracHandler);
  registerHandler(c10::aten::floor_divide, floorDivideHandler);
  registerHandler(c10::aten::mul, mulHandler);
  registerHandler(c10::aten::true_divide, trueDivideHandler);
  // clamp_min / clamp_max look up the clamp handler via getHandler.
  registerHandler(c10::aten::clamp, clampHandler);
  registerHandler(c10::aten::clamp_min, clampMinHandler);
  registerHandler(c10::aten::clamp_max, clampMaxHandler);
  registerHandler(c10::aten::addcdiv, addCDivHandler);
  registerHandler(c10::aten::addcmul, addCMulHandler);
  // cross and linalg_cross share one handler.
  registerHandler(c10::aten::cross, crossHandler);
  registerHandler(c10::aten::linalg_cross, crossHandler);
  registerHandler(c10::aten::var, varHandler);
  registerHandler(c10::aten::var_mean, varMeanHandler);
  registerHandler(c10::aten::std, stdHandler);
  registerHandler(c10::aten::std_mean, stdMeanHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/AtenHandlers.gen.cpp
================================================
// DO NOT EDIT! Generated by PopAtenHandlers.py
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.

#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"
#include <iostream>
namespace poptorch {

namespace {

// Emits abs(input 0).
torch::jit::Node *absHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createAbs(graph, {operand});
}

// Emits acos(input 0).
torch::jit::Node *acosHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createAcos(graph, {operand});
}

// Emits acosh(input 0).
torch::jit::Node *acoshHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createAcosh(graph, {operand});
}

// Emits gemm(input 1, input 2, input 0, alpha, beta, 0, 0) where alpha is the
// constant held by input 4 and beta the constant held by input 3.
torch::jit::Node *addmmHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const addend = node->input(0);
  torch::jit::Value *const mat_a = node->input(1);
  torch::jit::Value *const mat_b = node->input(2);

  const auto alpha = constantToFloat(node->input(4)->node());
  const auto beta = constantToFloat(node->input(3)->node());

  return createGemm(graph, {mat_a, mat_b, addend}, alpha, beta, 0, 0);
}

// Emits asin(input 0).
torch::jit::Node *asinHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createAsin(graph, {operand});
}

// Emits asinh(input 0).
torch::jit::Node *asinhHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createAsinh(graph, {operand});
}

// Emits atan(input 0).
torch::jit::Node *atanHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createAtan(graph, {operand});
}

// Emits atan2(input 0, input 1).
torch::jit::Node *atan2Handler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const first = node->input(0);
  torch::jit::Value *const second = node->input(1);
  return createAtan2(graph, {first, second});
}

// Emits atanh(input 0).
torch::jit::Node *atanhHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createAtanh(graph, {operand});
}

// Emits concat(TensorList(input 0), axis) where axis is the constant held by
// input 1.
torch::jit::Node *catHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const list_arg = node->input(0);
  torch::jit::Value *const axis_arg = node->input(1);

  auto tensors = handleTensorList(list_arg->node());
  auto axis = constantToLong(axis_arg->node());
  return createConcat(graph, {tensors}, axis);
}

// Emits ceil(input 0).
torch::jit::Node *ceilHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createCeil(graph, {operand});
}

// Emits add(max(x, 0.0), min(0.0, mul(a, expm1(div(x, a))))) with x = input 0
// and a = input 1.
torch::jit::Node *celuHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const value = node->input(0);
  torch::jit::Value *const alpha = node->input(1);

  // a * expm1(x / a)
  auto *scaled_in = createDiv(graph, {value, alpha})->output();
  auto *exp_minus_one = createExpm1(graph, {scaled_in})->output();
  auto *negative_branch = createMul(graph, {alpha, exp_minus_one})->output();

  // A single zero constant is shared by the max and the min.
  auto *zero = createConstantFloatLike(graph, value, {0.0}, {})->output();
  auto *positive_part = createMax(graph, {value, zero})->output();
  auto *negative_part = createMin(graph, {zero, negative_branch})->output();

  return createAdd(graph, {positive_part, negative_part});
}

// Emits constantPad(input 0, pads, fill) where pads is the constant list held
// by input 1 and fill the constant held by input 2.
torch::jit::Node *constantPadNdHandler(torch::jit::Graph *graph,
                                       torch::jit::Node *node) {
  torch::jit::Value *const tensor = node->input(0);
  torch::jit::Value *const pads_arg = node->input(1);
  torch::jit::Value *const fill_arg = node->input(2);

  auto pads = constantToLongVec(pads_arg->node());
  auto fill = constantToFloat(fill_arg->node());
  return createConstantPad(graph, tensor, pads, fill);
}

// Emits cos(input 0).
torch::jit::Node *cosHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createCos(graph, {operand});
}

// Emits cosh(input 0).
torch::jit::Node *coshHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createCosh(graph, {operand});
}

// Emits detach(input 0).
torch::jit::Node *detachHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createDetach(graph, {operand});
}

// Emits div(input 0, input 1).
torch::jit::Node *divHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const numerator = node->input(0);
  torch::jit::Value *const denominator = node->input(1);
  return createDiv(graph, {numerator, denominator});
}

// Emits selu(input 0, c1, c2) where c1 and c2 are the constants held by
// inputs 1 and 2 respectively.
torch::jit::Node *eluHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const value = node->input(0);
  torch::jit::Value *const first_arg = node->input(1);
  torch::jit::Value *const second_arg = node->input(2);

  auto first_constant = constantToFloat(first_arg->node());
  auto second_constant = constantToFloat(second_arg->node());
  return createSelu(graph, {value}, first_constant, second_constant);
}

// Emits equal(input 0, input 1).
torch::jit::Node *eqHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const lhs = node->input(0);
  torch::jit::Value *const rhs = node->input(1);
  return createEqual(graph, {lhs, rhs});
}

// Emits erf(input 0).
torch::jit::Node *erfHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createErf(graph, {operand});
}

// Emits sub(1.0, erf(input 0)).
torch::jit::Node *erfcHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);

  auto *erf_value = createErf(graph, {operand})->output();
  // The constant 1.0 is created "like" the erf result.
  auto *one = createConstantFloatLike(graph, erf_value, {1.0}, {})->output();
  return createSub(graph, {one, erf_value});
}

// Emits exp(input 0).
torch::jit::Node *expHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createExp(graph, {operand});
}

// Emits expm1(input 0).
torch::jit::Node *expm1Handler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createExpm1(graph, {operand});
}

// Emits floor(input 0).
torch::jit::Node *floorHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createFloor(graph, {operand});
}

// Emits fmod(input 0, input 1).
torch::jit::Node *fmodHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const dividend = node->input(0);
  torch::jit::Value *const divisor = node->input(1);
  return createFmod(graph, {dividend, divisor});
}

// Emits logical_or(greater(x, y), equal(x, y)) with x = input 0 and
// y = input 1.
torch::jit::Node *geHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const lhs = node->input(0);
  torch::jit::Value *const rhs = node->input(1);

  auto *is_greater = createGreater(graph, {lhs, rhs})->output();
  auto *is_equal = createEqual(graph, {lhs, rhs})->output();
  return createLogical_or(graph, {is_greater, is_equal});
}

// Emits greater(input 0, input 1).
torch::jit::Node *gtHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const lhs = node->input(0);
  torch::jit::Value *const rhs = node->input(1);
  return createGreater(graph, {lhs, rhs});
}

// Emits where(greater(abs(x), abs(l)), x, 0.0) with x = input 0 and
// l = input 1.
torch::jit::Node *hardshrinkHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  torch::jit::Value *const value = node->input(0);
  torch::jit::Value *const lambda = node->input(1);

  auto *abs_value = createAbs(graph, {value})->output();
  auto *abs_lambda = createAbs(graph, {lambda})->output();
  auto *keep = createGreater(graph, {abs_value, abs_lambda})->output();
  auto *zero = createConstantFloatLike(graph, value, {0.0}, {})->output();

  return createWhere(graph, {keep, value, zero});
}

// Emits clip(input 0, a, b) where a and b are inputs 1 and 2, each passed
// through a CreateCast<float> when its scalar type is not already float.
torch::jit::Node *hardtanhHandler(torch::jit::Graph *graph,
                                  torch::jit::Node *node) {
  torch::jit::Value *const value = node->input(0);
  torch::jit::Value *const raw_low = node->input(1);
  torch::jit::Value *const raw_high = node->input(2);

  const CreateCast<float> to_float;

  torch::jit::Value *low = raw_low;
  if (getNodeScalarType(raw_low) != c10::kFloat) {
    low = to_float(graph, raw_low)->output();
  }

  torch::jit::Value *high = raw_high;
  if (getNodeScalarType(raw_high) != c10::kFloat) {
    high = to_float(graph, raw_high)->output();
  }

  return createClip(graph, {value, low, high});
}

// Emits identityloss(where(equal(y, -1.0), max(0.0, sub(delta, x)),
// where(equal(y, 1.0), x, 0.0)), reduction) with x = input 0, y = input 1,
// delta = input 2 and the reduction taken from the constant held by input 3.
torch::jit::Node *hingeEmbeddingLossHandler(torch::jit::Graph *graph,
                                            torch::jit::Node *node) {
  torch::jit::Value *const value = node->input(0);
  torch::jit::Value *const target = node->input(1);
  torch::jit::Value *const margin = node->input(2);
  torch::jit::Value *const reduction_arg = node->input(3);

  // Branch selected where the target equals -1: max(0, margin - x).
  auto *minus_one =
      createConstantFloatLike(graph, target, {-1.0}, {})->output();
  auto *is_negative = createEqual(graph, {target, minus_one})->output();
  auto *margin_gap = createSub(graph, {margin, value})->output();
  auto *zero = createConstantFloatLike(graph, margin_gap, {0.0}, {})->output();
  auto *negative_loss = createMax(graph, {zero, margin_gap})->output();

  // Branch selected where the target equals 1: x itself, 0 otherwise.
  auto *plus_one = createConstantFloatLike(graph, target, {1.0}, {})->output();
  auto *is_positive = createEqual(graph, {target, plus_one})->output();
  auto *positive_loss =
      createWhere(graph, {is_positive, value, zero})->output();

  auto *loss =
      createWhere(graph, {is_negative, negative_loss, positive_loss})
          ->output();

  auto torch_reduction = constantToLong(reduction_arg->node());
  auto popart_reduction = convertReduceToPopart(torch_reduction);
  return createIdentityloss(graph, {loss}, popart_reduction);
}

// Emits gather(input 0, input 2, axis) where axis is derived from input 1
// through handleDimensionParam using the tensor type of input 0.
torch::jit::Node *indexSelectHandler(torch::jit::Graph *graph,
                                     torch::jit::Node *node) {
  torch::jit::Value *const tensor = node->input(0);
  torch::jit::Value *const dim_arg = node->input(1);
  torch::jit::Value *const indices = node->input(2);

  auto tensor_type = tensor->type()->expect<c10::TensorType>();
  auto axis = handleDimensionParam(dim_arg, tensor_type);
  return createGather(graph, {tensor, indices}, axis);
}

// Emits isnan(input 0).
torch::jit::Node *isnanHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createIsnan(graph, {operand});
}

// Emits identityloss(l1loss(sub(x, y), 1.0, reduction), 2) with x = input 0,
// y = input 1 and the reduction taken from the constant held by input 2.
torch::jit::Node *l1LossHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  torch::jit::Value *const prediction = node->input(0);
  torch::jit::Value *const target = node->input(1);
  torch::jit::Value *const reduction_arg = node->input(2);

  auto *difference = createSub(graph, {prediction, target})->output();

  auto torch_reduction = constantToLong(reduction_arg->node());
  auto popart_reduction = convertReduceToPopart(torch_reduction);
  auto *l1 =
      createL1loss(graph, {difference}, 1.0, popart_reduction)->output();

  return createIdentityloss(graph, {l1}, 2);
}

// Emits logical_or(less(x, y), equal(x, y)) with x = input 0 and y = input 1.
torch::jit::Node *leHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const lhs = node->input(0);
  torch::jit::Value *const rhs = node->input(1);

  auto *is_less = createLess(graph, {lhs, rhs})->output();
  auto *is_equal = createEqual(graph, {lhs, rhs})->output();
  return createLogical_or(graph, {is_less, is_equal});
}

// Emits leakyrelu(input 0, c) where c is the constant held by input 1.
torch::jit::Node *leakyReluHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  torch::jit::Value *const value = node->input(0);
  torch::jit::Value *const slope_arg = node->input(1);

  auto slope = constantToFloat(slope_arg->node());
  return createLeakyrelu(graph, {value}, slope);
}

// Emits log(input 0).
torch::jit::Node *logHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createLog(graph, {operand});
}

// Emits div(log(input 0), 2.302585092994046), i.e. the natural log divided
// by ln(10).
torch::jit::Node *log10Handler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);

  auto *natural_log = createLog(graph, {operand})->output();
  auto *ln_ten =
      createConstantFloatLike(graph, natural_log, {2.302585092994046}, {})
          ->output();
  return createDiv(graph, {natural_log, ln_ten});
}

// Emits log1p(input 0).
torch::jit::Node *log1pHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createLog1p(graph, {operand});
}

// Emits div(log(input 0), 0.6931471805599453), i.e. the natural log divided
// by ln(2).
torch::jit::Node *log2Handler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);

  auto *natural_log = createLog(graph, {operand})->output();
  auto *ln_two =
      createConstantFloatLike(graph, natural_log, {0.6931471805599453}, {})
          ->output();
  return createDiv(graph, {natural_log, ln_two});
}

// Emits log(sigmoid(input 0)).
torch::jit::Node *logSigmoidHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  torch::jit::Node *const sigmoid = createSigmoid(graph, {operand});
  return createLog(graph, {sigmoid->output()});
}

// Emits logical_and(input 0, input 1).
torch::jit::Node *logicalAndHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  torch::jit::Value *const lhs = node->input(0);
  torch::jit::Value *const rhs = node->input(1);
  return createLogical_and(graph, {lhs, rhs});
}

// Emits logical_not(input 0).
torch::jit::Node *logicalNotHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createLogical_not(graph, {operand});
}

// Emits logical_or(input 0, input 1).
torch::jit::Node *logicalOrHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  torch::jit::Value *const lhs = node->input(0);
  torch::jit::Value *const rhs = node->input(1);
  return createLogical_or(graph, {lhs, rhs});
}

// Emits less(input 0, input 1).
torch::jit::Node *ltHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const lhs = node->input(0);
  torch::jit::Value *const rhs = node->input(1);
  return createLess(graph, {lhs, rhs});
}

// Emits identityloss(max(add(mul(neg(y), sub(x1, x2)), margin), 0.0),
// reduction) with x1 = input 0, x2 = input 1, y = input 2, margin = input 3
// and the reduction taken from the constant held by input 4.
torch::jit::Node *marginRankingLossHandler(torch::jit::Graph *graph,
                                           torch::jit::Node *node) {
  torch::jit::Value *const first = node->input(0);
  torch::jit::Value *const second = node->input(1);
  torch::jit::Value *const target = node->input(2);
  torch::jit::Value *const margin = node->input(3);
  torch::jit::Value *const reduction_arg = node->input(4);

  // -y * (x1 - x2) + margin
  auto *neg_target = createNeg(graph, {target})->output();
  auto *difference = createSub(graph, {first, second})->output();
  auto *signed_diff = createMul(graph, {neg_target, difference})->output();
  auto *with_margin = createAdd(graph, {signed_diff, margin})->output();

  // Clamp from below at zero.
  auto *zero = createConstantFloatLike(graph, with_margin, {0.0}, {})->output();
  auto *loss = createMax(graph, {with_margin, zero})->output();

  auto torch_reduction = constantToLong(reduction_arg->node());
  auto popart_reduction = convertReduceToPopart(torch_reduction);
  return createIdentityloss(graph, {loss}, popart_reduction);
}

// Emits where(input 1, input 2, input 0).
torch::jit::Node *maskedFillHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  torch::jit::Value *const original = node->input(0);
  torch::jit::Value *const condition = node->input(1);
  torch::jit::Value *const replacement = node->input(2);
  return createWhere(graph, {condition, replacement, original});
}

// Emits identityloss(mul(sub(x, y), sub(x, y)), reduction) with x = input 0,
// y = input 1 and the reduction taken from the constant held by input 2.
torch::jit::Node *mseLossHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  torch::jit::Value *const prediction = node->input(0);
  torch::jit::Value *const target = node->input(1);
  torch::jit::Value *const reduction_arg = node->input(2);

  // The difference is computed once and multiplied by itself.
  auto *difference = createSub(graph, {prediction, target})->output();
  auto *squared = createMul(graph, {difference, difference})->output();

  auto torch_reduction = constantToLong(reduction_arg->node());
  auto popart_reduction = convertReduceToPopart(torch_reduction);
  return createIdentityloss(graph, {squared}, popart_reduction);
}

// Emits logical_not(equal(input 0, input 1)).
torch::jit::Node *neHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const lhs = node->input(0);
  torch::jit::Value *const rhs = node->input(1);

  torch::jit::Node *const is_equal = createEqual(graph, {lhs, rhs});
  return createLogical_not(graph, {is_equal->output()});
}

// Emits neg(input 0).
torch::jit::Node *negHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createNeg(graph, {operand});
}

// Emits randomNormal(input 0, shape of input 0, c1, c2) where c1 and c2 are
// the constants held by inputs 1 and 2 respectively.
torch::jit::Node *normalInPlaceHandler(torch::jit::Graph *graph,
                                       torch::jit::Node *node) {
  torch::jit::Value *const tensor = node->input(0);
  torch::jit::Value *const first_arg = node->input(1);
  torch::jit::Value *const second_arg = node->input(2);

  auto shape = shapeFromTensor(tensor);
  auto first_constant = constantToFloat(first_arg->node());
  auto second_constant = constantToFloat(second_arg->node());
  return createRandomNormal(graph, {tensor}, shape, first_constant,
                            second_constant);
}

// Emits depthtospace(input 0, c, "CRD") where c is the constant held by
// input 1.
torch::jit::Node *pixelShuffleHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  torch::jit::Value *const tensor = node->input(0);
  torch::jit::Value *const factor_arg = node->input(1);

  auto factor = constantToLong(factor_arg->node());
  return createDepthtospace(graph, {tensor}, factor, "CRD");
}

// Emits pow(input 0, input 1).
torch::jit::Node *powHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const base = node->input(0);
  torch::jit::Value *const exponent = node->input(1);
  return createPow(graph, {base, exponent});
}

// Emits randomUniform(input 0, shape of output 0, 1.0, 0.0, scalar type of
// output 0).
torch::jit::Node *randHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const first_input = node->input(0);
  torch::jit::Value *const result = node->output(0);

  // Shape and scalar type both come from the node's own output.
  auto result_shape = shapeFromTensor(result);
  auto result_type = getNodeScalarType(result);
  return createRandomUniform(graph, first_input, result_shape, 1.0, 0.0,
                             result_type);
}

// Emits randomNormal({}, shape of output 0, 0.0, 1.0, scalar type of
// output 0).
torch::jit::Node *randnHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const result = node->output(0);

  // Shape and scalar type both come from the node's own output.
  auto result_shape = shapeFromTensor(result);
  auto result_type = getNodeScalarType(result);
  return createRandomNormal(graph, {}, result_shape, 0.0, 1.0, result_type);
}

// Emits reciprocal(input 0); an int input is cast to float first.
torch::jit::Node *reciprocalHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  torch::jit::Value *const raw = node->input(0);

  torch::jit::Value *const operand =
      getNodeScalarType(raw) == c10::kInt
          ? createCast(graph, raw, c10::kFloat)->output()
          : raw;

  return createReciprocal(graph, {operand});
}

// Emits reflectionPad(input 0, pads) where pads is the constant list held by
// input 1.
torch::jit::Node *reflectionPad1dHandler(torch::jit::Graph *graph,
                                         torch::jit::Node *node) {
  torch::jit::Value *const tensor = node->input(0);
  torch::jit::Value *const pads_arg = node->input(1);

  auto pads = constantToLongVec(pads_arg->node());
  return createReflectionPad(graph, tensor, pads);
}

// Emits relu(input 0).
torch::jit::Node *reluHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createRelu(graph, {operand});
}

// Emits remainder(input 0, input 1).
torch::jit::Node *remainderHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  torch::jit::Value *const dividend = node->input(0);
  torch::jit::Value *const divisor = node->input(1);
  return createRemainder(graph, {dividend, divisor});
}

// Emits edgePad(input 0, pads) where pads is the constant list held by
// input 1.
torch::jit::Node *replicationPad1dHandler(torch::jit::Graph *graph,
                                          torch::jit::Node *node) {
  torch::jit::Value *const tensor = node->input(0);
  torch::jit::Value *const pads_arg = node->input(1);

  auto pads = constantToLongVec(pads_arg->node());
  return createEdgePad(graph, tensor, pads);
}

// Rounding is emitted as a NearbyInt op on input 0.
torch::jit::Node *roundHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createNearbyInt(graph, {operand});
}

// Emits reciprocal(sqrt(input 0)).
torch::jit::Node *rsqrtHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  torch::jit::Node *const root = createSqrt(graph, {operand});
  return createReciprocal(graph, {root->output()});
}

// Reversed subtraction: emits sub(input 1, input 0).
torch::jit::Node *rsubHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const subtrahend = node->input(0);
  torch::jit::Value *const minuend = node->input(1);
  return createSub(graph, {minuend, subtrahend});
}

// Emits selu(input 0, 1.6732632423543772, 1.0507009873554805).
torch::jit::Node *seluHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createSelu(graph, {operand}, 1.6732632423543772,
                    1.0507009873554805);
}

// Emits sigmoid(input 0).
torch::jit::Node *sigmoidHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createSigmoid(graph, {operand});
}

// Canonicalises aten::sign: emits a sign op on the node's first input.
torch::jit::Node *signHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createSign(graph, {operand});
}

// Canonicalises aten::silu: the activation is emitted as a swish op on the
// node's first input.
torch::jit::Node *siluHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createSwish(graph, {operand});
}

// Canonicalises aten::sin: emits a sin op on the node's first input.
torch::jit::Node *sinHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createSin(graph, {operand});
}

// Canonicalises aten::sinh: emits a sinh op on the node's first input.
torch::jit::Node *sinhHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createSinh(graph, {operand});
}

// Canonicalises aten::smooth_l1_loss.
//
// Inputs: 0 = x, 1 = y, 2 = reduction (constant), 3 = beta.
//
// With d = |x - y| the element-wise loss built here is:
//     d < beta  ->  0.5 * d * d / beta
//     otherwise ->  d - 0.5 * beta
// The result is wrapped in an identity loss carrying the reduction mode
// obtained by converting the constant reduction argument.
torch::jit::Node *smoothL1LossHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  torch::jit::Value *const x = node->input(0);
  torch::jit::Value *const y = node->input(1);
  torch::jit::Value *const reduction_arg = node->input(2);
  torch::jit::Value *const beta = node->input(3);

  // d = abs(x - y)
  torch::jit::Value *const diff = createSub(graph, {x, y})->output();
  torch::jit::Value *const abs_diff = createAbs(graph, {diff})->output();

  // Selects the quadratic branch where beta > d.
  torch::jit::Value *const use_quadratic =
      createGreater(graph, {beta, abs_diff})->output();

  // Quadratic branch: ((0.5 * d) * d) / beta
  torch::jit::Value *const half =
      createConstantFloatLike(graph, abs_diff, {0.5}, {})->output();
  torch::jit::Value *const half_d =
      createMul(graph, {half, abs_diff})->output();
  torch::jit::Value *const half_d_squared =
      createMul(graph, {half_d, abs_diff})->output();
  torch::jit::Value *const quadratic =
      createDiv(graph, {half_d_squared, beta})->output();

  // Linear branch: d - 0.5 * beta
  torch::jit::Value *const half_beta =
      createMul(graph, {half, beta})->output();
  torch::jit::Value *const linear =
      createSub(graph, {abs_diff, half_beta})->output();

  torch::jit::Value *const elementwise_loss =
      createWhere(graph, {use_quadratic, quadratic, linear})->output();

  // Translate the constant reduction argument for the identity loss.
  auto torch_reduction = constantToLong(reduction_arg->node());
  auto popart_reduction = convertReduceToPopart(torch_reduction);

  return createIdentityloss(graph, {elementwise_loss}, popart_reduction);
}

// Canonicalises aten::soft_margin_loss.
//
// Inputs: 0 = x, 1 = y, 2 = reduction (constant).
//
// The element-wise loss is log1p(exp(-y * x)); it is wrapped in an identity
// loss carrying the converted reduction mode.
torch::jit::Node *softMarginLossHandler(torch::jit::Graph *graph,
                                        torch::jit::Node *node) {
  torch::jit::Value *const x = node->input(0);
  torch::jit::Value *const y = node->input(1);
  torch::jit::Value *const reduction_arg = node->input(2);

  torch::jit::Value *const neg_y = createNeg(graph, {y})->output();
  torch::jit::Value *const exponent = createMul(graph, {neg_y, x})->output();
  torch::jit::Value *const exponential =
      createExp(graph, {exponent})->output();
  // log(1 + exp(...)) is emitted as a single log1p op.
  torch::jit::Value *const elementwise_loss =
      createLog1p(graph, {exponential})->output();

  auto torch_reduction = constantToLong(reduction_arg->node());
  auto popart_reduction = convertReduceToPopart(torch_reduction);

  return createIdentityloss(graph, {elementwise_loss}, popart_reduction);
}

// Canonicalises aten::softshrink with input x (input 0) and lambda l
// (input 1):
//     x < -l  ->  x + l
//     x >  l  ->  x - l
//     else    ->  0
// implemented as two nested where ops.
torch::jit::Node *softshrinkHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  torch::jit::Value *const x = node->input(0);
  torch::jit::Value *const lambda = node->input(1);

  // Lower region: x < -lambda selects x + lambda.
  torch::jit::Value *const neg_lambda = createNeg(graph, {lambda})->output();
  torch::jit::Value *const below = createLess(graph, {x, neg_lambda})->output();
  torch::jit::Value *const shifted_up = createAdd(graph, {x, lambda})->output();

  // Upper region: x > lambda selects x - lambda, anything else is zero.
  torch::jit::Value *const above = createGreater(graph, {x, lambda})->output();
  torch::jit::Value *const shifted_down =
      createSub(graph, {x, lambda})->output();
  torch::jit::Value *const zero =
      createConstantFloatLike(graph, shifted_down, {0.0}, {})->output();
  torch::jit::Value *const upper_or_zero =
      createWhere(graph, {above, shifted_down, zero})->output();

  return createWhere(graph, {below, shifted_up, upper_or_zero});
}

// Canonicalises aten::sqrt: emits a sqrt op on the node's first input.
torch::jit::Node *sqrtHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createSqrt(graph, {operand});
}

// Canonicalises aten::square by multiplying the node's first input with
// itself.
torch::jit::Node *squareHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createMul(graph, {operand, operand});
}

// Canonicalises aten::sub: x - a * y, with x, y and a being inputs 0, 1
// and 2.
//
// The product y * a is always created; it is only used as the subtrahend
// when a is not reported as unity by hasUnityValue, otherwise y is
// subtracted directly.
torch::jit::Node *subHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const minuend = node->input(0);
  torch::jit::Value *const other = node->input(1);
  torch::jit::Value *const alpha = node->input(2);

  torch::jit::Value *const scaled_other =
      createMul(graph, {other, alpha})->output();

  torch::jit::Value *subtrahend = scaled_other;
  if (hasUnityValue(alpha)) {
    subtrahend = other;
  }
  return createSub(graph, {minuend, subtrahend});
}

// Canonicalises aten::t: emits a transpose op on the node's first input,
// passing an empty list as the transpose's extra argument.
// NOTE(review): presumably an empty permutation means "reverse the
// dimensions" - confirm against createTranspose.
torch::jit::Node *tHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createTranspose(graph, {operand}, {});
}

// Canonicalises aten::tan: emits a tan op on the node's first input.
torch::jit::Node *tanHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createTan(graph, {operand});
}

// Canonicalises aten::tanh: emits a tanh op on the node's first input.
torch::jit::Node *tanhHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);
  return createTanh(graph, {operand});
}

// Canonicalises aten::threshold: keeps x (input 0) where x > threshold
// (input 1) and substitutes the replacement value (input 2) elsewhere.
torch::jit::Node *thresholdHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  torch::jit::Value *const x = node->input(0);
  torch::jit::Value *const limit = node->input(1);
  torch::jit::Value *const replacement = node->input(2);

  torch::jit::Value *const above_limit =
      createGreater(graph, {x, limit})->output();
  return createWhere(graph, {above_limit, x, replacement});
}

// Canonicalises aten::topk.
//
// Inputs: 0 = tensor, 1 = k (constant), 2 = dimension, 3 = largest flag,
// 4 = sorted flag.
//
// The constant node producing k is rewritten in place so that it holds a
// Long tensor, and its output type is re-inferred from the new value. The
// dimension is resolved against the tensor type of input 0.
torch::jit::Node *topkHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const tensor = node->input(0);
  torch::jit::Value *const dim_arg = node->input(2);

  // In-place cast of the k constant to Long.
  torch::jit::Node *const k_node = node->input(1)->node();
  setNodeTensorAttrValue(
      k_node, getNodeTensorAttrValue(k_node).to(at::ScalarType::Long));
  k_node->output()->inferTypeFrom(getNodeTensorAttrValue(k_node));
  torch::jit::Value *const k = k_node->output();

  auto tensor_type = tensor->type()->expect<c10::TensorType>();
  auto axis = handleDimensionParam(dim_arg, tensor_type);

  const bool largest = constantToBool(node->input(3)->node());
  const bool sorted = constantToBool(node->input(4)->node());

  return createTopk(graph, {tensor, k}, axis, largest, sorted);
}

// Canonicalises aten::sort.
//
// Inputs as read here: 0 = tensor, 1 = stable flag, 2 = dimension,
// 3 = descending flag. The dimension is resolved against the tensor type of
// input 0 before the sort op is created.
torch::jit::Node *sortHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *const tensor = node->input(0);

  // Constant flags.
  const bool stable = constantToBool(node->input(1)->node());
  const bool descending = constantToBool(node->input(3)->node());

  const int64_t axis = handleDimensionParam(
      node->input(2), tensor->type()->expect<c10::TensorType>());

  return createSort(graph, {tensor}, axis, descending, stable);
}

// Canonicalises aten::uniform_: creates a random-uniform op shaped like the
// tensor in input 0.
//
// The two constant float bounds are forwarded with input 2 first and input 1
// second, i.e. in the reverse of their order on the aten node.
torch::jit::Node *uniformInPlaceHandler(torch::jit::Graph *graph,
                                        torch::jit::Node *node) {
  torch::jit::Value *const tensor = node->input(0);
  auto shape = shapeFromTensor(tensor);

  auto bound_from_input2 = constantToFloat(node->input(2)->node());
  auto bound_from_input1 = constantToFloat(node->input(1)->node());

  return createRandomUniform(graph, tensor, shape, bound_from_input2,
                             bound_from_input1);
}

// Canonicalises aten::where: forwards the node's first three inputs, in
// order, to a where op.
torch::jit::Node *whereHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const first = node->input(0);
  torch::jit::Value *const second = node->input(1);
  torch::jit::Value *const third = node->input(2);
  return createWhere(graph, {first, second, third});
}

} // namespace

// Registers every handler defined in this file against its aten symbol.
//
// Declared as a constructor function with priority HANDLER_INIT_PRIORITY, so
// it runs automatically at load time rather than being called explicitly.
// Several symbols deliberately share one handler (see the notes below).
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::abs, absHandler);
  registerHandler(c10::aten::acos, acosHandler);
  registerHandler(c10::aten::acosh, acoshHandler);
  registerHandler(c10::aten::addmm, addmmHandler);
  registerHandler(c10::aten::asin, asinHandler);
  registerHandler(c10::aten::asinh, asinhHandler);
  registerHandler(c10::aten::atan, atanHandler);
  registerHandler(c10::aten::atan2, atan2Handler);
  registerHandler(c10::aten::atanh, atanhHandler);
  registerHandler(c10::aten::cat, catHandler);
  registerHandler(c10::aten::ceil, ceilHandler);
  registerHandler(c10::aten::celu, celuHandler);
  registerHandler(c10::aten::constant_pad_nd, constantPadNdHandler);
  registerHandler(c10::aten::cos, cosHandler);
  registerHandler(c10::aten::cosh, coshHandler);
  registerHandler(c10::aten::detach, detachHandler);
  registerHandler(c10::aten::div, divHandler);
  registerHandler(c10::aten::elu, eluHandler);
  registerHandler(c10::aten::eq, eqHandler);
  registerHandler(c10::aten::erf, erfHandler);
  registerHandler(c10::aten::erfc, erfcHandler);
  registerHandler(c10::aten::exp, expHandler);
  registerHandler(c10::aten::expm1, expm1Handler);
  registerHandler(c10::aten::floor, floorHandler);
  registerHandler(c10::aten::fmod, fmodHandler);
  registerHandler(c10::aten::ge, geHandler);
  registerHandler(c10::aten::gt, gtHandler);
  registerHandler(c10::aten::hardshrink, hardshrinkHandler);
  registerHandler(c10::aten::hardtanh, hardtanhHandler);
  registerHandler(c10::aten::hinge_embedding_loss, hingeEmbeddingLossHandler);
  registerHandler(c10::aten::index_select, indexSelectHandler);
  registerHandler(c10::aten::isnan, isnanHandler);
  registerHandler(c10::aten::l1_loss, l1LossHandler);
  registerHandler(c10::aten::le, leHandler);
  registerHandler(c10::aten::leaky_relu, leakyReluHandler);
  registerHandler(c10::aten::log, logHandler);
  registerHandler(c10::aten::log10, log10Handler);
  registerHandler(c10::aten::log1p, log1pHandler);
  registerHandler(c10::aten::log2, log2Handler);
  // log_sigmoid and log_sigmoid_forward share one handler.
  registerHandler(c10::aten::log_sigmoid, logSigmoidHandler);
  registerHandler(c10::aten::log_sigmoid_forward, logSigmoidHandler);
  registerHandler(c10::aten::logical_and, logicalAndHandler);
  registerHandler(c10::aten::logical_not, logicalNotHandler);
  registerHandler(c10::aten::logical_or, logicalOrHandler);
  registerHandler(c10::aten::lt, ltHandler);
  registerHandler(c10::aten::margin_ranking_loss, marginRankingLossHandler);
  registerHandler(c10::aten::masked_fill, maskedFillHandler);
  registerHandler(c10::aten::mse_loss, mseLossHandler);
  registerHandler(c10::aten::ne, neHandler);
  registerHandler(c10::aten::neg, negHandler);
  registerHandler(c10::aten::normal_, normalInPlaceHandler);
  registerHandler(c10::aten::pixel_shuffle, pixelShuffleHandler);
  registerHandler(c10::aten::pow, powHandler);
  registerHandler(c10::aten::rand, randHandler);
  registerHandler(c10::aten::randn, randnHandler);
  registerHandler(c10::aten::reciprocal, reciprocalHandler);
  // The 1d reflection-pad handler also serves the 2d variant.
  registerHandler(c10::aten::reflection_pad1d, reflectionPad1dHandler);
  registerHandler(c10::aten::reflection_pad2d, reflectionPad1dHandler);
  registerHandler(c10::aten::relu, reluHandler);
  registerHandler(c10::aten::remainder, remainderHandler);
  // The 1d replication-pad handler also serves the 2d and 3d variants.
  registerHandler(c10::aten::replication_pad1d, replicationPad1dHandler);
  registerHandler(c10::aten::replication_pad2d, replicationPad1dHandler);
  registerHandler(c10::aten::replication_pad3d, replicationPad1dHandler);
  registerHandler(c10::aten::round, roundHandler);
  registerHandler(c10::aten::rsqrt, rsqrtHandler);
  registerHandler(c10::aten::rsub, rsubHandler);
  registerHandler(c10::aten::selu, seluHandler);
  registerHandler(c10::aten::sigmoid, sigmoidHandler);
  registerHandler(c10::aten::sign, signHandler);
  registerHandler(c10::aten::silu, siluHandler);
  registerHandler(c10::aten::sin, sinHandler);
  registerHandler(c10::aten::sinh, sinhHandler);
  registerHandler(c10::aten::smooth_l1_loss, smoothL1LossHandler);
  registerHandler(c10::aten::soft_margin_loss, softMarginLossHandler);
  registerHandler(c10::aten::softshrink, softshrinkHandler);
  registerHandler(c10::aten::sort, sortHandler);
  registerHandler(c10::aten::sqrt, sqrtHandler);
  registerHandler(c10::aten::square, squareHandler);
  registerHandler(c10::aten::sub, subHandler);
  registerHandler(c10::aten::t, tHandler);
  registerHandler(c10::aten::tan, tanHandler);
  registerHandler(c10::aten::tanh, tanhHandler);
  registerHandler(c10::aten::threshold, thresholdHandler);
  registerHandler(c10::aten::topk, topkHandler);
  registerHandler(c10::aten::uniform_, uniformInPlaceHandler);
  registerHandler(c10::aten::where, whereHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/BilinearOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include <poptorch/OpBuilder.hpp>
#include <poptorch/Utils.hpp>
#include <poptorch_logging/Error.hpp>
#include <poptorch_logging/Logging.hpp>

namespace poptorch {
namespace {

// Canonicalises
//   aten::bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias)
//   -> Tensor
//
// A bilinear layer outputs a linear combination of feature inputs:
//
//     Ynm = \sum_ij Un_i Am_ij Vn_j + bm
//
// where U and V are the (possibly ND) feature inputs, A is the 3D weight
// tensor and b the bias vector. The equivalent PyTorch sequence, which is
// what gets emitted below, is:
//
//     U = U.unsqueeze(-2).unsqueeze(-2)
//     V = V.unsqueeze(-2).unsqueeze(-1)
//     Y = U.matmul(A).matmul(V)
//     Y = Y.squeeze(-1).squeeze(-1)
//     Y = Y + b
torch::jit::Node *bilinearHandler(torch::jit::Graph *graph,
                                  torch::jit::Node *node) {
  // Feature inputs, weight and optional bias.
  torch::jit::Value *const lhs = node->input(0);
  torch::jit::Value *const rhs = node->input(1);
  torch::jit::Value *const weight = node->input(2);
  torch::jit::Value *const bias = node->input(3);

  // U: two singleton dimensions go in front of the last dimension.
  std::vector<std::int64_t> lhs_shape = shapeFromTensor(lhs);
  lhs_shape.insert(lhs_shape.end() - 1, std::size_t{2}, std::int64_t{1});
  torch::jit::Node *const lhs_expanded = createReshape(graph, lhs, lhs_shape);

  // V: one singleton dimension in front of the last dimension and one
  // after it.
  std::vector<std::int64_t> rhs_shape = shapeFromTensor(rhs);
  rhs_shape.insert(rhs_shape.end() - 1, 1);
  rhs_shape.push_back(1);
  torch::jit::Node *const rhs_expanded = createReshape(graph, rhs, rhs_shape);

  // (U * A) * V
  torch::jit::Node *const lhs_times_weight =
      poptorch::createMatmul(graph, {lhs_expanded->output(), weight});
  torch::jit::Node *const product = poptorch::createMatmul(
      graph, {lhs_times_weight->output(), rhs_expanded->output()});

  // The trailing singleton dimensions are squeezed away by reshaping to the
  // expected result size, which in PyTorch terms is
  //
  //    U.shape[0:-1] + (A.shape[0],)
  //
  // The expanded U shape carries three trailing entries (1, 1, features)
  // that are not part of the leading dimensions, hence the "- 3".
  std::vector<std::int64_t> out_shape(lhs_shape.begin(), lhs_shape.end() - 3);
  out_shape.push_back(shapeFromTensor(weight).front());
  torch::jit::Node *output =
      createReshape(graph, product->output(), out_shape);

  // The bias is optional: only add it when one was supplied.
  const bool has_bias = !isNone(bias->node());
  if (has_bias) {
    output = poptorch::createAdd(graph, {output->output(), bias});
  }
  return output;
}
} // namespace

// Registers the bilinear handler against aten::bilinear. Declared as a
// constructor function with priority HANDLER_INIT_PRIORITY, so it runs
// automatically at load time rather than being called explicitly.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::bilinear, bilinearHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/BitwiseOps.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "../PoptorchSymbols.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace {

// PyTorch's bitwise_* functions accept any integral tensor as input (i.e.
// torch.{uint8,int8,int16,int32,int64}). However, Poplibs' element-wise
// binary ops don't support 8-bit int inputs (see
// popops/codelets/elementwiseBinaryCodelets.cpp). This helper raises an
// error, prefixed with the operator name, when all inputs of the node are
// Byte or all are Char, so that the user gets a slightly nicer message.
void verifyCompatibleIntegralInputs(torch::jit::Node *node,
                                    const std::string &op_name) {
  const bool only_8bit_inputs = allInputsOfType(node, at::ScalarType::Byte) ||
                                allInputsOfType(node, at::ScalarType::Char);
  ERROR_ON_MSG(only_8bit_inputs,
               op_name + ": Poplar does not support binary operations on "
                         "8-bit integral types.");
}

// Canonicalises bitwise-and. Bool inputs are lowered to a logical-and;
// integer inputs are first checked for unsupported 8-bit types and then
// lowered to a bitwise-and. Any other input type is an error.
torch::jit::Node *bitwiseAndHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  torch::jit::Value *const lhs = node->input(0);
  torch::jit::Value *const rhs = node->input(1);

  if (allInputsBool(node)) {
    return createLogical_and(graph, {lhs, rhs});
  }
  if (!allInputsInteger(node)) {
    ERROR("Bitwise-and operator supports only bool and integer types");
    return nullptr;
  }
  verifyCompatibleIntegralInputs(node, "Bitwise-and");
  return createBitwiseand(graph, {lhs, rhs});
}

// Canonicalises bitwise-not. A bool input is lowered to a logical-not; an
// integer input is first checked for unsupported 8-bit types and then
// lowered to a bitwise-not. Any other input type is an error.
torch::jit::Node *bitwiseNotHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  torch::jit::Value *const operand = node->input(0);

  if (allInputsBool(node)) {
    return createLogical_not(graph, {operand});
  }
  if (!allInputsInteger(node)) {
    ERROR("Bitwise-not operator supports only bool and integer types");
    return nullptr;
  }
  verifyCompatibleIntegralInputs(node, "Bitwise-not");
  return createBitwisenot(graph, {operand});
}

// Canonicalises bitwise-or. Bool inputs are lowered to a logical-or;
// integer inputs are first checked for unsupported 8-bit types and then
// lowered to a bitwise-or. Any other input type is an error.
torch::jit::Node *bitwiseOrHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  torch::jit::Value *const lhs = node->input(0);
  torch::jit::Value *const rhs = node->input(1);

  if (allInputsBool(node)) {
    return createLogical_or(graph, {lhs, rhs});
  }
  if (!allInputsInteger(node)) {
    ERROR("Bitwise-or operator supports only bool and integer types");
    return nullptr;
  }
  verifyCompatibleIntegralInputs(node, "Bitwise-or");
  return createBitwiseor(graph, {lhs, rhs});
}

// Canonicalises bitwise-xor. Bool inputs are lowered to a logical-xor;
// integer inputs are first checked for unsupported 8-bit types and then
// lowered to a bitwise-xor. Any other input type is an error.
torch::jit::Node *bitwiseXorHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  torch::jit::Value *const lhs = node->input(0);
  torch::jit::Value *const rhs = node->input(1);

  if (allInputsBool(node)) {
    return createLogical_xor(graph, {lhs, rhs});
  }
  if (!allInputsInteger(node)) {
    ERROR("Bitwise-xor operator supports only bool and integer types");
    return nullptr;
  }
  verifyCompatibleIntegralInputs(node, "Bitwise-xor");
  return createBitwisexor(graph, {lhs, rhs});
}
} // namespace

// Registers the bitwise handlers. Declared as a constructor function with
// priority HANDLER_INIT_PRIORITY, so it runs automatically at load time
// rather than being called explicitly.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::bitwise_and, bitwiseAndHandler);
  registerHandler(c10::aten::bitwise_not, bitwiseNotHandler);
  registerHandler(c10::aten::bitwise_or, bitwiseOrHandler);
  registerHandler(c10::aten::bitwise_xor, bitwiseXorHandler);
  // The __and__/__or__/__xor__ symbols reuse the corresponding bitwise_*
  // handlers.
  registerHandler(c10::aten::__and__, bitwiseAndHandler);
  registerHandler(c10::aten::__or__, bitwiseOrHandler);
  registerHandler(c10::aten::__xor__, bitwiseXorHandler);
}
} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/BlasOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "../PoptorchSymbols.hpp"

namespace poptorch {
namespace {

// Canonicalises the matrix multiplication ops registered against this
// handler, e.g.
//   aten::mm(Tensor self, Tensor mat2) -> Tensor
//   aten::matmul(Tensor self, Tensor other) -> Tensor
//
// The batch dimensions of matrix A in matmul(A, B) are fused when the
// following pattern is found:
//
//    matrix A(N, M, K) multiplies matrix B(K, L)
//
// where matrix A is the matmul's input 0 and matrix B its input 1.
// Matrix A is reshaped into A(N*M, K) before the matmul. The benefit of this
// transformation is to avoid the ReduceSum of the backward pass, which is a
// performance bottleneck otherwise.
//
// The input IR before canonicalization:
// %output : Float(3:14, 2:7, 7:1) = aten::matmul(%input.1, %27)
// The transformation takes 3 steps:
// 1. Reshape
// 2. Matmul
// 3. Reshape
// The output IR after canonicalization:
// %28 : Float(6:7, 7:1) =
//       popart::reshape_static_shape[shape=[6, 7]](%input.1)
// %29 : FloatTensor = popart::matmul(%28, %27)
// %30 : Float(3:14, 2:7, 7:1) =
//       popart::reshape_static_shape[shape=[3, 2, 7]](%29)
//
// Every other combination of shapes is lowered to a single matmul.
torch::jit::Node *matmulHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  torch::jit::Value *const matrix_a = node->input(0);
  torch::jit::Value *const matrix_b = node->input(1);

  const std::vector<std::int64_t> a_shape = shapeFromTensor(matrix_a);
  const std::vector<std::int64_t> b_shape = shapeFromTensor(matrix_b);
  const std::int64_t a_rank = a_shape.size();
  const std::int64_t b_rank = b_shape.size();

  // Matrix A can have any number of batch dimensions, but matrix B has to be
  // 2D and its first dimension must match A's last one.
  const bool can_fuse_batch =
      a_rank >= 3 && b_rank == 2 && a_shape[a_rank - 1] == b_shape[0];
  if (!can_fuse_batch) {
    // The "normal" matmul follows the original path.
    return createMatmul(graph, {matrix_a, matrix_b});
  }

  // The final output keeps every dimension of A except the last one, which
  // is replaced by the last dimension of B.
  std::vector<std::int64_t> output_shape(a_shape.begin(), a_shape.end() - 1);

  // All the dimensions kept above collapse into the single row dimension of
  // the fused, 2D, matrix A.
  std::int64_t merged_rows = 1;
  for (const std::int64_t dim : output_shape) {
    merged_rows *= dim;
  }
  output_shape.push_back(b_shape[b_rank - 1]);

  const std::vector<std::int64_t> fused_a_shape = {merged_rows,
                                                   a_shape[a_rank - 1]};

  // 1. Reshape matrix A to merge all of its batch size dimensions
  torch::jit::Node *const merge_mat =
      createReshape(graph, matrix_a, fused_a_shape);
  // 2. Matmul
  torch::jit::Node *const mul =
      createMatmul(graph, {merge_mat->output(), matrix_b});
  // 3. Reshape to the expected shape of the original matmul
  torch::jit::Node *const result =
      createReshape(graph, mul->output(), output_shape);

  // Trace the before and after IRs to ease debugging.
  logging::trace("Replacing matmul {} with {} {} {}", *node, *merge_mat, *mul,
                 *result);
  return result;
}

// Canonicalises aten::baddbmm as
//     add(mul(matmul(batch1, batch2), alpha), mul(input, beta))
//
// Inputs: 0 = input, 1 = batch1, 2 = batch2, 3 = beta, 4 = alpha.
torch::jit::Node *baddbmmHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  torch::jit::Value *const input = node->input(0);
  torch::jit::Value *const batch1 = node->input(1);
  torch::jit::Value *const batch2 = node->input(2);
  torch::jit::Value *const beta = node->input(3);
  torch::jit::Value *const alpha = node->input(4);

  const auto batch2_dtype =
      *batch2->type()->expect<c10::TensorType>()->scalarType();
  const auto input_dtype =
      *input->type()->expect<c10::TensorType>()->scalarType();

  // alpha * (batch1 @ batch2)
  torch::jit::Value *const product =
      createMatmul(graph, {batch1, batch2})->output();
  torch::jit::Value *const scaled_product =
      createMul(graph, {product, alpha})->output();

  // beta * input. PyTorch type inference dictates that the output scalar
  // type is that of the second batch input, so the scaled input is cast when
  // its type differs.
  torch::jit::Value *scaled_input = createMul(graph, {input, beta})->output();
  const bool needs_cast = batch2_dtype != input_dtype;
  if (needs_cast) {
    scaled_input = createCast(graph, scaled_input, batch2_dtype)->output();
  }

  return createAdd(graph, {scaled_product, scaled_input});
}

// Canonicalises aten::addmv: beta * input + alpha * (mat @ vec).
//
// Inputs: 0 = input, 1 = mat, 2 = vec, 3 = beta, 4 = alpha. Both scale
// factors must be constants; a term whose factor is zero is not emitted at
// all, and when both are zero the result is a zero constant shaped like
// input.
torch::jit::Node *addmvHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *const input = node->input(0);
  torch::jit::Value *const mat = node->input(1);
  torch::jit::Value *const vec = node->input(2);
  torch::jit::Value *const beta = node->input(3);
  torch::jit::Value *const alpha = node->input(4);

  const auto alpha_val = constantToFloat(alpha->node());
  const auto beta_val = constantToFloat(beta->node());

  const bool has_product_term = alpha_val != 0;
  const bool has_input_term = beta_val != 0;

  // Neither term contributes: the output is all zeros.
  if (!has_product_term && !has_input_term) {
    return createConstantFloatLike(graph, input, {0}, {shapeFromTensor(input)});
  }

  // alpha * (mat @ vec), only when alpha is non-zero.
  torch::jit::Node *scaled_product = nullptr;
  if (has_product_term) {
    torch::jit::Value *const product =
        createMatmul(graph, {mat, vec})->output();
    scaled_product = createMul(graph, {product, alpha});
  }

  if (!has_input_term) {
    return scaled_product;
  }

  // beta * input, only when beta is non-zero.
  torch::jit::Node *const scaled_input = createMul(graph, {input, beta});
  if (scaled_product == nullptr) {
    return scaled_input;
  }
  return createAdd(graph, {scaled_product->output(), scaled_input->output()});
}
} // namespace

// Registers the BLAS-style handlers. Declared as a constructor function with
// priority HANDLER_INIT_PRIORITY, so it runs automatically at load time
// rather than being called explicitly. All the plain product ops below share
// matmulHandler.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::matmul, matmulHandler);
  registerHandler(c10::aten::baddbmm, baddbmmHandler);
  registerHandler(c10::aten::addmv, addmvHandler);

  // Matrix-Vector
  registerHandler(c10::aten::mv, matmulHandler);

  // Vector-Vector
  registerHandler(c10::aten::dot, matmulHandler);

  // Batched Matrix-Matrix
  registerHandler(c10::aten::bmm, matmulHandler);

  // Matrix-Matrix
  registerHandler(c10::aten::mm, matmulHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/ConstantOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <algorithm>
#include <random>

#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "../PoptorchSymbols.hpp"

namespace poptorch {
namespace {

// Shared handler for the constant-fill ops:
//
// aten::ones(int[] size, *, int? dtype, int? layout, Device? device, bool?
//            pin_memory) -> Tensor
// aten::zeros(int[] size, *, int? dtype, int? layout, Device? device, bool?
//             pin_memory) -> Tensor
// aten::zeros_like(Tensor self, ScalarType? dtype, Layout? layout, Device?
//                  device, bool? pin_memory, MemoryFormat? memory_format)
//                  -> Tensor
// aten::ones_like(Tensor self, ScalarType? dtype, Layout? layout, Device?
//                 device, bool? pin_memory, MemoryFormat? memory_format)
//                 -> Tensor
// as well as aten::new_ones and aten::new_zeros.
//
// A poptorch::ones or poptorch::zeros node is created with the scalar type
// of the original output. Its shape attribute comes from the original
// output's type, except for the new_* variants where it is read from the
// constant size list in input 1.
torch::jit::Node *onesZerosHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  const torch::jit::Symbol kind = node->kind();

  bool fill_with_ones = false;
  if (kind == c10::aten::ones || kind == c10::aten::ones_like ||
      kind == c10::aten::new_ones) {
    fill_with_ones = true;
  }
  const bool is_new_variant =
      kind == c10::aten::new_ones || kind == c10::aten::new_zeros;

  torch::jit::Value *const original_output = node->output();
  torch::jit::Node *const fill_node = createAndInsertNode(
      graph,
      fill_with_ones ? symbols::poptorch::ones : symbols::poptorch::zeros, {},
      ImplicitCast::None, OutputType::AsDtype, 1,
      getNodeScalarType(original_output));

  if (!is_new_variant) {
    fill_node->is_(c10::attr::shape, shapeFromTensor(original_output));
    return fill_node;
  }

  // new_ones / new_zeros: every entry of the size list has to be a constant.
  const auto size_values = handleTensorList(node->input(1)->node());
  std::vector<int64_t> requested_shape;
  for (auto *size_value : size_values) {
    ERROR_ON_MSG(
        !isTensorConstant(size_value->node()),
        "Invalid shape for "
        "new_zeros or new_ones. Shape needs to be a static constant");
    requested_shape.emplace_back(constantToInt(size_value->node()));
  }
  fill_node->is_(c10::attr::shape, requested_shape);
  return fill_node;
}

// Canonicalises aten::arange into an integer constant holding the values
// start, start + step, start + 2 * step, ... strictly before end.
//
// The overload is identified by the number of inputs of the node (see the
// schemas next to each case). The bounds and the step have to be constants.
//
// The arithmetic is done on signed integers so that negative bounds and
// negative steps are handled, and the number of elements is
// ceil((end - start) / step), e.g. arange(0, 5, 2) yields {0, 2, 4}. A range
// that is empty (or whose bounds are inconsistent with the sign of the step)
// produces an empty constant. A zero step is rejected.
torch::jit::Node *arangeHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  std::int64_t start = 0;
  std::int64_t end = 0;
  std::int64_t step = 1;

  switch (node->inputs().size()) {
  // arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)
  // arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None,
  //        Device? device=None, bool? pin_memory=None) -> Tensor
  case 2:
  case 5:
    end = constantToLong(node->input(0)->node());
    break;
  // arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None,
  //              Layout? layout=None, Device? device=None,
  //              bool? pin_memory=None) -> Tensor
  case 6:
    start = constantToLong(node->input(0)->node());
    end = constantToLong(node->input(1)->node());
    break;
  // arange.start_out(Scalar start, Scalar end, Scalar step=1, *,
  //                  Tensor(a!) out) -> Tensor(a!)
  // arange.start_step(Scalar start, Scalar end, Scalar step, *,
  //                   ScalarType? dtype=None, Layout? layout=None,
  //                   Device? device=None, bool? pin_memory=None) -> Tensor
  case 4:
  case 7:
    start = constantToLong(node->input(0)->node());
    end = constantToLong(node->input(1)->node());
    step = constantToLong(node->input(2)->node());
    break;
  default:
    ERROR("Unsupported arange op");
    break;
  }

  ERROR_ON_MSG(step == 0, "arange: the step must be non-zero");

  // Number of elements: the distance to cover divided by the step size,
  // rounded up.
  std::int64_t count = 0;
  if (step > 0 && end > start) {
    count = (end - start - 1) / step + 1;
  } else if (step < 0 && end < start) {
    count = (start - end - 1) / (-step) + 1;
  }

  std::vector<std::int64_t> vals;
  vals.reserve(static_cast<std::size_t>(count));
  std::int64_t value = start;
  for (std::int64_t i = 0; i < count; ++i) {
    vals.push_back(value);
    value += step;
  }

  return createConstantInt(graph, vals,
                           {static_cast<std::int64_t>(vals.size())});
}

// Canonicalises
//   aten::randperm(Scalar n, ScalarType dtype, Layout, Device, bool pin_memory)
//
// The permutation is built by drawing n uniform random floats and taking the
// indices returned by a sorted top-n over them; those indices are cast to
// Int. The constant node producing n is rewritten in place to hold a Long
// tensor, and its output type is re-inferred from the new value.
torch::jit::Node *randpermHandler(torch::jit::Graph *graph,
                                  torch::jit::Node *node) {
  // In-place cast of the n constant to Long.
  torch::jit::Node *const n_node = node->input(0)->node();
  setNodeTensorAttrValue(
      n_node, getNodeTensorAttrValue(n_node).to(at::ScalarType::Long));
  n_node->output()->inferTypeFrom(getNodeTensorAttrValue(n_node));
  torch::jit::Value *const permutation_size = n_node->output();

  // n random floats, created with bounds 1.0 and 0.0.
  const std::vector<int64_t> random_shape = {constantToLong(n_node)};
  const auto random_dtype = c10::ScalarType::Float;
  torch::jit::Node *const random_values = createRandomUniform(
      graph, nullptr, random_shape, 1.0, 0.0, random_dtype);

  // Output 1 of the top-k provides the indices, i.e. the permutation.
  torch::jit::Node *const ranked =
      createTopk(graph, {random_values->output(), permutation_size}, 0,
                 true /*largest*/, true /*sorted*/);

  return createCast(graph, ranked->output(1), c10::ScalarType::Int);
}

} // namespace

// Registers the constant-producing handlers. Declared as a constructor
// function with priority HANDLER_INIT_PRIORITY, so it runs automatically at
// load time rather than being called explicitly.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::arange, arangeHandler);
  // All the ones/zeros flavours share a single handler, which tells them
  // apart by looking at the kind of the node.
  registerHandler(c10::aten::ones, onesZerosHandler);
  registerHandler(c10::aten::ones_like, onesZerosHandler);
  registerHandler(c10::aten::new_ones, onesZerosHandler);
  registerHandler(c10::aten::new_zeros, onesZerosHandler);
  registerHandler(c10::aten::zeros, onesZerosHandler);
  registerHandler(c10::aten::zeros_like, onesZerosHandler);
  registerHandler(c10::aten::randperm, randpermHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/ConvolutionOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace {

torch::jit::Node *convolutionHandler(torch::jit::Graph *graph,
                                     torch::jit::Node *node) {
  // aten::_convolution(Tensor input, Tensor weight, Tensor? bias, int[]
  //                    stride, int[] padding, int[] dilation, bool transposed,
  //                    int[] output_padding, int groups) -> Tensor
  //
  // Emits either a conv or a conv-transpose node depending on `transposed`.
  const bool is_transposed = constantToBool(node->input(6)->node());

  torch::jit::Value *data = node->input(0);
  torch::jit::Value *weight = node->input(1);
  torch::jit::Value *bias = node->input(2);
  castWeightAndBias(graph, data, weight, bias);

  // The bias is optional: only forward it when one was actually supplied.
  std::vector<torch::jit::Value *> conv_inputs{data, weight};
  if (!isNone(bias->node())) {
    conv_inputs.push_back(bias);
  }

  const std::vector<std::int64_t> stride =
      constantToLongVec(node->input(3)->node());

  // PyTorch supplies one padding value per axis, applied on both sides.
  // PopART takes two values per axis, laid out as
  // (Axis0Left, ..., AxisNLeft, Axis0Right, ..., AxisNRight), so the PyTorch
  // list is simply repeated once.
  const std::vector<std::int64_t> symmetric_padding =
      constantToLongVec(node->input(4)->node());
  std::vector<std::int64_t> padding(symmetric_padding);
  padding.insert(padding.end(), symmetric_padding.begin(),
                 symmetric_padding.end());

  const std::vector<std::int64_t> dilation =
      constantToLongVec(node->input(5)->node());
  const std::vector<std::int64_t> output_padding =
      constantToLongVec(node->input(7)->node());
  const std::int64_t groups = constantToLong(node->input(8)->node());

  if (is_transposed) {
    return createConvtranspose(graph, conv_inputs, dilation, groups, {},
                               output_padding, {}, padding, stride);
  }

  // A "normal" convolution: output_padding is only meaningful for a
  // transposed convolution, so it must not be positive here.
  for (auto out_pad : output_padding) {
    ERROR_ON(out_pad > 0);
  }

  return createConv(graph, conv_inputs, dilation, groups, {}, padding, stride);
}

torch::jit::Node *conv2dHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  // Handles both of:
  //
  // aten::conv2d(Tensor input, Tensor weight, Tensor? bias, int[] stride,
  //              int[] padding, int[] dilation, int groups) -> Tensor
  //
  // aten::mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[]
  // padding, int[] stride, int[] dilation, int groups) -> (Tensor)
  //
  // The two schemas only differ in the order of `stride` and `padding`.
  torch::jit::Value *data = node->input(0);
  torch::jit::Value *weight = node->input(1);
  torch::jit::Value *bias = node->input(2);
  castWeightAndBias(graph, data, weight, bias);

  // The bias is optional: only forward it when one was actually supplied.
  std::vector<torch::jit::Value *> conv_inputs{data, weight};
  if (!isNone(bias->node())) {
    conv_inputs.push_back(bias);
  }

  std::uint32_t stride_index = 3;
  std::uint32_t padding_index = 4;
  if (node->kind() == c10::aten::mkldnn_convolution) {
    stride_index = 4;
    padding_index = 3;
  }

  const std::vector<std::int64_t> stride =
      constantToLongVec(node->input(stride_index)->node());

  // PyTorch supplies one padding value per axis, applied on both sides.
  // PopART takes two values per axis, laid out as
  // (Axis0Left, ..., AxisNLeft, Axis0Right, ..., AxisNRight), so the PyTorch
  // list is simply repeated once.
  const std::vector<std::int64_t> symmetric_padding =
      constantToLongVec(node->input(padding_index)->node());
  std::vector<std::int64_t> padding(symmetric_padding);
  padding.insert(padding.end(), symmetric_padding.begin(),
                 symmetric_padding.end());

  const std::vector<std::int64_t> dilation =
      constantToLongVec(node->input(5)->node());
  const std::int64_t groups = constantToLong(node->input(6)->node());

  return poptorch::createConv(graph, conv_inputs, dilation, groups, {}, padding,
                              stride);
}

torch::jit::Node *cumsumHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  // Lowers aten::cumsum(input, dim, dtype[, out]) to a 1-d convolution with a
  // kernel of ones: the input is cast to the requested output dtype, the
  // summed dimension is moved to the front, the data is laid out as
  // [M, 1, N, 1] and convolved with a [1, 1, span, 1] kernel using span - 1
  // leading padding, and the result is finally moved back to the input layout.
  torch::jit::Value *data = node->input(0);
  std::vector<int64_t> data_shape = shapeFromTensor(data);
  int64_t dim = constantToLong(node->input(1)->node());
  const int64_t r = static_cast<int64_t>(data_shape.size());

  // torch.cumsum accepts dim in [-1, 0] for a 0-d tensor, i.e. it wraps the
  // dimension as if the rank was 1. Checking against the real rank (0) would
  // reject every value of dim for scalars.
  const int64_t dim_range = r == 0 ? 1 : r;
  ERROR_ON_MSG(dim < -dim_range || dim > dim_range - 1,
               "Dimension out of range.");

  if (dim < 0) {
    dim += dim_range;
  }

  // By default, the output's `dtype` should match the input's.
  at::ScalarType requested_output_dtype = getNodeScalarType(data);

  if (node->inputs().size() == 4) {
    // We've been called with the form `torch.cumsum(..., out=output)`, so the
    // output tensor's `dtype` gets used as per the `torch.cumsum` spec.
    requested_output_dtype = getNodeScalarType(node->input(3));
  } else if (!isNone(node->input(2))) {
    // We've been called with an explicit `dtype`, so use that.
    requested_output_dtype = constantToScalarType(node->input(2)->node());
  }

  // We have to cast the input tensor to the output `dtype` *before* doing the
  // sum, to conform with the API of `torch.cumsum`.
  data = createCast(graph, data, requested_output_dtype)->output();

  // The 1-D conv kernel span is the size in the dim we are reducing along.
  // A scalar has no dimension to index, it behaves like a single element.
  const int64_t span =
      r == 0 ? 1 : data_shape[static_cast<std::size_t>(dim)];

  if (span < 2) {
    // cumsum in singleton dimension or scalar/empty: nothing to accumulate.
    return createIdentity(graph, {data});
  }

  // Create the 1-d conv kernel
  const std::vector<double> kernel_data(static_cast<std::size_t>(span), 1.0);
  torch::jit::Value *ones =
      createConstantFloatLike(graph, data, kernel_data, {span})->output();

  // ONNX conv expects the kernel to have size M x C/group X kW X kW
  // So reshape the kernel to have size [1,1,span,1]
  std::vector<int64_t> kernel_shape(4, 1);
  kernel_shape[2] = span;
  torch::jit::Value *k = createReshape(graph, ones, kernel_shape)->output();

  // Permutation swapping axis 0 and axis `dim`. It is its own inverse, so the
  // same permutation is used to restore the original axes at the end.
  std::vector<int64_t> swap_perm;
  if (dim != 0) {
    // Transpose input so that we can apply the 1-d conv assuming dim==0
    swap_perm.resize(static_cast<std::size_t>(r));
    std::iota(swap_perm.begin(), swap_perm.end(), 0);
    std::swap(swap_perm[0], swap_perm[dim]);
    data = createTranspose(graph, {data}, swap_perm)->output();
    std::swap(data_shape[0], data_shape[dim]);
  }

  // Coerce into [N,M] 2-d tensor
  if (r < 2) {
    data = createUnsqueeze(graph, {data}, {1})->output();
  }
  if (r > 2) {
    data = createFlatten(graph, {data}, 1)->output();
  }

  // ONNX conv expects the input data to have size batch X channel x H X W
  // So we reshape the [N,M] 2-d data to [M,1,N,1] and apply the 1-d conv
  // kernel of ones with [span-1,0] padding above and below.
  torch::jit::Value *x = createUnsqueeze(graph, {data}, {2, 3})->output();
  x = createTranspose(graph, {x}, {1, 2, 0, 3})->output();
  x = createCast(graph, x, c10::ScalarType::Float)->output();

  torch::jit::Value *y =
      createConv(graph, {x, k}, {}, 1, {}, {span - 1, 0, 0, 0}, {})->output();

  // Unfortunately we have to cast again here, because `createConv` always
  // returns a float-typed tensor. We can't *only* cast here either, because
  // cast -> sum != sum -> cast when going from float to int, and the spec of
  // `torch.cumsum` says to cast first. If we don't cast at all, info about our
  // size doesn't get transmitted to later ops relying on us (eg. `select`).
  y = createCast(graph, y, requested_output_dtype)->output();

  // Work back to the correct expected output shape
  y = createTranspose(graph, {y}, {2, 0, 1, 3})->output();
  y = createReshape(graph, y, data_shape)->output();

  if (dim != 0) {
    // Transpose back to the original axes orientation.
    y = createTranspose(graph, {y}, swap_perm)->output();
  }

  return y->node();
}

torch::jit::Node *cumprodHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  // Lowers aten::cumprod by unrolling it along `dim`: starting from a copy of
  // the input, the element at index i is replaced by the product of the
  // (already accumulated) element at index i - 1 and the element at index i,
  // for i = 1 .. size(dim) - 1. One slice/slice/mul/dynamic-update group is
  // emitted per step, so the graph grows linearly with the size of `dim`.
  torch::jit::Value *data = node->input(0);
  const std::vector<int64_t> data_shape = shapeFromTensor(data);
  const int64_t dim = handleDimensionParam(
      node->input(1), data->type()->expect<c10::TensorType>());
  const int64_t num_iters = data_shape.at(dim);

  // Pins the output type of `n` to the input sizes: every intermediate result
  // has the same shape as the input.
  const auto set_input_sizes = [&data_shape](torch::jit::Node *n) {
    n->output()->setType(
        n->output()->type()->expect<c10::TensorType>()->withSizes(data_shape));
  };

  auto *result = createIdentity(graph, {data});
  set_input_sizes(result);

  for (int64_t i = 1; i < num_iters; ++i) {
    const auto src_slice_idx = i - 1;
    const auto dst_slice_idx = i;

    // Single-index slices along `dim` at positions i - 1 and i of the running
    // result.
    auto *const src = createSlice(graph, {result->output()},
                                  {src_slice_idx + 1}, {src_slice_idx}, {dim})
                          ->output();
    auto *const dst = createSlice(graph, {result->output()},
                                  {dst_slice_idx + 1}, {dst_slice_idx}, {dim})
                          ->output();
    auto *const new_val = createMul(graph, {src, dst})->output();

    // Write the product back at position i.
    const std::vector<torch::jit::Value *> args{
        result->output(), wrapInConstantVec(graph, {dst_slice_idx}), new_val};

    result = createDynamicupdate(graph, args, {dim}, {1}, 0);
    set_input_sizes(result);
  }

  return result;
}

} // namespace

__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::convolution, convolutionHandler);
  registerHandler(c10::aten::convolution_overrideable, convolutionHandler);
  registerHandler(c10::aten::_convolution, convolutionHandler);
  registerHandler(c10::aten::mkldnn_convolution, conv2dHandler);
  registerHandler(c10::aten::conv2d, conv2dHandler);
  registerHandler(c10::aten::cumsum, cumsumHandler);
  registerHandler(c10::aten::cumprod, cumprodHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/CustomOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <torch/csrc/jit/ir/ir.h>

#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace {

torch::jit::Node *customOpHandler(torch::jit::Graph *graph,
                                  torch::jit::Node *node) {
  // Inputs read from the poptorch custom_operation node:
  //   0: list of input tensors
  //   1: op name
  //   2: op domain
  //   3: domain version
  //   4: number of outputs
  //   6: id (as a string) of the Python dict holding the attributes
  // Input 5 is not read by this handler.
  std::vector<torch::jit::Value *> op_inputs =
      handleTensorList(node->input(0)->node());
  std::string op_name = constantToString(node->input(1)->node());
  std::string op_domain = constantToString(node->input(2)->node());
  const std::int64_t op_domain_version =
      constantToLong(node->input(3)->node());
  const std::int64_t output_count = constantToLong(node->input(4)->node());
  auto attributes_id = constantToString(node->input(6)->node());

  // Emit the custom op itself; it has a variadic number of outputs.
  torch::jit::Node *replacement =
      createCustomOperation(graph, op_inputs, op_name, op_domain,
                            op_domain_version, output_count, attributes_id);

  // The node being replaced returned a list, so gather the outputs in a list
  // construct to keep the IR legal.
  return createAndInsertNode(graph, at::prim::ListConstruct,
                             replacement->outputs());
}

} // namespace

// Registers the custom operation handler with the canonicalization handler
// table. Declared as a constructor function with priority
// HANDLER_INIT_PRIORITY, so it runs automatically during static
// initialisation.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(symbols::poptorch::custom_operation, customOpHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/DistanceOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <ATen/ExpandUtils.h>

#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace {

torch::jit::Node *pairwiseDistanceHandler(torch::jit::Graph *graph,
                                          torch::jit::Node *node) {
  // aten::pairwise_distance(Tensor x1, Tensor x2, float p, float eps,
  //                         bool keepdim)
  //
  // Lowered as norm(x1 - x2 + eps, p, 1, keepdim), reusing the handler
  // registered for aten::norm.
  torch::jit::Value *lhs = node->input(0);
  torch::jit::Value *const rhs = node->input(1);
  // Norm degree
  torch::jit::Value *const norm_degree = node->input(2);
  // Small value to avoid division by zero
  torch::jit::Value *const epsilon = node->input(3);
  // Whether to keep the vector dimension
  torch::jit::Value *const keep_dims = node->input(4);

  std::vector<std::int64_t> lhs_shape = shapeFromTensor(lhs);

  // A 1-d x1 has no batch dimension: prepend one, (D) -> (1, D), so that the
  // norm below can always reduce over dimension 1.
  const bool lhs_is_unbatched = lhs_shape.size() == 1;
  if (lhs_is_unbatched) {
    lhs_shape.insert(lhs_shape.begin(), 1);
    lhs = createUnsqueeze(graph, {lhs}, {0})->output();
  }

  // x1 - x2 + eps, with its sizes pinned to the (possibly unsqueezed) shape
  // of x1.
  torch::jit::Value *const difference = createSub(graph, {lhs, rhs})->output();
  torch::jit::Value *const shifted_difference =
      createAdd(graph, {difference, epsilon})->output();
  shifted_difference->setType(
      shifted_difference->type()->expect<c10::TensorType>()->withSizes(
          lhs_shape));

  // The norm handler expects its dims as a ListConstruct, here the list [1].
  torch::jit::Value *const reduce_dim = wrapInConstant1D(graph, 1);
  torch::jit::Node *const reduce_dims =
      createAndInsertNode(graph, c10::prim::ListConstruct, {reduce_dim});

  // norm(x1 - x2 + eps, p, 1, keepdim)
  torch::jit::Node *distance = createHandlerOperation(
      graph, getHandler(c10::aten::norm),
      {shifted_difference, norm_degree, reduce_dims->output(), keep_dims});

  if (!lhs_is_unbatched) {
    return distance;
  }

  // The batch dimension was added artificially, so the norm output carries an
  // extra leading dimension compared to what torch returns: reshape to the
  // shape recorded on the original node's output.
  return createReshape(graph, distance->output(),
                       shapeFromTensor(node->output(0)));
}

torch::jit::Node *cosineSimilarityHandler(torch::jit::Graph *graph,
                                          torch::jit::Node *node) {
  // aten::cosine_similarity(const Tensor& x1, const Tensor& x2, int64_t dim,
  //                         double eps)
  //
  // Computes sum(x1 * x2, dim) / max(sqrt(sum(x1^2, dim) * sum(x2^2, dim)),
  // eps). The reductions are created with a keepdims argument of 0.
  torch::jit::Value *const lhs = node->input(0);
  torch::jit::Value *const rhs = node->input(1);
  const std::vector<std::int64_t> reduce_axes{
      constantToLong(node->input(2)->node())};
  torch::jit::Value *const epsilon = node->input(3);

  // Numerator: dot product of x1 and x2 along `dim`.
  torch::jit::Value *const product = createMul(graph, {lhs, rhs})->output();
  torch::jit::Value *const dot =
      createReducesum(graph, {product}, reduce_axes, 0)->output();

  // Denominator: product of the two magnitudes, clamped from below by eps so
  // that the division is well defined.
  torch::jit::Value *const lhs_sq_norm =
      createReducesumsquare(graph, {lhs}, reduce_axes, 0)->output();
  torch::jit::Value *const rhs_sq_norm =
      createReducesumsquare(graph, {rhs}, reduce_axes, 0)->output();
  torch::jit::Value *const sq_norm_product =
      createMul(graph, {lhs_sq_norm, rhs_sq_norm})->output();
  torch::jit::Value *const norm_product =
      createSqrt(graph, {sq_norm_product})->output();
  torch::jit::Value *const safe_norm_product =
      createMax(graph, {norm_product, epsilon})->output();

  return createDiv(graph, {dot, safe_norm_product});
}

// Lowers aten::cdist / aten::_cdist_forward.
//
// Both inputs are reshaped so that their second-to-last dimensions end up on
// different axes (x1: [batch..., P, 1, M], x2: [batch..., 1, R, M]), they are
// subtracted (relying on broadcasting) and the handler registered for
// aten::norm is applied over the last dimension with degree `p` and a keepdim
// constant of 0.
torch::jit::Node *cdistHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  // Input 1
  auto *const x1 = node->input(0);
  // Input 2
  auto *const x2 = node->input(1);
  // Norm degree
  auto *const p_degree = node->input(2);

  const std::vector<std::int64_t> x1_shape = shapeFromTensor(x1);
  const std::vector<std::int64_t> x2_shape = shapeFromTensor(x2);

  const auto ndim_x1 = x1_shape.size();
  const auto ndim_x2 = x2_shape.size();

  // Target shapes for the reshapes below. They are built back to front: the
  // last dimension first, then the second-to-last one together with a
  // singleton axis, and finally the batch dimensions.
  std::vector<std::int64_t> x1_shape_expanded;
  std::vector<std::int64_t> x2_shape_expanded;

  // Last dimension of each input is kept as is.
  if (ndim_x1 > 0) {
    const auto m = x1_shape.at(ndim_x1 - 1);
    x1_shape_expanded.push_back(m);
  }
  if (ndim_x2 > 0) {
    const auto m = x2_shape.at(ndim_x2 - 1);
    x2_shape_expanded.push_back(m);
  }

  // x1: [..., P, M] -> [..., P, 1, M]
  if (ndim_x1 > 1) {
    const auto p = x1_shape.at(ndim_x1 - 2);
    x1_shape_expanded.insert(x1_shape_expanded.begin(), {p, 1});
  }

  // x2: [..., R, M] -> [..., 1, R, M]
  if (ndim_x2 > 1) {
    const auto r = x2_shape.at(ndim_x2 - 2);
    x2_shape_expanded.insert(x2_shape_expanded.begin(), {1, r});
  }

  // Batch dimensions: everything except the last two dimensions.
  std::vector<std::int64_t> b_x1;
  std::vector<std::int64_t> b_x2;

  if (ndim_x1 > 2) {
    b_x1 = {x1_shape.begin(), x1_shape.end() - 2};
  }

  if (ndim_x2 > 2) {
    b_x2 = {x2_shape.begin(), x2_shape.end() - 2};
  }

  if (b_x1 != b_x2) {
    // Pads `batch_shape` with singleton dimensions so that it has the same
    // rank as `inferred_size` (the broadcast of both batch shapes).
    // `inferred_size` is walked from its last dimension to its first one and
    // `cnt` counts how many trailing dimensions of `batch_shape` have been
    // consumed so far.
    const auto get_broadcasted_batch_shape =
        [](const std::vector<int64_t> &batch_shape,
           const std::vector<int64_t> &inferred_size) {
          if (batch_shape == inferred_size) {
            return batch_shape;
          }

          std::vector<std::int64_t> broadcasted_shape;

          const auto batch_shape_size = batch_shape.size();

          std::for_each(
              inferred_size.crbegin(), inferred_size.crend(),
              [cnt = 0u, batch_shape_size, &broadcasted_shape,
               &batch_shape](const auto &inferred_value) mutable {
                // All the dimensions of batch_shape have been consumed: the
                // remaining leading dimensions are singletons.
                if (cnt >= batch_shape_size) {
                  broadcasted_shape.insert(broadcasted_shape.begin(), 1);
                  return;
                }

                const auto batch_shape_value =
                    batch_shape.at(batch_shape_size - cnt - 1);

                if (inferred_value != batch_shape_value &&
                    batch_shape_value != 1) {
                  // The current batch dimension does not correspond to this
                  // inferred dimension: emit a singleton and keep the batch
                  // dimension for a later inferred dimension.
                  broadcasted_shape.insert(broadcasted_shape.begin(), 1);
                } else {
                  // Either the sizes match or the batch dimension is already
                  // a singleton: keep it and move on to the next one.
                  broadcasted_shape.insert(broadcasted_shape.begin(),
                                           batch_shape_value);
                  ++cnt;
                }
              });

          return broadcasted_shape;
        };

    // Broadcast of both batch shapes, as computed by ATen.
    const std::vector<int64_t> expand_batch_portion =
        at::infer_size(b_x1, b_x2);

    b_x1 = get_broadcasted_batch_shape(b_x1, expand_batch_portion);
    b_x2 = get_broadcasted_batch_shape(b_x2, expand_batch_portion);
  }

  // Prepend the (possibly padded) batch dimensions.
  x1_shape_expanded.insert(x1_shape_expanded.begin(), b_x1.cbegin(),
                           b_x1.cend());
  x2_shape_expanded.insert(x2_shape_expanded.begin(), b_x2.cbegin(),
                           b_x2.cend());

  auto *x1_expanded = createReshape(graph, x1, x1_shape_expanded)->output();
  auto *x2_expanded = createReshape(graph, x2, x2_shape_expanded)->output();

  auto *x1_minus_x2 = createSub(graph, {x1_expanded, x2_expanded})->output();
  // The norm handler receives its dims as a ListConstruct, here the list [-1]
  // (the last dimension).
  auto *dims = createAndInsertNode(graph, c10::prim::ListConstruct,
                                   {wrapInConstant1D(graph, -1)})
                   ->output();
  // keepdim argument of the norm: a constant holding the value 0.
  auto *keepdim = createConstantLong(graph, {0}, {1})->output();

  return createHandlerOperation(graph, getHandler(c10::aten::norm),
                                {x1_minus_x2, p_degree, dims, keepdim});
}

} // namespace

// Registers the distance handlers of this file with the canonicalization
// handler table. Declared as a constructor function with priority
// HANDLER_INIT_PRIORITY, so it runs automatically during static
// initialisation.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::pairwise_distance, pairwiseDistanceHandler);
  registerHandler(c10::aten::cosine_similarity, cosineSimilarityHandler);
  // Both cdist entry points share the same handler.
  registerHandler(c10::aten::cdist, cdistHandler);
  registerHandler(c10::aten::_cdist_forward, cdistHandler);
}
} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/DropoutOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace {

torch::jit::Node *dropoutHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  // Inputs: 0 = tensor, 1 = drop probability, 2 = training flag.
  // Outside of training, dropout is a no-op and becomes an identity.
  torch::jit::Value *input = node->input(0);
  const auto drop_probability = constantToFloat(node->input(1)->node());
  const bool is_training = constantToBool(node->input(2)->node());

  if (is_training) {
    return createDropout(graph, {input}, 1, drop_probability);
  }
  return createIdentity(graph, {input});
}

torch::jit::Node *featureDropoutHandler(torch::jit::Graph *graph,
                                        torch::jit::Node *node) {
  // Inputs: 0 = tensor, 1 = drop ratio, 2 = training flag.
  torch::jit::Value *features = node->input(0);
  const float drop_ratio = constantToFloat(node->input(1)->node());
  const bool is_training = constantToBool(node->input(2)->node());

  // Outside of training, dropout is a no-op and becomes an identity.
  if (!is_training) {
    return createIdentity(graph, {features});
  }

  // Feature dropout assumes that the input represents a map of features laid
  // out as N x C x (feature shape), so at least the N and C dimensions must
  // be present.
  std::vector<int64_t> drop_shape = shapeFromTensor(features);
  ERROR_ON_MSG(drop_shape.size() < 2,
               "Feature dropout requires at least 2 dimensions in the input");

  // The dropout mask has shape N x C followed by as many singleton dimensions
  // as needed to meet the broadcast requirement.
  for (std::size_t axis = 2; axis < drop_shape.size(); ++axis) {
    drop_shape[axis] = 1;
  }

  return createShapeddropout(graph, {features}, drop_shape, drop_ratio);
}

} // namespace

// Registers the dropout handlers of this file with the canonicalization
// handler table. Declared as a constructor function with priority
// HANDLER_INIT_PRIORITY, so it runs automatically during static
// initialisation.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::feature_dropout, featureDropoutHandler);
  registerHandler(c10::aten::dropout, dropoutHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/EinsumOp.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "EinsumOp.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {

// Parses the einsum equation `eq` and records everything needed to lower it:
// the per-operand labels, the order in which each label first appears on the
// lhs, how many operands use each label and which labels survive in the
// output (rhs).
//
// eq      - einsum equation, with or without an explicit "->rhs" part.
// tensors - operands, one per comma separated group of labels on the lhs.
//
// Raises an error if the number of operands does not match the number of
// label groups, or if the rhs uses a label that is absent from the lhs.
EinsumOp::EinsumOp(std::string eq,
                   const std::vector<torch::jit::Value *> &tensors) {
  _tensors = tensors;
  // Remove all whitespace in equation
  eq.erase(std::remove(eq.begin(), eq.end(), ' '), eq.end());
  _lhs = eq;

  auto pos = eq.find("->");
  if (pos != std::string::npos) {
    _lhs = eq.substr(0, pos);
    // Add 2 to exclude arrow
    _rhs = eq.substr(pos + 2);
  }

  // Split lhs into labels using ',' delimiter
  std::stringstream ss(_lhs);
  std::string s;
  while (std::getline(ss, s, ',')) {
    _labels.push_back(s);
  }
  ERROR_ON(_labels.size() != _tensors.size());

  // Number the labels in order of first appearance and count in how many
  // positions each of them is still to be consumed.
  for (const auto &label : _labels) {
    for (char c : label) {
      if (_lhs_char_indices.find(c) == _lhs_char_indices.end()) {
        _lhs_char_indices[c] = _ordered_chars.size();
        _char_counts_seen[c] = 0;
        _char_counts_remaining[c] = 1;
        _ordered_chars.push_back(c);
      } else {
        _char_counts_remaining[c]++;
      }
    }
  }
  // Shared rank of tensors during multiplication
  _n_dims = _ordered_chars.size();

  // Calculate implicit rhs according to classical einstein summation
  if (pos == std::string::npos) {
    std::copy_if(_ordered_chars.begin(), _ordered_chars.end(),
                 std::back_inserter(_rhs),
                 [&](char c) { return _char_counts_remaining[c] == 1; });
    // Must be alphabetical in this case
    std::sort(_rhs.begin(), _rhs.end());
  }

  _rdims_bs.resize(_n_dims);
  _bdims_bs.resize(_n_dims);
  _rhs_bs.resize(_n_dims);
  for (char c : _rhs) {
    // Looking up an unknown label with operator[] would silently register it
    // as dimension 0 of the lhs, so reject it explicitly instead.
    ERROR_ON_MSG(_lhs_char_indices.find(c) == _lhs_char_indices.end(),
                 "Einsum output label does not appear in any input operand");
    _rhs_bs[_lhs_char_indices[c]] = true;
    // Read the size before inserting: whether `m[c] = m.size()` evaluates
    // size() before or after the insertion is only well defined since C++17.
    const std::size_t rhs_index = _rhs_char_indices.size();
    _rhs_char_indices[c] = rhs_index;
  }
  // All characters must be present in the map but only the indices of rhs
  // characters matter
  for (char c : _lhs) {
    _rhs_char_indices.emplace(c, 0);
  }
}

// Emits the nodes implementing the parsed equation and returns the node
// producing the final result, reshaped to `output_shape`.
//
// graph        - graph the new nodes are inserted into.
// output_shape - expected shape of the einsum result.
torch::jit::Node *
EinsumOp::create(torch::jit::Graph *graph,
                 const std::vector<std::int64_t> &output_shape) {
  canonicalizeTensors(graph);

  torch::jit::Node *output = nullptr;

  // One tensor means summation or transpose is applied
  if (_tensors.size() == 1) {
    if (_lhs.size() > _rhs.size()) {
      // Sum over every dimension whose label is absent from the rhs. The
      // reduced dimensions are kept as singletons.
      std::vector<std::int64_t> axes;

      for (std::size_t i = 0; i < _n_dims; i++) {
        if (!_rhs_bs[i]) {
          axes.push_back(static_cast<std::int64_t>(i));
        }
      }
      output = createReducesum(graph, {_tensors[0]}, axes, 1);

      // The remaining dimensions are still in lhs order: permute them to the
      // order requested by the rhs (e.g. "ijk->kj"). The final reshape only
      // drops the singleton dimensions, it cannot reorder data.
      const std::vector<std::int64_t> p_rhs =
          sortedPermutation(_rhs_char_indices, _ordered_chars);
      output = createTranspose(graph, {output->output()}, p_rhs);
    } else {
      std::vector<std::int64_t> p_lhs =
          sortedPermutation(_rhs_char_indices, _labels[0]);

      output = createTranspose(graph, {_tensors[0]}, p_lhs);
    }
  } else {
    updateCharCounts(_labels[0]);
    // Base output
    output = _tensors[0]->node();
    // Build product from left to right
    for (std::size_t i = 1; i < _tensors.size(); i++) {
      output = createProduct(graph, output->output(), _tensors[i], _labels[i]);
    }
    output = permuteOutput(graph, output->output());
  }

  // Remove reduced single dimensions by reshaping
  return createReshape(graph, output->output(), output_shape);
}

// Multiplies x1 and x2 (two tensors of the same rank, see the rank check
// below) as a batched matrix multiplication, using the current contents of
// _rdims_bs (dimensions to reduce) and _bdims_bs (batch dimensions).
//
// The returned node produces a tensor with the batch dimensions first,
// followed by the other dimensions in their original relative order, with
// reduced dimensions kept as singletons.
torch::jit::Node *EinsumOp::tensordotBmm(torch::jit::Graph *graph,
                                         torch::jit::Value *x1,
                                         torch::jit::Value *x2) const {
  const std::vector<std::int64_t> shape_x1 = shapeFromTensor(x1);
  const std::vector<std::int64_t> shape_x2 = shapeFromTensor(x2);
  ERROR_ON(shape_x1.size() != shape_x2.size());

  // Products of the sizes of the reduce dims and of the batch dims: they
  // become the flattened reduce / batch dimensions of the matmul operands.
  std::int64_t rdims_prod = 1;
  std::int64_t bdims_prod = 1;
  for (std::size_t i = 0; i < _n_dims; i++) {
    if (_rdims_bs[i]) {
      if (shape_x1[i] == shape_x2[i]) {
        rdims_prod *= shape_x1[i];
      } else if (shape_x1[i] == 1) {
        // Only x2 has an extent along this reduce dim: sum it out up front
        // (keeping a singleton) so that both operands agree again.
        x2 = createReducesum(graph, {x2}, {static_cast<std::int64_t>(i)}, 1)
                 ->output();
      } else if (shape_x2[i] == 1) {
        // Same as above with the roles of x1 and x2 swapped.
        x1 = createReducesum(graph, {x1}, {static_cast<std::int64_t>(i)}, 1)
                 ->output();
      }
    }
    if (_bdims_bs[i]) {
      bdims_prod *= shape_x1[i];
    }
  }

  // Partitions existing permutation vector p according to bitset bs. If
  // should_partition_front == true, elements of p are moved to the front
  // if the corresponding bool in bs == true. Otherwise, they are moved to
  // the back. The relative order of other elements must not change.
  auto fn_partition = [&](auto &p, const auto &bs,
                          bool should_partition_front) {
    std::stable_partition(p.begin(), p.end(), [&](std::int64_t n) {
      return bs[n] == should_partition_front;
    });
  };

  // Original permutation
  std::vector<std::int64_t> p1(_n_dims);
  std::iota(p1.begin(), p1.end(), 0);
  std::vector<std::int64_t> p2 = p1;

  // Cast the reduction to a batch matrix multiplication by permuting input
  // dimensions and reshaping to ensure there is one batch dimension and
  // one reduce (dot product) dimension.

  // Permute x1 so that rdims are the last dims
  fn_partition(p1, _rdims_bs, false);
  // Permute again so that bdims are the first dims
  fn_partition(p1, _bdims_bs, true);
  torch::jit::Node *p_x1 = createTranspose(graph, {x1}, p1);
  // Reshape to (bdims_prod, -1, rdims_prod)
  torch::jit::Node *p_x1_bmat =
      createReshape(graph, p_x1->output(), {bdims_prod, -1, rdims_prod});

  // Permute x2 so that rdims are the first dims
  fn_partition(p2, _rdims_bs, true);
  // Permute again so that bdims are the first dims and rdims follow
  fn_partition(p2, _bdims_bs, true);
  torch::jit::Node *p_x2 = createTranspose(graph, {x2}, p2);
  // Reshape to (bdims_prod, rdims_prod, -1)
  torch::jit::Node *p_x2_bmat =
      createReshape(graph, p_x2->output(), {bdims_prod, rdims_prod, -1});

  // Matmul -> (bdims_prod, unreduced_x1, unreduced_x2)
  torch::jit::Node *mm =
      createMatmul(graph, {p_x1_bmat->output(), p_x2_bmat->output()});

  // Shape used to unflatten the matmul result: the batch dims first ...
  std::vector<std::int64_t> new_shape;
  for (std::size_t i = 0; i < _n_dims; i++) {
    if (_bdims_bs[i]) {
      new_shape.push_back(shape_x1[i]);
    }
  }
  // ... followed by every other dim, the reduced ones being singletons.
  for (std::size_t i = 0; i < _n_dims; i++) {
    if (_rdims_bs[i]) {
      new_shape.push_back(1);
    } else if (!_bdims_bs[i]) {
      // If not a batch dim or reduce dim, at least one dim == 1
      // so we can multiply to get the right result
      new_shape.push_back(shape_x1[i] * shape_x2[i]);
    }
  }

  // Restore flattened dims
  return createReshape(graph, mm->output(), new_shape);
}

// Rewrites every operand so that its dimensions follow the order in which the
// labels first appear on the lhs, with a singleton dimension inserted for
// each label the operand does not use. _tensors is updated in place.
void EinsumOp::canonicalizeTensors(torch::jit::Graph *graph) {
  const std::size_t num_chars = _ordered_chars.size();

  for (std::size_t operand = 0; operand < _tensors.size(); operand++) {
    const std::string &label = _labels[operand];
    torch::jit::Value *tensor = _tensors[operand];
    std::vector<std::int64_t> original_shape = shapeFromTensor(tensor);

    // Permutation sorting the operand's dims by lhs label order.
    std::vector<std::int64_t> permutation =
        sortedPermutation(_lhs_char_indices, label);

    // Shape of the operand once permuted.
    std::vector<std::int64_t> canonical_shape;
    canonical_shape.reserve(num_chars);
    for (const auto source_dim : permutation) {
      canonical_shape.push_back(original_shape[source_dim]);
    }

    // TODO(T60456): Implement diagonals whenever ai.onnx.EyeLike is implemented
    //               in PopART

    // Add a singleton dim at the position of every label this operand lacks.
    for (std::size_t pos = 0; pos < num_chars; pos++) {
      const bool label_is_missing =
          label.find(_ordered_chars[pos]) == std::string::npos;
      if (label_is_missing) {
        canonical_shape.insert(canonical_shape.begin() + pos, 1);
      }
    }

    // Permute and reshape
    torch::jit::Value *permuted =
        createTranspose(graph, {tensor}, permutation)->output();
    _tensors[operand] =
        createReshape(graph, permuted, canonical_shape)->output();
  }
}

// Transposes `output` with a single permutation that combines:
//  1) moving the batch dims (which come first) back to their lhs positions
//  2) reordering the dims as requested by the rhs
torch::jit::Node *EinsumOp::permuteOutput(torch::jit::Graph *graph,
                                          torch::jit::Value *output) const {
  // Label order with the batch dims first; both groups keep their relative
  // order.
  std::vector<char> batch_first_chars;
  batch_first_chars.reserve(_ordered_chars.size());
  for (const char c : _ordered_chars) {
    if (_bdims_bs[_lhs_char_indices.at(c)]) {
      batch_first_chars.push_back(c);
    }
  }
  for (const char c : _ordered_chars) {
    if (!_bdims_bs[_lhs_char_indices.at(c)]) {
      batch_first_chars.push_back(c);
    }
  }

  // Permutation moving the batch dims back to their original locations.
  const std::vector<std::int64_t> to_lhs_order =
      sortedPermutation(_lhs_char_indices, batch_first_chars);

  // Permutation from lhs order to the order specified by the rhs.
  const std::vector<std::int64_t> to_rhs_order =
      sortedPermutation(_rhs_char_indices, _ordered_chars);

  // Compose both permutations.
  std::vector<std::int64_t> combined;
  combined.reserve(to_rhs_order.size());
  for (const auto d : to_rhs_order) {
    combined.push_back(to_lhs_order[d]);
  }

  return createTranspose(graph, {output}, combined);
}

// Marks every character of `label` as consumed once more: its "seen" count
// goes up by one and its "remaining" count goes down by one.
void EinsumOp::updateCharCounts(const std::string &label) {
  for (const char consumed : label) {
    ++_char_counts_seen[consumed];
    --_char_counts_remaining[consumed];
  }
}

// Multiplies the running product `lhs` by the next operand `rhs`, whose
// labels are `rhs_label`. The reduce / batch dimension sets are recomputed
// for this step before delegating to tensordotBmm.
torch::jit::Node *EinsumOp::createProduct(torch::jit::Graph *graph,
                                          torch::jit::Value *lhs,
                                          torch::jit::Value *rhs,
                                          const std::string &rhs_label) {
  updateCharCounts(rhs_label);

  for (std::size_t dim = 0; dim < _n_dims; dim++) {
    const char label = _ordered_chars[dim];

    // A dim can be reduced now only if it is absent from the equation's rhs
    // and no later operand uses it.
    const bool reduce_now =
        !_rhs_bs[dim] && _char_counts_remaining[label] == 0;
    _rdims_bs[dim] = reduce_now;

    // A dim that is kept and has been seen in more than one operand so far is
    // a batch dim.
    _bdims_bs[dim] = !reduce_now && _char_counts_seen[label] > 1;
  }

  return tensordotBmm(graph, lhs, rhs);
}
} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/EinsumOp.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <torch/csrc/jit/ir/ir.h>

#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

namespace poptorch {
// Helper which lowers an einsum equation into a chain of pairwise products.
//
// The public surface is the constructor (equation string + operand values)
// and create(), which emits the nodes into a graph. All other members are
// bookkeeping shared between the private steps declared below.
//
// NOTE(review): sortedPermutation() uses std::iota, but this header's own
// include list does not contain <numeric>; it presumably arrives transitively
// through the torch header - confirm.
class EinsumOp {
public:
  // eq: the einsum equation; tensors: the operands it refers to.
  EinsumOp(std::string eq, const std::vector<torch::jit::Value *> &tensors);

  // Emit the nodes implementing the einsum into 'graph' and return a node.
  // NOTE(review): presumably the returned node produces a tensor of shape
  // 'output_shape' - the definition is not visible here, confirm.
  torch::jit::Node *create(torch::jit::Graph *graph,
                           const std::vector<std::int64_t> &output_shape);

private:
  // A modified version of tensordot that handles batch dimensions and takes
  // two tensors of the same rank that have been unsqueezed (if necessary) to
  // match. The output is of the same rank. Batch dims always appear first in
  // the output to allow chaining.
  torch::jit::Node *tensordotBmm(torch::jit::Graph *graph,
                                 torch::jit::Value *x1,
                                 torch::jit::Value *x2) const;

  // Get permute indices of 's' according to the order specified by char_indices
  //
  // Returns the positions 0..s.size()-1 sorted by char_indices.at(s[pos]), in
  // increasing order. Every element of 's' must be a key of 'char_indices':
  // the lookup uses at(), which throws std::out_of_range otherwise.
  template <typename T>
  std::vector<std::int64_t>
  sortedPermutation(const std::unordered_map<char, std::size_t> &char_indices,
                    const T &s) const {
    // Start from the identity permutation...
    std::vector<std::int64_t> p(s.size());
    std::iota(p.begin(), p.end(), 0);
    // ...and order the positions by the index their character maps to.
    std::sort(p.begin(), p.end(), [&](auto d1, auto d2) {
      return char_indices.at(s[d1]) < char_indices.at(s[d2]);
    });
    return p;
  }

  // Ensure all tensors have same number of dims that are in the same order -
  // The order in which they appear in the lhs
  void canonicalizeTensors(torch::jit::Graph *graph);

  // Combines the following permutations into a single permutation:
  // 1) Permuting batch dims to their original locations
  // 2) Permuting to the order specified by the rhs
  torch::jit::Node *permuteOutput(torch::jit::Graph *graph,
                                  torch::jit::Value *output) const;

  // Updates char counts used to calculate reduce dims and batch dims
  void updateCharCounts(const std::string &label);

  // Multiplies 'lhs' by 'rhs': updates the char counts with 'rhs_label',
  // recomputes the reduce / batch bitsets and calls tensordotBmm().
  torch::jit::Node *createProduct(torch::jit::Graph *graph,
                                  torch::jit::Value *lhs,
                                  torch::jit::Value *rhs,
                                  const std::string &rhs_label);

  // Operand values of the einsum.
  std::vector<torch::jit::Value *> _tensors;
  // NOTE(review): presumably the two sides of the equation (operand labels
  // and output labels) - confirm against the constructor.
  std::string _lhs, _rhs;
  // NOTE(review): presumably one label string per operand - confirm.
  std::vector<std::string> _labels;
  // Number of dimensions iterated over when classifying reduce / batch dims.
  std::size_t _n_dims;
  // List of characters ordered as seen from left to right. This
  // is the order of dims during the multiply/reduce stage
  std::vector<char> _ordered_chars;
  // Used to determine whether a non-reduce dimension should be
  // considered a batch dimension during calculation
  std::unordered_map<char, int> _char_counts_seen;
  // Number of times a character appears in future operands -
  // used to determine whether a dimension should be reduced
  std::unordered_map<char, int> _char_counts_remaining;
  // Mapping of each character to the index in which it appears
  // in the intermediate tensor shape
  std::unordered_map<char, std::size_t> _lhs_char_indices;
  // Mapping of each character to the index in which it appears
  // in the output shape
  std::unordered_map<char, std::size_t> _rhs_char_indices;
  // Bitset indicating dimensions to be reduced
  std::vector<bool> _rdims_bs;
  // Bitset indicating batch dimensions
  std::vector<bool> _bdims_bs;
  // Bitset indicating dimensions that appear in rhs
  std::vector<bool> _rhs_bs;
}; // class EinsumOp
} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/EmbeddingOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <spdlog/fmt/fmt.h>
#include <spdlog/fmt/ostr.h>

#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include <ATen/ATen.h>

namespace poptorch {
namespace {

// Lowers aten::embedding either to a plain gather (no padding index) or to
// the "Embedding" custom op which is told about the padding index.
torch::jit::Node *embeddingHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  // aten::embedding(Tensor weight, Tensor indices, int padding_idx, bool
  // scale_grad_by_freq, bool sparse) -> Tensor

  // Neither of these two options is implemented.
  const bool scale_grad_by_freq = constantToBool(node->input(3)->node());
  const bool sparse = constantToBool(node->input(4)->node());
  ERROR_ON_MSG(scale_grad_by_freq || sparse,
               "Unsupported aten::embedding operation");

  torch::jit::Value *table = node->input(0);
  torch::jit::Value *lookup = node->input(1);
  const auto padding_idx = constantToLong(node->input(2)->node());

  if (padding_idx >= 0) {
    // The padding index is forwarded to the custom op as a small JSON blob.
    const std::string attributes =
        fmt::format("{{\"padding_idx\":{}}}", padding_idx);
    torch::jit::Node *custom_op =
        createCustomOperation(graph, {table, lookup}, "Embedding",
                              "poptorch.custom_ops", 1, 1, attributes);

    // The result has the scalar type of the weights; no shape is recorded.
    const auto weight_dtype = getNodeScalarType(table);
    custom_op->output(0)->setType(c10::TensorType::create(
        weight_dtype, c10::nullopt, c10::nullopt, c10::nullopt));
    return custom_op;
  }

  // A negative padding_idx (the default is -1) means "no padding": the
  // embedding is then just a gather along the first dimension.
  return createGather(graph, {table, lookup}, 0);
}

// Lowers aten::embedding_bag by unrolling it: the (constant) offsets are used
// to slice one bag of indices at a time, each bag is gathered from the weight
// matrix, optionally scaled by the per-sample weights, reduced along
// dimension 0 and finally all bags are concatenated.
torch::jit::Node *embeddingBagHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  // aten::embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool
  // scale_grad_by_freq, int mode, bool sparse, Tensor per_sample_weights, bool
  // include_last_offset, int? padding_idx) -> Tensor

  const bool scale_grad_by_freq = constantToBool(node->input(3)->node());
  const bool sparse = constantToBool(node->input(5)->node());
  torch::jit::Value *padding_idx = node->input(8);

  ERROR_ON_MSG(scale_grad_by_freq || sparse,
               "Unsupported aten::embedding_bag operation");

  // Only "no padding" is supported: either None or a negative value.
  if (!isNone(padding_idx)) {
    const auto padding_idx_val = constantToInt(padding_idx->node());
    ERROR_ON_MSG(padding_idx_val >= 0,
                 "Unsupported aten::embedding_bag operation: padding_idx "
                 "parameter is unsupported.");
  }

  // aten::embedding_bag has 4 outputs but only the first one is used: drop
  // the others, last one first, so the node matches what is produced here.
  for (auto num_outputs = node->outputs().size(); num_outputs > 1;
       --num_outputs) {
    node->eraseOutput(num_outputs - 1);
  }

  torch::jit::Value *weight = node->input(0);
  torch::jit::Value *indices = node->input(1);
  torch::jit::Value *offsets = node->input(2);
  const int64_t mode = constantToLong(node->input(4)->node());
  torch::jit::Value *per_sample_weights = node->input(6);
  const bool include_last_offset = constantToBool(node->input(7)->node());

  // Reduce one gathered bag along dimension 0 (keeping the dimension):
  // mode 0 -> sum, mode 1 -> mean, anything else -> max.
  const auto reduce_bag = [mode](torch::jit::Graph *g,
                                 torch::jit::Value *v) -> torch::jit::Value * {
    switch (mode) {
    case 0:
      return createReducesum(g, {v}, {0}, 1)->output();
    case 1:
      return createReducemean(g, {v}, {0}, 1)->output();
    default:
      return createReducemax(g, {v}, {0}, 1)->output();
    }
  };

  // The bags are unrolled at compile time so the offsets must be known.
  ERROR_ON_MSG(!isTensorConstant(offsets->node()),
               "Unsupported aten::embedding_bag operation: offsets tensor must "
               "be a constant.");
  auto offsets_tensor = getNodeTensorAttrValue(offsets->node());

  if (!include_last_offset) {
    // Append INT_MAX to use as the last offset slice
    offsets_tensor = at::cat({offsets_tensor, at::tensor(INT_MAX)});
  }

  const auto bounds = offsets_tensor.accessor<int32_t, 1>();
  const bool weighted = !isNone(per_sample_weights);
  const int64_t num_bags = offsets_tensor.size(0) - 1;

  torch::jit::value_list reduced_bags;
  for (int64_t bag_idx = 0; bag_idx < num_bags; ++bag_idx) {
    // Bag 'bag_idx' covers indices[begin:end].
    const auto begin = bounds[bag_idx];
    const auto end = bounds[bag_idx + 1];

    torch::jit::Value *bag_indices =
        createSlice(graph, {indices}, {end}, {begin}, {0})->output();
    torch::jit::Value *rows =
        createGather(graph, {weight, bag_indices}, 0)->output();

    if (weighted) {
      // Scale every gathered row by its per-sample weight.
      torch::jit::Value *scale =
          createSlice(graph, {per_sample_weights}, {end}, {begin}, {0})
              ->output();
      scale = createUnsqueeze(graph, {scale}, {1})->output();
      rows = createMul(graph, {rows, scale})->output();
    }

    reduced_bags.push_back(reduce_bag(graph, rows));
  }

  return createConcat(graph, reduced_bags, 0);
}

// Lowers aten::one_hot to the one-hot builder, with the class count supplied
// as a constant depth and {0, 1} as the "cold" / "hot" values.
torch::jit::Node *onehotHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  torch::jit::Value *labels = node->input(0);

  // -1 is rejected: the number of classes has to be given explicitly.
  const std::int64_t num_classes = constantToLong(node->input(1)->node());
  ERROR_ON_MSG(num_classes == -1,
               "OneHot num classes must be specified and must be constant.");

  // The "hot/cold" values for the one hot representation.
  torch::jit::Value *hot_cold = createConstantInt(graph, {0, 1}, {2})->output();

  // Scalar constant holding the size of the one-hot dimension.
  torch::jit::Value *depth =
      createConstantInt(graph, {num_classes}, {})->output();

  return createOnehot(graph, {labels, depth, hot_cold}, -1);
}

} // namespace

// Runs automatically before main() (constructor attribute) and registers the
// handlers defined in this file against their aten symbols.
// NOTE(review): HANDLER_INIT_PRIORITY presumably orders this after the
// handler registry itself has been initialised - confirm in
// PoptorchStaticInit.hpp.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::embedding, embeddingHandler);
  // Both the public and the underscore-prefixed embedding_bag symbols share
  // the same handler.
  registerHandler(c10::aten::embedding_bag, embeddingBagHandler);
  registerHandler(c10::aten::_embedding_bag, embeddingBagHandler);
  registerHandler(c10::aten::one_hot, onehotHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/IndexOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

#include <ATen/InferSize.h>
#include <torch/csrc/jit/ir/ir.h>

#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "ScatterReduction.hpp"

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace {

// Result of processIndex(): the operands of the final gather / scatter.
struct IndexInfo {
  // The indexed tensor reshaped to [-1, u1, ..., uN], where the u's are the
  // trailing dimensions which are not indexed into.
  torch::jit::Value *x_partial_flat;
  // The combined indices addressing the first (flattened) dimension of
  // x_partial_flat.
  torch::jit::Value *indices_partial_flat;
};

// Returns 'shape' extended with 'pad' dimensions of size 1, placed before the
// original dimensions when 'pad_front' is true and after them otherwise.
std::vector<std::int64_t> padShape(const std::vector<std::int64_t> &shape,
                                   std::size_t pad, bool pad_front) {
  // Start with a shape made only of ones, then overwrite the part which
  // belongs to the original dimensions.
  std::vector<std::int64_t> padded(shape.size() + pad, 1);
  const std::size_t first_original_dim = pad_front ? pad : 0;
  std::copy(shape.begin(), shape.end(),
            padded.begin() +
                static_cast<std::ptrdiff_t>(first_original_dim));
  return padded;
}

// Turns a list of (optional) index tensors into the operands of a single
// gather / scatter along dimension 0.
//
// graph:     graph in which the helper nodes are created.
// x:         tensor being indexed.
// p_indices: one entry per leading dimension of 'x'. 'None' entries mean
//            "the whole dimension" and are replaced in place by explicit
//            index constants; the other entries are replaced by reshaped
//            versions which broadcast against each other.
//
// Returns 'x' flattened over its indexed dimensions together with the
// combined indices into that flattened dimension.
//
// Errors if the index list is empty or longer than the rank of 'x'.
IndexInfo processIndex(torch::jit::Graph *graph, torch::jit::Value *x,
                       std::vector<torch::jit::Value *> *p_indices) {
  auto &indices = *p_indices;
  auto shape = shapeFromTensor(x);

  // The code below reads the last index and shape[i] for every index: fail
  // explicitly rather than read out of bounds.
  ERROR_ON_MSG(indices.empty(), "Indexing requires at least one index.");
  ERROR_ON_MSG(indices.size() > shape.size(),
               "Too many indices for the tensor being indexed.");

  std::size_t pad = 0;
  std::vector<std::int64_t> index_shape;
  bool indexed = false;
  bool pad_front = true;
  // Calculate the final index size with which the gather operation will be
  // performed
  for (torch::jit::Value *index : indices) {
    if (isNone(index)) {
      // A None which follows a real index means the padding dimensions go
      // after the index dimensions rather than before them.
      if (indexed) {
        pad_front = false;
      }
      pad++;
    } else {
      // Keep the shape of the index tensor with the highest rank.
      auto s = shapeFromTensor(index);
      if (s.size() > index_shape.size()) {
        index_shape = s;
      }
      indexed = true;
    }
  }
  std::size_t const index_size = index_shape.size();
  std::vector<std::int64_t> flat_indices_shape =
      padShape(index_shape, pad, pad_front);

  std::size_t nones_indexed = 0;
  // Reshape each tensor into shape broadcastable with final output shape
  for (std::size_t i = 0; i < indices.size(); i++) {
    if (isNone(indices[i])) {
      // Optional tensors: 'None' means indexing over entire dimension
      // Replace each None tensor with its explicit index representation
      std::vector<std::int64_t> idx(shape[i]);
      std::iota(idx.begin(), idx.end(), 0);

      // All ones, except for the dimension reserved for this None.
      std::vector<std::int64_t> new_shape(index_size + pad, 1);
      auto final_shape_index =
          pad_front ? nones_indexed : index_size + nones_indexed;
      new_shape[final_shape_index] = shape[i];
      flat_indices_shape[final_shape_index] = shape[i];
      nones_indexed++;

      indices[i] =
          createReshape(graph, intVectorToIrConstant(graph, idx), new_shape)
              ->output();
    } else {
      const auto original_shape = shapeFromTensor(indices[i]);
      const std::vector<std::int64_t> new_shape =
          padShape(original_shape, pad, pad_front);

      indices[i] = createReshape(graph, indices[i], new_shape)->output();
    }
  }

  // Combine the per-dimension indices into a single row-major index, starting
  // from the innermost indexed dimension.
  auto *flat_indices = indices[indices.size() - 1];
  std::int64_t stride = shape[indices.size() - 1];
  // Calculate indices within partially flattened shape
  // Tensors are automatically broadcast to the correct shape during calculation
  for (auto i = 1u; i < indices.size(); i++) {
    auto *index = indices[indices.size() - i - 1];
    auto *offset =
        createMul(graph, {index, wrapInConstant1D(graph, stride)})->output();
    flat_indices = createAdd(graph, {flat_indices, offset})->output();
    stride *= shape[indices.size() - i - 1];
  }
  // Retain the shape for downstream calculation
  flat_indices =
      createReshape(graph, flat_indices, flat_indices_shape)->output();

  std::vector<std::int64_t> flatten_shape = {-1};
  std::copy_n(shape.begin() + indices.size(), shape.size() - indices.size(),
              std::back_inserter(flatten_shape));
  // Flatten the tensor being indexed into [-1, u1, u2, ..., uN] where
  // each u is a dimension not being indexed into

  // The initial value fixes the accumulator type: seed with a 64 bit one so
  // the element count is not accumulated in (and truncated to) int.
  const int64_t num_elems =
      std::accumulate(shape.cbegin(), shape.cend(), std::int64_t{1},
                      std::multiplies<int64_t>());

  auto *flatten =
      createReshape(graph, x, at::infer_size(flatten_shape, num_elems));

  return {flatten->output(), flat_indices};
}

// Lowers aten::index to one gather over the partially flattened input.
torch::jit::Node *indexHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  // aten::index(Tensor self, Tensor?[] indices)
  torch::jit::Value *self = node->input(0);
  std::vector<torch::jit::Value *> index_list =
      handleTensorList(node->input(1)->node());

  // Collapse the indexed dimensions of 'self' into one and combine the index
  // tensors into indices addressing that single dimension.
  const IndexInfo flattened = processIndex(graph, self, &index_list);

  // The lookup is then a gather along the first dimension.
  return createGather(
      graph, {flattened.x_partial_flat, flattened.indices_partial_flat}, 0);
}

// Returns true if the index_put described by 'indices' is a masked
// assignment: exactly one index tensor of type bool or byte.
//
// When it returns true, indices[0] has been replaced by a mask which can be
// used directly in a "where": cast to bool if it was a byte tensor and
// reshaped with trailing singleton dimensions up to the rank of 'x'.
bool isMaskedAssign(torch::jit::Graph *graph, torch::jit::Value *x,
                    std::vector<torch::jit::Value *> &indices) {
  // Masked fill only takes one index tensor which is broadcastable
  // with the input
  if (indices.size() != 1) {
    return false;
  }
  torch::jit::Value *&mask = indices[0];

  const auto mask_type = mask->type()->expect<c10::TensorType>();
  ERROR_ON(!mask_type->scalarType().has_value());
  const auto dtype = mask_type->scalarType().value();

  // Masks must be of type bool or byte
  const bool is_bool = dtype == c10::ScalarType::Bool;
  const bool is_byte = dtype == c10::ScalarType::Byte;
  if (!is_bool && !is_byte) {
    return false;
  }

  // The shapes are read before the mask is replaced by any new node.
  auto mask_shape = shapeFromTensor(mask);
  const auto x_shape = shapeFromTensor(x);

  // popart::where expects a bool tensor mask so cast if necessary
  if (is_byte) {
    mask = createCast(graph, mask, c10::ScalarType::Bool)->output();
  }

  // Pad indices to enable broadcasting
  const bool needs_padding = mask_shape.size() < x_shape.size();
  if (needs_padding) {
    mask_shape.resize(x_shape.size(), 1);
    mask = createReshape(graph, mask, mask_shape)->output();
  }
  return true;
}

// Returns the position of the only non-None index if the indexing can be
// vectorised along that dimension, std::nullopt otherwise.
//
// Vectorisation requires exactly one index tensor, of an integral
// (non-bool) type, which is one dimensional and does not hold exactly one
// element.
std::optional<std::int32_t>
canVectorizeInDim(std::vector<torch::jit::Value *> &indices) {
  std::optional<std::int32_t> vectorized_dim;
  std::int32_t next_position = 0;

  for (torch::jit::Value *candidate : indices) {
    const std::int32_t position = next_position++;

    if (isNone(candidate)) {
      continue;
    }

    // Already found a valid dim but additional indices are specified so
    // cannot vectorise this case.
    if (vectorized_dim.has_value()) {
      return std::nullopt;
    }

    const auto tensor_type = candidate->type()->expect<c10::TensorType>();
    ERROR_ON(!tensor_type->scalarType().has_value());
    const auto dtype = tensor_type->scalarType().value();

    if (!isIntegralType(dtype, false)) {
      return std::nullopt;
    }
    if (tensor_type->dim() != 1) {
      return std::nullopt;
    }
    if (tensor_type->numel() == 1) {
      return std::nullopt;
    }

    vectorized_dim = position;
  }

  return vectorized_dim;
}

// Makes an index_put on a slice behave as if it were performed "in-place".
//
// Slices are tensor views in torch, so index_put_ on a slice should modify
// the tensor being sliced. This is simulated by redirecting every use of the
// sliced tensor which comes after 'node' to the output of 'out'. Nothing is
// done when the first input of 'node' is not produced by a slice.
void applyInplaceSlice(torch::jit::Node *node, torch::jit::Node *out) {
  torch::jit::Node *producer = node->input(0)->node();
  if (producer->kind() != symbols::popart::slice) {
    return;
  }

  // Walk up through any chain of slices until reaching the tensor which is
  // actually being sliced.
  torch::jit::Value *sliced = producer->input(0);
  while (sliced->node()->kind() == symbols::popart::slice) {
    sliced = sliced->node()->input(0);
  }

  sliced->replaceAllUsesAfterNodeWith(node, out->output());
}

// Lowers aten::index_put using one of three strategies:
//  1. masked assignment (single bool / byte index)   -> where
//  2. single 1D integral index along one dimension   -> scatterreduce with
//     the "none" reduction
//  3. everything else                                -> scatter over the
//     fully flattened input
//
// If the tensor being written to is a slice, the uses of the sliced tensor
// are redirected to the result (see applyInplaceSlice).
//
// NOTE(review): the 'accumulate' input (input 3) is never read here, so
// accumulate=True is presumably treated like accumulate=False - confirm
// whether this is intended.
torch::jit::Node *indexPutHandler(torch::jit::Graph *graph,
                                  torch::jit::Node *node) {
  // aten::index_put(Tensor self, Tensor?[] indices, Tensor value, bool
  //                  accumulate)
  torch::jit::Value *x = node->input(0);
  std::vector<torch::jit::Value *> indices =
      handleTensorList(node->input(1)->node());
  torch::jit::Value *v = node->input(2);

  if (isMaskedAssign(graph, x, indices)) {
    return createWhere(graph, {indices[0], v, x});
  }

  // Creates a detached None constant, used to pad the list of indices.
  const auto fn_gen_none = [graph]() {
    torch::jit::Value *none = graph->create(c10::prim::Constant)->output();
    none->setType(c10::NoneType::get());
    return none;
  };
  const auto shape = shapeFromTensor(x);
  const auto vectorized_dim = canVectorizeInDim(indices);
  const auto v_shape = shapeFromTensor(v);
  if (vectorized_dim) {
    logging::trace(
        "Using vectorized ScatterReduce with none reduction in dim {}",
        *vectorized_dim);
    // Expand the value tensor to match the input if necessary
    if (v_shape.size() < shape.size()) {
      auto new_shape = shape;
      // In the vectorised case, the index will always be a 1D tensor
      new_shape[*vectorized_dim] = shapeFromTensor(indices[*vectorized_dim])[0];
      v = createExpand(graph, {v, intVectorToIrConstant(graph, new_shape)})
              ->output();
    }
    static constexpr auto none_reduce =
        static_cast<std::int32_t>(ScatterReduction::None);
    static constexpr bool enable_index_broadcast = true;
    // The result must have the shape of 'x', so the size of the scattered
    // axis is the size of 'x' along that axis (not along dimension 0).
    auto *out = createScatterreduce(graph, {v, indices[*vectorized_dim], x},
                                    shape[*vectorized_dim], *vectorized_dim,
                                    enable_index_broadcast, none_reduce);
    applyInplaceSlice(node, out);
    return out;
  }

  // ONNX Scatter cannot assign entire dimensions, only individual elements, so
  // we must pad the end of indices with NoneTypes so that the entire input is
  // flattened during indexing
  std::generate_n(std::back_inserter(indices), shape.size() - indices.size(),
                  fn_gen_none);

  IndexInfo info = processIndex(graph, x, &indices);

  // Element counts are accumulated in 64 bits: the type of the initial value
  // is the type of the accumulator.
  auto indices_shape = shapeFromTensor(info.indices_partial_flat);
  const std::int64_t indices_size =
      std::accumulate(indices_shape.begin(), indices_shape.end(),
                      std::int64_t{1}, std::multiplies<std::int64_t>{});

  // Ensure value tensor can be broadcast with indexing result
  if (v_shape.size() < indices_shape.size()) {
    v = createReshape(graph, v, {1, -1})->output();
    const std::int64_t v_size =
        std::accumulate(v_shape.begin(), v_shape.end(), std::int64_t{1},
                        std::multiplies<std::int64_t>{});
    // Repeat v to match indices shape
    v = createExpand(graph, {v, intVectorToIrConstant(
                                    graph, {indices_size / v_size, 1})})
            ->output();
  }
  info.indices_partial_flat =
      createReshape(graph, info.indices_partial_flat, {indices_size})->output();
  v = createReshape(graph, v, {indices_size})->output();

  // Scatter in first dimension using calculated indices into fully flattened
  // tensor
  auto *scatter = createScatterElements(
      graph, {info.x_partial_flat, info.indices_partial_flat, v}, 0);
  // Restore original input shape
  auto *out = createReshape(graph, scatter->output(), shape);
  applyInplaceSlice(node, out);
  return out;
}

// Lowers aten::index_fill by rewriting it as an index_put: the index list is
// [None] * dim + [index] and the fill value is used as the value to put.
//
// A negative 'dim' counts from the last dimension, as in PyTorch. Errors if
// 'dim' is out of range for the rank of 'self'.
torch::jit::Node *indexFillHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  // aten::index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar
  // value) -> Tensor aten::index_fill.int_Tensor(Tensor self, int dim, Tensor
  // index, Tensor value) -> Tensor
  auto *self = node->input(0);
  auto dim = constantToLong(node->input(1)->node());

  // Without this, a negative dim would generate no None padding at all and
  // the fill would silently be applied to dimension 0. A 0-dimensional tensor
  // is treated as having one dimension for the purpose of wrapping.
  const auto self_rank =
      static_cast<std::int64_t>(shapeFromTensor(self).size());
  const std::int64_t wrap_rank = self_rank > 0 ? self_rank : 1;
  if (dim < 0) {
    dim += wrap_rank;
  }
  ERROR_ON_MSG(dim < 0 || dim >= wrap_rank,
               "aten::index_fill: dimension is out of range for the input.");

  auto *index = node->input(2);
  auto *value = node->input(3);
  // The fill value must have the same type as the tensor being filled.
  auto self_dtype = getNodeScalarType(self);
  if (getNodeScalarType(value) != self_dtype) {
    value = createCast(graph, value, self_dtype)->output();
  }

  // Create Tensor?[] indices, where indices[dim] = index, and indices[d] =
  // None, where d < dim
  std::vector<torch::jit::Value *> indices;
  auto fn_gen_none = [graph]() {
    auto *none = graph->createNone();
    insertNodeInGraph(graph, none);
    return none->output();
  };
  std::generate_n(std::back_inserter(indices), dim, fn_gen_none);
  indices.push_back(index);
  auto *list = createAndInsertNode(graph, c10::prim::ListConstruct, indices);
  // index_fill overwrites, so 'accumulate' is false.
  auto *accumulate = createConstantInt(graph, {0}, {});

  // Re-use index_put handler
  auto index_put_handler = getHandler(c10::aten::index_put);
  return createHandlerOperation(
      graph, index_put_handler,
      {self, list->output(), value, accumulate->output()});
}

} // namespace

// Runs automatically before main() (constructor attribute) and registers the
// indexing handlers defined in this file against their aten symbols.
// NOTE(review): HANDLER_INIT_PRIORITY presumably orders this after the
// handler registry itself has been initialised - confirm in
// PoptorchStaticInit.hpp.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::index, indexHandler);
  // index_fill looks this handler up through getHandler() and re-uses it.
  registerHandler(c10::aten::index_put, indexPutHandler);
  registerHandler(c10::aten::index_fill, indexFillHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/LossOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "../PoptorchSymbols.hpp"
#include "poptorch/DispatchTracer.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace {
// Lowers aten::binary_cross_entropy to elementwise ops followed by an
// identity loss applying the requested reduction.
//
// With L = loss, w = weight, y = target, x = input:
//   L = - w * (y * log(x) + (1 - y) * log(1 - x))
// The multiplication by w is skipped when no weight is provided.
torch::jit::Node *binaryCrossEntropyHandler(torch::jit::Graph *graph,
                                            torch::jit::Node *node) {
  // aten::binary_cross_entropy(Tensor input, Tensor target,
  //                            Tensor? weight, int reduction)

  // The input.
  torch::jit::Value *x = node->input(0);

  // The target.
  torch::jit::Value *y = node->input(1);

  // Optional weight term.
  torch::jit::Value *weight = node->input(2);

  // Loss reduction.
  std::int64_t reduction = constantToLong(node->input(3)->node());

  // Convert to popart reduce values.
  reduction = convertReduceToPopart(reduction);

  // Add the one constant
  torch::jit::Node *one = createConstantFloatLike(graph, x, {1.0}, {});

  torch::jit::Node *log_x = createLog(graph, {x});

  // Log(x)*y
  torch::jit::Node *log_x_mul_y = createMul(graph, {y, log_x->output()});

  // Do (1 - x) and (1 - y)
  torch::jit::Node *one_minus_x = createSub(graph, {one->output(), x});
  torch::jit::Node *one_minus_y = createSub(graph, {one->output(), y});

  // Log(1 - x)
  torch::jit::Node *log_one_minus_x =
      createLog(graph, {one_minus_x->output()});

  // (1 -y)*Log(1 - x)
  torch::jit::Node *subs_multiplied =
      createMul(graph, {one_minus_y->output(), log_one_minus_x->output()});

  // Log(x)*y + (1 -y)*Log(1 - x)
  torch::jit::Node *add_terms =
      createAdd(graph, {log_x_mul_y->output(), subs_multiplied->output()});

  torch::jit::Node *final_node = add_terms;

  // Test for "no weight" the same way the other loss handlers do, so that a
  // weight which happens to be a constant is still applied.
  if (!isNone(weight)) {
    final_node = createMul(graph, {add_terms->output(), weight});
  }

  final_node = createNeg(graph, {final_node->output()});

  return createIdentityloss(graph, {final_node->output()}, reduction);
}

// Lowers the nll_loss family of ops to popart::nllloss followed by an
// identity loss applying the requested reduction.
//
// popart::nllloss is given a 2D input (samples, classes) and a 1D target, so
// inputs of higher rank are transposed and flattened first. The optional
// weight parameter is not supported. Errors if the input has fewer than two
// dimensions.
torch::jit::Node *nllLossNdHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  // aten::nll_loss2d(Tensor input, Tensor target, Tensor? weight,
  // int reduction, int ignore_index) -> Tensor

  // aten::nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int
  // reduction, int ignore_index) -> (Tensor output, Tensor total_weight)

  // aten::nll_loss2d() is implemented based on popart:nllloss().
  // Suppose the input[0] has the shape of (N, C, M, K)
  // input[0] will be transposed with perm [0, 2, 3, 1],
  //   and reshaped with (N * M * K, C), pushing C to the last dimension.
  // input[1] will be reshaped to (N * M * K), before calling nllloss.
  // The generated IRs are as follows:
  // %37 : Tensor = popart::transpose[perm=[0, 2, 3, 1]](%35)
  // %38 : Tensor(500:4, 4:1) = popart::reshape_static_shape[shape=[500,4]](%37)
  // %39 : Int(500:1) = popart::reshape_static_shape[shape=[500]](%25)
  // %40 : Float() = popart::nllloss[reduction=1, ignoreIndex=-100](%38, %39)

  // aten::nll_loss_nd(Tensor input, Tensor target, Tensor? weight, int
  // reduction, int ignore_index) -> Tensor
  std::int64_t reduction = constantToLong(node->input(3)->node());
  std::int64_t ignore_index = constantToLong(node->input(4)->node());

  reduction = convertReduceToPopart(reduction);

  torch::jit::Value *input = node->input(0);
  torch::jit::Value *target = node->input(1);
  torch::jit::Value *weight = node->input(2);
  // TODO(T42695): Support optional weight parameter
  ERROR_ON_MSG(!isNone(weight),
               "Parameter \"weight\" is unsupported for aten::nll_loss_nd");
  std::vector<std::int64_t> shape_input = shapeFromTensor(input);
  std::vector<std::int64_t> shape_target = shapeFromTensor(target);

  // The flattening below reads shape_input[1] and builds a permutation which
  // needs at least two entries: fail explicitly instead of reading and
  // writing out of bounds.
  ERROR_ON_MSG(shape_input.size() < 2,
               "aten::nll_loss_nd: the input must have at least 2 dimensions");

  if (shape_input.size() != 2) {
    // Input shape: (N, C, d1, d2, ..., dk)
    // Target shape: (N, d1, d2, ..., dk)

    // Suppose the input has the shape of (N, C, M, K)
    // The input will be transposed with perm [0, 2, 3, 1],
    //   and reshaped with (N * M * K, C), pushing C to the last dimension.
    // The target will be reshaped to (N * M * K), before calling nllloss

    std::int64_t c = shape_input[1];
    // Seed with a 64 bit value: the type of the initial value is the type of
    // the accumulator, so an int seed would truncate large element counts.
    std::int64_t flat =
        std::accumulate(shape_target.begin(), shape_target.end(),
                        std::int64_t{1}, std::multiplies<int64_t>{});

    // Create an input permutation of (0, 2, 3, ..., N, 1)
    std::vector<int64_t> p(shape_input.size(), 0);
    std::iota(p.begin() + 1, p.end() - 1, 2);
    p[p.size() - 1] = 1;

    // Permute the class dimension to the end
    torch::jit::Node *perm = createTranspose(graph, {input}, p);

    input = createReshape(graph, perm->output(), {flat, c})->output();
    target = createReshape(graph, target, {flat})->output();
  }

  torch::jit::Node *loss =
      createNllloss(graph, {input, target}, reduction, ignore_index,
                    /*inputIsLogProbability=*/true);

  if (reduction == 2) {
    // If "none" reduction, return the results with target's original shape
    loss = createReshape(graph, loss->output(), shape_target);
  }

  return createIdentityloss(graph, {loss->output()}, reduction);
}

// Lowers aten::cross_entropy_loss as a log_softmax over dimension 1 followed
// by the nll loss handler. The optional weight parameter is not supported.
torch::jit::Node *crossEntropyLossHandler(torch::jit::Graph *graph,
                                          torch::jit::Node *node) {
  // aten::cross_entropy_loss(Tensor self, Tensor target, Tensor? weight, int
  // reduction, int ignore_index)
  torch::jit::Value *logits = node->input(0);
  torch::jit::Value *target = node->input(1);
  torch::jit::Value *weight = node->input(2);
  // TODO(T42695): Support optional weight parameter
  ERROR_ON_MSG(
      !isNone(weight),
      "Parameter \"weight\" is unsupported for aten::cross_entropy_loss");
  torch::jit::Value *reduction = node->input(3);
  torch::jit::Value *ignore_index = node->input(4);

  // Step 1: log_softmax along the class dimension.
  auto log_softmax_handler = getHandler(c10::aten::log_softmax);
  auto *class_dim = wrapInConstant1D(graph, 1);
  torch::jit::Node *log_probs =
      createHandlerOperation(graph, log_softmax_handler, {logits, class_dim});

  // logSoftmaxHandler loses shape information required by nllLossNdHandler,
  // so the type of the input, which is identical, is copied over.
  log_probs->output()->setType(logits->type());

  // Step 2: negative log likelihood of the log-probabilities.
  return createHandlerOperation(
      graph, nllLossNdHandler,
      {log_probs->output(), target, weight, reduction, ignore_index});
}

// Lowers aten::kl_div to elementwise ops followed by an identity loss
// applying the requested reduction.
//
// With x = input and y = target (as probabilities):
//   L = y * (log(y) - x)   where y > 0
//   L = 0                  elsewhere
torch::jit::Node *klDivHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  // aten::kl_div(Tensor self, Tensor target, int reduction, bool log_target)

  torch::jit::Value *input = node->input(0);
  torch::jit::Value *target = node->input(1);

  // Reduction, converted to the popart reduce values.
  const std::int64_t reduction =
      convertReduceToPopart(constantToLong(node->input(2)->node()));

  // Whether the target is passed as log-probabilities
  const bool log_target = constantToBool(node->input(3)->node());

  // Make both the target and its logarithm available, whichever form the
  // target was provided in.
  torch::jit::Value *log_target_value = nullptr;
  if (log_target) {
    log_target_value = target;
    target = createExp(graph, {target})->output();
  } else {
    log_target_value = createLog(graph, {target})->output();
  }

  // log(y) - x
  auto *difference = createSub(graph, {log_target_value, input});

  // y(log(y) - x)
  auto *pointwise = createMul(graph, {target, difference->output()});

  // Replace by zero wherever y <= 0, where log(y) is not meaningful.
  auto *zeros = createConstantFloatLike(graph, target, {0}, {});
  auto *is_positive = createGreater(graph, {target, zeros->output()});
  auto *masked = createWhere(
      graph, {is_positive->output(), pointwise->output(), zeros->output()});

  return createIdentityloss(graph, {masked->output()}, reduction);
}

// Lowers aten::poisson_nll_loss to elementwise ops followed by an identity
// loss applying the requested reduction.
//
// With x = input (as a rate) and y = target:
//   L = x - y * log(x)
// and, when 'full' is set, the Stirling approximation term
//   y * log(y) - y + 0.5 * log(2 * PI * y)
// is added wherever y > 1.
torch::jit::Node *poissonNllLossHandler(torch::jit::Graph *graph,
                                        torch::jit::Node *node) {
  // aten::poisson_nll_loss(Tensor input, Tensor target, bool log_input,
  //                        bool full, float eps, int reduction)

  torch::jit::Value *input = node->input(0);
  torch::jit::Value *target = node->input(1);
  // Whether the input is passed as log-probabilities
  const bool log_input = constantToBool(node->input(2)->node());
  // Whether to compute full loss using Stirling approximation
  const bool full = constantToBool(node->input(3)->node());
  // Added to avoid log(0) when log_input == false
  torch::jit::Value *epsilon = node->input(4);

  // Reduction, converted to the popart reduce values.
  const std::int64_t reduction =
      convertReduceToPopart(constantToLong(node->input(5)->node()));

  // Make both the input and its logarithm available, whichever form the
  // input was provided in.
  torch::jit::Value *log_input_value = nullptr;
  if (log_input) {
    log_input_value = input;
    input = createExp(graph, {input})->output();
  } else {
    torch::jit::Value *shifted = createAdd(graph, {input, epsilon})->output();
    log_input_value = createLog(graph, {shifted})->output();
  }

  // y log(x)
  auto *target_log_input = createMul(graph, {target, log_input_value});

  // x - y log(x)
  torch::jit::Node *loss =
      createSub(graph, {input, target_log_input->output()});

  if (full) {
    // y log(y) - y
    auto *log_target = createLog(graph, {target});
    auto *target_log_target = createMul(graph, {target, log_target->output()});
    auto *entropy_term =
        createSub(graph, {target_log_target->output(), target});

    // 0.5 log(2*PI*y)
    auto *two_pi = createConstantFloatLike(graph, input, {2 * M_PI}, {});
    auto *two_pi_target = createMul(graph, {two_pi->output(), target});
    auto *log_two_pi_target = createLog(graph, {two_pi_target->output()});
    auto *half = createConstantFloatLike(graph, input, {0.5}, {});
    auto *half_log =
        createMul(graph, {half->output(), log_two_pi_target->output()});

    // y log(y) - y + 0.5 log(2*PI*y)
    auto *stirling =
        createAdd(graph, {entropy_term->output(), half_log->output()});

    // Approximation values only added for target values > 1
    const std::vector<std::int64_t> target_shape = shapeFromTensor(target);
    auto *ones = createConstantFloatLike(graph, input, {1}, target_shape);
    auto *above_one = createGreater(graph, {target, ones->output()});
    auto *zeros = createConstantFloatLike(graph, input, {0}, target_shape);
    auto *masked_stirling = createWhere(
        graph, {above_one->output(), stirling->output(), zeros->output()});

    // x - y log(x) + y log(y) - y + 0.5 log(2*PI*y)
    loss = createAdd(graph, {loss->output(), masked_stirling->output()});
  }

  return createIdentityloss(graph, {loss->output()}, reduction);
}

// Lowers aten::binary_cross_entropy_with_logits to elementwise ops followed
// by an identity loss applying the requested reduction.
//
// With x = input, y = target, p = pos_weight, w = weight and m = max(-x, 0):
//   l_p = (p - 1) y + 1                  (taken as 1 when p is None)
//   L   = w [(1 - y) x + l_p (m + log(exp(-m) + exp(-x - m)))]
// where the multiplication by w is skipped when w is None.
torch::jit::Node *bceWithLogitsHandler(torch::jit::Graph *graph,
                                       torch::jit::Node *node) {
  // aten::binary_cross_entropy_with_logits(Tensor input, Tensor target,
  //                                        Tensor? weight, Tensor? pos_weight,
  //                                        int reduction)

  torch::jit::Value *input = node->input(0);
  torch::jit::Value *target = node->input(1);
  // Optional weight and optional weight of the positive examples.
  torch::jit::Value *weight = node->input(2);
  torch::jit::Value *pos_weight = node->input(3);

  // Reduction, converted to the popart reduce values.
  const std::int64_t reduction =
      convertReduceToPopart(constantToLong(node->input(4)->node()));

  // m = max(-x, 0)
  auto *neg_input = createNeg(graph, {input});
  auto *zeros = createConstantFloatLike(graph, input, {0}, {});
  auto *m = createMax(graph, {neg_input->output(), zeros->output()});

  // exp(-x - m)
  auto *shifted = createSub(graph, {neg_input->output(), m->output()});
  auto *exp_shifted = createExp(graph, {shifted->output()});

  // exp(-m)
  auto *neg_m = createNeg(graph, {m->output()});
  auto *exp_neg_m = createExp(graph, {neg_m->output()});

  // m + log(exp(-m) + exp(-x - m))
  auto *exp_sum =
      createAdd(graph, {exp_neg_m->output(), exp_shifted->output()});
  auto *log_exp_sum = createLog(graph, {exp_sum->output()});
  torch::jit::Node *log_term =
      createAdd(graph, {m->output(), log_exp_sum->output()});

  auto *ones = createConstantFloatLike(graph, input, {1}, {});

  if (!isNone(pos_weight)) {
    // l_p = (p - 1) y + 1
    auto *p_minus_one = createSub(graph, {pos_weight, ones->output()});
    auto *scaled_target = createMul(graph, {p_minus_one->output(), target});
    auto *l_p = createAdd(graph, {scaled_target->output(), ones->output()});

    // l_p (m + log(exp(-m) + exp(-x - m)))
    log_term = createMul(graph, {l_p->output(), log_term->output()});
  }

  // (1 - y) x + l_p (m + log(exp(-m) + exp(-x - m)))
  auto *one_minus_target = createSub(graph, {ones->output(), target});
  auto *linear_term = createMul(graph, {one_minus_target->output(), input});
  torch::jit::Node *loss =
      createAdd(graph, {linear_term->output(), log_term->output()});

  if (!isNone(weight)) {
    // w [(1 - y) x + l_p (m + log(exp(-m) + exp(-x - m)))]
    loss = createMul(graph, {weight, loss->output()});
  }

  return createIdentityloss(graph, {loss->output()}, reduction);
}

// TODO(T30688): Unsupported since the PyTorch implementation doesn't
//               currently use this aten function
// Builds the multi-label soft margin loss out of element-wise ops:
//   l = -(y log(sigmoid(x)) + (1 - y) log(sigmoid(-x)))
// multiplied by the weight when one is supplied, then wrapped in an identity
// loss that carries the converted reduction.
torch::jit::Node *multiLabelSoftMarginLossHandler(torch::jit::Graph *graph,
                                                  torch::jit::Node *node) {
  // aten::multilabel_soft_margin_loss(Tensor input, Tensor target,
  //                                   Tensor? weight, int reduction)
  torch::jit::Value *input = node->input(0);
  torch::jit::Value *target = node->input(1);
  torch::jit::Value *weight = node->input(2);

  // PyTorch reduction value -> PopART reduction value
  const std::int64_t torch_reduction = constantToLong(node->input(3)->node());
  const std::int64_t reduction = convertReduceToPopart(torch_reduction);

  // log_sigmoid is lowered through its own registered handler
  auto log_sigmoid = getHandler(c10::aten::log_sigmoid);

  // log(sigmoid(-x))
  torch::jit::Node *neg_input = createNeg(graph, {input});
  torch::jit::Node *log_sig_neg_input =
      createHandlerOperation(graph, log_sigmoid, {neg_input->output()});

  // 1 - y
  torch::jit::Node *one = createConstantFloatLike(graph, input, {1}, {});
  torch::jit::Node *target_complement =
      createSub(graph, {one->output(), target});

  // (1 - y) log(sigmoid(-x))
  torch::jit::Node *negative_term = createMul(
      graph, {target_complement->output(), log_sig_neg_input->output()});

  // y log(sigmoid(x))
  torch::jit::Node *log_sig_input =
      createHandlerOperation(graph, log_sigmoid, {input});
  torch::jit::Node *positive_term =
      createMul(graph, {target, log_sig_input->output()});

  // -(y log(sigmoid(x)) + (1 - y) log(sigmoid(-x)))
  torch::jit::Node *loss =
      createAdd(graph, {positive_term->output(), negative_term->output()});
  loss = createNeg(graph, {loss->output()});

  // Scale by the weight only when the caller provided one
  if (!isNone(weight)) {
    loss = createMul(graph, {weight, loss->output()});
  }

  return createIdentityloss(graph, {loss->output()}, reduction);
}

// Builds the cosine embedding loss:
//   l = 1 - cos(x1, x2)               where y == 1
//   l = max(0, cos(x1, x2) - margin)  where y == -1
// The two masked branches are summed and wrapped in an identity loss that
// carries the converted reduction.
torch::jit::Node *cosineEmbeddingLossHandler(torch::jit::Graph *graph,
                                             torch::jit::Node *node) {
  // aten::cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target,
  //                             float margin, int reduction)
  torch::jit::Value *lhs = node->input(0);
  torch::jit::Value *rhs = node->input(1);
  torch::jit::Value *target = node->input(2);
  torch::jit::Value *margin = node->input(3);

  // PyTorch reduction value -> PopART reduction value
  const std::int64_t torch_reduction = constantToLong(node->input(4)->node());
  const std::int64_t reduction = convertReduceToPopart(torch_reduction);

  // Small constant added to each squared norm before the square root
  torch::jit::Value *eps =
      createConstantFloatLike(graph, lhs, {1e-12}, {})->output();

  // sum(x1 * x2)
  torch::jit::Node *product = createMul(graph, {lhs, rhs});
  torch::jit::Node *dot = createReducesum(graph, {product->output()}, {1}, 0);

  // sq1 = sum_sqr(x1) + eps
  torch::jit::Node *lhs_sq_norm = createReducesumsquare(graph, {lhs}, {1}, 0);
  torch::jit::Node *sq1 = createAdd(graph, {lhs_sq_norm->output(), eps});

  // sq2 = sum_sqr(x2) + eps
  torch::jit::Node *rhs_sq_norm = createReducesumsquare(graph, {rhs}, {1}, 0);
  torch::jit::Node *sq2 = createAdd(graph, {rhs_sq_norm->output(), eps});

  // sqrt(sq1 * sq2)
  torch::jit::Node *sq_product =
      createMul(graph, {sq1->output(), sq2->output()});
  torch::jit::Node *denominator = createSqrt(graph, {sq_product->output()});

  // cos_sim(x1, x2)
  torch::jit::Node *cos_sim =
      createDiv(graph, {dot->output(), denominator->output()});

  // 1 - cos_sim(x1, x2)
  torch::jit::Node *one = createConstantFloatLike(graph, lhs, {1}, {});
  torch::jit::Node *similar_loss =
      createSub(graph, {one->output(), cos_sim->output()});

  // max(0, cos_sim(x1, x2) - margin)
  torch::jit::Node *shifted = createSub(graph, {cos_sim->output(), margin});
  torch::jit::Node *zero = createConstantFloatLike(graph, lhs, {0}, {});
  torch::jit::Node *dissimilar_loss =
      createMax(graph, {zero->output(), shifted->output()});

  // Masks selecting the y == 1 and y == -1 entries
  torch::jit::Node *minus_one = createConstantInt(graph, {-1}, {});
  torch::jit::Node *is_similar = createEqual(graph, {target, one->output()});
  torch::jit::Node *is_dissimilar =
      createEqual(graph, {target, minus_one->output()});

  // Each branch contributes its loss where its mask holds and 0 elsewhere
  torch::jit::Node *similar_part = createWhere(
      graph, {is_similar->output(), similar_loss->output(), zero->output()});
  torch::jit::Node *dissimilar_part =
      createWhere(graph, {is_dissimilar->output(), dissimilar_loss->output(),
                          zero->output()});

  torch::jit::Node *loss =
      createAdd(graph, {similar_part->output(), dissimilar_part->output()});

  return createIdentityloss(graph, {loss->output()}, reduction);
}

// Builds the triplet margin loss
//   l = max(d(a, pos) - d(a, neg) + margin, 0)
// where d is produced by the registered aten::pairwise_distance handler. With
// swap enabled, d(a, neg) is replaced by min(d(a, neg), d(pos, neg)).
torch::jit::Node *tripletMarginLossHandler(torch::jit::Graph *graph,
                                           torch::jit::Node *node) {
  // aten::triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative,
  //                           float margin, float p, float eps, bool swap, int
  //                           reduction)
  torch::jit::Value *anchor = node->input(0);
  torch::jit::Value *positive = node->input(1);
  torch::jit::Value *negative = node->input(2);
  torch::jit::Value *margin = node->input(3);
  // Norm degree and epsilon are forwarded untouched to pairwise_distance
  torch::jit::Value *norm_degree = node->input(4);
  torch::jit::Value *eps = node->input(5);
  const bool swap = constantToBool(node->input(6)->node());

  // keepdim = false, passed as the last pairwise_distance argument
  torch::jit::Value *keepdim = createConstantInt(graph, {0}, {})->output();

  // PyTorch reduction value -> PopART reduction value
  const std::int64_t torch_reduction = constantToLong(node->input(7)->node());
  const std::int64_t reduction = convertReduceToPopart(torch_reduction);

  auto pairwise_distance = getHandler(c10::aten::pairwise_distance);

  // d(a, pos)
  torch::jit::Node *dist_pos = createHandlerOperation(
      graph, pairwise_distance, {anchor, positive, norm_degree, eps, keepdim});
  // d(a, neg)
  torch::jit::Node *dist_neg = createHandlerOperation(
      graph, pairwise_distance, {anchor, negative, norm_degree, eps, keepdim});

  if (swap) {
    // d(a, neg) = min(d(a, neg), d(pos, neg))
    torch::jit::Node *dist_pos_neg =
        createHandlerOperation(graph, pairwise_distance,
                               {positive, negative, norm_degree, eps, keepdim});
    dist_neg = createMin(graph, {dist_neg->output(), dist_pos_neg->output()});
  }

  // d(a, pos) - d(a, neg) + margin
  torch::jit::Node *loss =
      createSub(graph, {dist_pos->output(), dist_neg->output()});
  loss = createAdd(graph, {loss->output(), margin});

  // Clamp from below at zero
  torch::jit::Node *zero = createConstantFloatLike(graph, anchor, {0}, {});
  loss = createMax(graph, {loss->output(), zero->output()});

  return createIdentityloss(graph, {loss->output()}, reduction);
}

// Lowers aten::ctc_loss onto the PopART CTC loss op followed by an identity
// loss. Both the Tensor and the IntList flavours of the length arguments are
// accepted; reduction="none" is rejected.
torch::jit::Node *ctcLossHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  auto *log_probs = node->input(0);
  auto *targets = node->input(1);
  auto *input_lengths = node->input(2);
  auto *target_lengths = node->input(3);
  auto blank = constantToInt(node->input(4)->node());
  auto reduction = constantToLong(node->input(5)->node());
  auto zero_inf = constantToBool(node->input(6)->node());

  ERROR_ON_MSG(reduction == 0,
               "CTCLoss with reduction=\"none\" is currently not supported");

  // Produces the tensor form of a lengths argument expected by
  // popart::_ctcloss.
  const auto fn_lengths_as_tensor = [graph](torch::jit::Value *lengths) {
    if (lengths->type()->kind() == c10::TypeKind::TensorType) {
      // aten::ctc_loss.Tensor
      return createCast(graph, {lengths}, "UINT32")->output();
    }
    // aten::ctc_loss.IntList: materialise the list as a 1D constant
    const auto values = constantToLongVec(lengths->node());
    const std::int64_t shape = values.size();
    return createConstantInt(graph, values, {shape})->output();
  };

  targets = createCast(graph, {targets}, "UINT32")->output();
  input_lengths = fn_lengths_as_tensor(input_lengths);
  target_lengths = fn_lengths_as_tensor(target_lengths);

  // PyTorch reduction value -> PopART reduction value
  reduction = convertReduceToPopart(reduction);
  auto *ctc =
      create_ctcloss(graph, {log_probs, targets, input_lengths, target_lengths},
                     reduction, blank, "UNDEFINED", zero_inf);

  return createIdentityloss(graph, {ctc->output()}, reduction);
}

// Replaces a poptorch::ctc_beam_search_decoder node with the PopART decoder
// op, rewiring all three outputs of the original node to the new one.
torch::jit::Node *ctcbeamsearchdecoderHandler(torch::jit::Graph *graph,
                                              torch::jit::Node *node) {
  auto *log_probs = node->input(0);
  auto *lengths = node->input(1);
  auto blank = constantToInt(node->input(2)->node());
  auto beam_width = constantToInt(node->input(3)->node());
  auto top_paths = constantToInt(node->input(4)->node());

  // The lengths are handed to the decoder as UINT32
  lengths = createCast(graph, {lengths}, "UINT32")->output();
  auto *decoder = createCtcbeamsearchdecoder(graph, {log_probs, lengths}, blank,
                                             beam_width, top_paths);

  // Two extra outputs are appended so that outputs 0..2 exist on the decoder
  decoder->addOutput();
  decoder->addOutput();

  // Redirect every consumer of the old node to the matching decoder output
  for (std::size_t i = 0; i < 3; ++i) {
    node->output(i)->replaceAllUsesWith(decoder->output(i));
  }

  markNodeForDeletion(node);
  return decoder;
}

} // namespace

// Registers the loss handlers. The constructor attribute makes this run
// automatically before main(), at priority HANDLER_INIT_PRIORITY, so no
// explicit call is needed.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  // Every nll_loss variant is served by the same handler
  registerHandler(c10::aten::nll_loss2d, nllLossNdHandler);
  registerHandler(c10::aten::nll_loss2d_forward, nllLossNdHandler);
  registerHandler(c10::aten::nll_loss_nd, nllLossNdHandler);
  registerHandler(c10::aten::nll_loss_forward, nllLossNdHandler);
  registerHandler(c10::aten::binary_cross_entropy, binaryCrossEntropyHandler);
  registerHandler(c10::aten::kl_div, klDivHandler);
  registerHandler(c10::aten::poisson_nll_loss, poissonNllLossHandler);
  registerHandler(c10::aten::binary_cross_entropy_with_logits,
                  bceWithLogitsHandler);
  registerHandler(c10::aten::multilabel_soft_margin_loss,
                  multiLabelSoftMarginLossHandler);
  registerHandler(c10::aten::cosine_embedding_loss, cosineEmbeddingLossHandler);
  registerHandler(c10::aten::triplet_margin_loss, tripletMarginLossHandler);
  registerHandler(c10::aten::ctc_loss, ctcLossHandler);
  // PopTorch-specific symbol rather than an aten one
  registerHandler(symbols::poptorch::ctc_beam_search_decoder,
                  ctcbeamsearchdecoderHandler);
  registerHandler(c10::aten::cross_entropy_loss, crossEntropyLossHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/NormalizationOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "poptorch/ImplicitCasting.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace {

// Creates a constant of the requested shape filled with `value` and stores its
// output in *param. The kind of constant follows the scalar type of `input`:
// Int inputs get an integer constant; Half/Float inputs get a constant that
// matches `input`, or a float32 one when always_f32 is set. Any other scalar
// type raises an error naming the normalisation and the parameter.
void initializeParamConstant(torch::jit::Graph *graph, torch::jit::Value *input,
                             torch::jit::Value **param, float value,
                             const std::vector<int64_t> &shape,
                             const std::string &norm_name,
                             const std::string &input_name,
                             bool always_f32 = false) {
  c10::ScalarType const scalar_type =
      *input->type()->expect<c10::TensorType>()->scalarType();

  if (scalar_type == c10::ScalarType::Int) {
    *param = createConstantInt(graph, {static_cast<int64_t>(value)}, shape)
                 ->output();
    return;
  }

  const bool is_floating_point = scalar_type == c10::ScalarType::Half ||
                                 scalar_type == c10::ScalarType::Float;
  if (is_floating_point) {
    if (always_f32) {
      *param = createConstantFloat32(graph, {value}, shape)->output();
    } else {
      *param = createConstantFloatLike(graph, input, {value}, shape)->output();
    }
    return;
  }

  ERROR(norm_name << " input \"" << input_name << "\""
                  << " of type " << c10::toString(scalar_type)
                  << " not supported");
}

// Fills in default affine parameters (weight = 1, bias = 0) when the caller
// supplied none. Weight and bias must be either both present or both absent.
// Returns true when the defaults were created here, false when the existing
// parameters were left untouched.
bool maybeInitializeAffineParamConstants(torch::jit::Graph *graph,
                                         torch::jit::Value *input,
                                         torch::jit::Value **weight,
                                         torch::jit::Value **bias,
                                         const std::vector<std::int64_t> &shape,
                                         const std::string &norm_name) {
  // Either both should be defined, or neither
  ERROR_ON(isNone(*weight) != isNone(*bias));

  // Decide before touching *weight, since initialisation overwrites it
  const bool needs_defaults = isNone(*weight);
  if (needs_defaults) {
    initializeParamConstant(graph, input, weight, 1, shape, norm_name,
                            "weight");
    initializeParamConstant(graph, input, bias, 0, shape, norm_name, "bias");
  }
  return needs_defaults;
}

// Guarantees that running_mean and running_var refer to real tensors. They may
// be None, e.g. when track_running_stats is False on the PyTorch BatchNorm
// layer; to satisfy popart/onnx a zero constant is then created for
// running_mean and an all-ones constant for running_var. Both must be either
// present or absent together.
void maybeInitializeRunningParamConstants(
    torch::jit::Graph *graph, torch::jit::Value *input,
    torch::jit::Value **running_mean, torch::jit::Value **running_var,
    const std::vector<std::int64_t> &shape) {
  // Either both should be defined, or neither
  ERROR_ON(isNone(*running_mean) != isNone(*running_var));

  if (isNone(*running_mean)) {
    // The constants follow the input type rather than being forced to float32
    initializeParamConstant(graph, input, running_mean, 0, shape, "BatchNorm",
                            "running_mean", /*always_f32=*/false);
    initializeParamConstant(graph, input, running_var, 1, shape, "BatchNorm",
                            "running_var", /*always_f32=*/false);
  }
}

// Lowers aten::batch_norm and aten::native_batch_norm to the PopART batch
// normalisation op. Missing affine and running-statistics inputs are replaced
// by constants, and rank-2 inputs are temporarily given a trailing dimension.
torch::jit::Node *batchNormHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  // aten::batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor?
  // running_mean, Tensor? running_var, bool training, float momentum, float
  // eps, bool cudnn_enabled) -> Tensor

  // aten::native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor?
  // running_mean, Tensor? running_var, bool training, float momentum, float
  // eps) -> (Tensor, Tensor, Tensor)

  torch::jit::Value *input = node->input(0);
  const auto input_shape = shapeFromTensor(input);

  torch::jit::Value *weight = node->input(1);
  torch::jit::Value *bias = node->input(2);
  castWeightAndBias(graph, input, weight, bias);

  torch::jit::Value *running_mean = node->input(3);
  torch::jit::Value *running_var = node->input(4);

  const float momentum = constantToFloat(node->input(6)->node());
  const float epsilon = constantToFloat(node->input(7)->node());
  const bool training = constantToBool(node->input(5)->node());
  const bool is_native = node->kind() == c10::aten::native_batch_norm;

  // All per-channel parameters have one entry per element of dimension 1
  const std::vector<int64_t> param_shape{input_shape[1]};

  maybeInitializeAffineParamConstants(graph, input, &weight, &bias, param_shape,
                                      "BatchNorm");

  // Use initialised constants if running_mean and running_var are none
  maybeInitializeRunningParamConstants(graph, input, &running_mean,
                                       &running_var, param_shape);

  // PyTorch supports an input size of (N, C, *) but PopART requires the spatial
  // dimension, so we must ensure an input size of (N, C, L, *)
  const bool needs_spatial_dim = input_shape.size() == 2;
  if (needs_spatial_dim) {
    input = createUnsqueeze(graph, {input}, {2})->output();
  }

  // To indicate training, for BatchNormalization-9, use num_outputs = 5
  // From ONNX
  // Output case #1: Y, mean, var, saved_mean, saved_var (training mode)
  // Output case #2: Y (test mode)
  // Popart supports this with "if (output->n() > 1)"
  const int num_popart_outputs = training ? 5 : 1;
  // Three outputs are only requested for native_batch_norm in training mode
  const int num_node_outputs = (training && is_native) ? 3 : 1;
  // NOTE(review): the op is given 1 - momentum, presumably because PopART
  // uses the complementary momentum convention - confirm.
  const float popart_momentum = 1.0f - momentum;

  auto *batch_norm = createBatchnormalization(
      graph, {input, weight, bias, running_mean, running_var},
      num_popart_outputs, epsilon, popart_momentum, num_node_outputs);

  // If the input size was of rank 2, we need to squeeze out the added dim
  if (needs_spatial_dim) {
    batch_norm = createSqueeze(graph, {batch_norm->output(0)}, {2});
  }
  return batch_norm;
}

// Lowers aten::layer_norm and aten::native_layer_norm to a single-group
// PopART group normalisation: the input is flattened to [M, N], normalised
// over N, then reshaped back to the shape of the node's first output.
torch::jit::Node *layerNormHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  // aten::layer_norm(Tensor input,int[] normalized_shape, Tensor? weight,
  //                Tensor? bias, float eps, bool cudnn_enable) -> Tensor

  // aten::native_layer_norm(Tensor input, int[] normalized_shape,
  // Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor)
  // NB return tensors match PopART

  // Tensor to normalise.
  torch::jit::Value *input = node->input(0);

  std::vector<std::int64_t> normalized_shape =
      constantToLongVec(node->input(1)->node());

  // Weight to multiply.
  torch::jit::Value *gamma = node->input(2);
  // Bias to add.
  torch::jit::Value *beta = node->input(3);

  castWeightAndBias(graph, input, gamma, beta);

  // Number of elements covered by the affine parameters. The initial value
  // must be 64-bit: std::accumulate accumulates in the type of the initial
  // value, so a plain `1` would narrow the running product to int.
  const std::int64_t numel_affine =
      std::accumulate(normalized_shape.begin(), normalized_shape.end(),
                      static_cast<std::int64_t>(1),
                      std::multiplies<std::int64_t>{});
  const bool initialized = maybeInitializeAffineParamConstants(
      graph, input, &gamma, &beta, {numel_affine}, "LayerNorm");

  if (!initialized) {
    // GroupNorm takes per-channel affine parameters whereas LayerNorm takes
    // elementwise affine parameters. Therefore we first need to reshape such
    // that the affine parameters are "per-channel" which in the case of
    // LayerNorm is equivalent to flattening them
    gamma = createReshape(graph, gamma, {numel_affine})->output();
    beta = createReshape(graph, beta, {numel_affine})->output();
  }

  const float epsilon = constantToFloat(node->input(4)->node());

  // Pytorch normalizes across arbitrary number of dimensions from the end.
  // We flatten into a [M, N] array and normalize the N.

  // (In the event of using native_layer_norm, there will be three outputs.
  // Use only the first.)
  const std::vector<std::int64_t> output_shape =
      shapeFromTensor(node->output(0));

  // The leading (rank - len(normalized_shape)) dimensions form M
  const std::vector<std::int64_t> input_shape = shapeFromTensor(input);
  const std::int64_t axis = input_shape.size() - normalized_shape.size();

  // Flatten into [M, N]
  torch::jit::Node *flatten = createFlatten(graph, {input}, axis);

  // Normalize with a single group.
  torch::jit::Node *normalize = createGroupnormalization(
      graph, {flatten->output(), gamma, beta}, 1, epsilon);

  // Perform the reshape.
  return createReshape(graph, normalize->output(), output_shape);
}

// Lowers aten::group_norm to the PopART group normalisation op, creating
// default per-channel weight/bias constants when none are supplied.
// NOTE(review): an earlier comment said this handler ensures a 4-dimensional
// input to popart; no reshaping happens here - confirm whether that is done
// elsewhere.
torch::jit::Node *groupNormHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  // aten::group_norm(Tensor input, int num_groups, Tensor? weight, Tensor?
  //                  bias, float eps, bool cudnn_enabled)
  torch::jit::Value *input = node->input(0);
  const std::int64_t num_groups = constantToLong(node->input(1)->node());

  // Scale (weight) and shift (bias)
  torch::jit::Value *gamma = node->input(2);
  torch::jit::Value *beta = node->input(3);
  castWeightAndBias(graph, input, gamma, beta);

  // One affine parameter per element of dimension 1
  const std::int64_t num_channels = shapeFromTensor(input)[1];
  maybeInitializeAffineParamConstants(graph, input, &gamma, &beta,
                                      {num_channels}, "GroupNorm");

  const float epsilon = constantToFloat(node->input(4)->node());

  return createGroupnormalization(graph, {input, gamma, beta}, num_groups,
                                  epsilon);
}

// Handler for aten::native_group_norm, whose signature differs from
// aten::group_norm: N, C, HxW and the group count are passed explicitly. The
// explicit sizes are only used to cross-check the input shape.
torch::jit::Node *nativeGroupNormHandler(torch::jit::Graph *graph,
                                         torch::jit::Node *node) {
  // aten::native_group_norm(Tensor input, Tensor? weight, Tensor? bias, int N,
  // int C, int HxW, int group, float eps) -> (Tensor, Tensor, Tensor)

  // Returns are (result, mean, inv_std_dev) which matches PopTorch

  torch::jit::Value *input = node->input(0);

  // Scale (weight) and shift (bias)
  torch::jit::Value *gamma = node->input(1);
  torch::jit::Value *beta = node->input(2);
  castWeightAndBias(graph, input, gamma, beta);

  const auto input_shape = shapeFromTensor(input);

  // One affine parameter per element of dimension 1
  const std::int64_t num_channels = input_shape[1];
  maybeInitializeAffineParamConstants(graph, input, &gamma, &beta,
                                      {num_channels}, "GroupNorm");

  // N, C and HxW are redundant given that the input size must be known for
  // IPU, but provide a useful check
  ERROR_ON(input_shape[0] != constantToLong(node->input(3)->node()));
  ERROR_ON(input_shape[1] != constantToLong(node->input(4)->node()));

  // Product of every dimension after the first two
  const auto hx_w =
      std::accumulate(input_shape.begin() + 2, input_shape.end(),
                      static_cast<int64_t>(1), std::multiplies<int64_t>());
  ERROR_ON(hx_w != constantToLong(node->input(5)->node()));

  const std::int64_t num_groups = constantToLong(node->input(6)->node());
  const float epsilon = constantToFloat(node->input(7)->node());

  return createGroupnormalization(graph, {input, gamma, beta}, num_groups,
                                  epsilon);
}

// Lowers aten::instance_norm to a PopART group normalisation with one group
// per channel. The running statistics and momentum inputs are not used.
torch::jit::Node *instanceNormHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  // aten::instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor?
  //                     running_mean, Tensor? running_var, bool
  //                     use_input_stats, float momentum, float eps, bool
  //                     cudnn_enabled)

  // Tensor to normalise
  // Input: (N, C, L)       InstanceNorm1d
  //        (N, C, H, W)    InstanceNorm2d
  //        (N, C, D, H, W) InstanceNorm3d
  torch::jit::Value *input = node->input(0);

  // Scale (weight) and shift (bias)
  torch::jit::Value *gamma = node->input(1);
  torch::jit::Value *beta = node->input(2);
  castWeightAndBias(graph, input, gamma, beta);

  const std::int64_t num_channels = shapeFromTensor(input)[1];
  maybeInitializeAffineParamConstants(graph, input, &gamma, &beta,
                                      {num_channels}, "InstanceNorm");

  // Group normalization does not currently allow passing a momentum value,
  // nor the running mean or running variance, so only eps is read
  const float epsilon = constantToFloat(node->input(7)->node());

  // Normalize per channel C, so use Group normalization with C groups
  const std::int64_t num_groups = num_channels;
  return createGroupnormalization(graph, {input, gamma, beta}, num_groups,
                                  epsilon);
}
} // namespace

// Registers the normalisation handlers. The constructor attribute makes this
// run automatically before main(), at priority HANDLER_INIT_PRIORITY.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  // The native_* variants of batch_norm and layer_norm reuse the same handler
  // as the plain op; native_group_norm has its own because its signature
  // differs.
  registerHandler(c10::aten::batch_norm, batchNormHandler);
  registerHandler(c10::aten::native_batch_norm, batchNormHandler);
  registerHandler(c10::aten::layer_norm, layerNormHandler);
  registerHandler(c10::aten::native_layer_norm, layerNormHandler);
  registerHandler(c10::aten::group_norm, groupNormHandler);
  registerHandler(c10::aten::native_group_norm, nativeGroupNormHandler);
  registerHandler(c10::aten::instance_norm, instanceNormHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/OtherOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <c10/core/ScalarType.h>

#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "EinsumOp.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "ScatterReduction.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include <ATen/ATen.h>

namespace poptorch {
namespace {

// Lowers aten::bucketize to the bucketize op after promoting the values and
// the boundaries to a common type. The out_int32 argument is not read.
torch::jit::Node *bucketizeHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  // aten::bucketize.Tensor(Tensor self, Tensor boundaries, *,
  // bool out_int32=False, bool right=False) -> Tensor
  torch::jit::Value *values = node->input(0);
  torch::jit::Value *boundaries = node->input(1);

  const auto promoted = poptorch::promoteTensors(graph, values, boundaries);
  const bool right = constantToBool(node->input(3)->node());

  return createBucketize(graph, promoted, right);
}

// Lowers aten::bincount to a sum scatter-reduce of the weights (ones when no
// weights are given) indexed by the input values, with minlength as the size
// of the reduced axis.
torch::jit::Node *bincountHandler(torch::jit::Graph *graph,
                                  torch::jit::Node *node) {
  // aten::bincount(Tensor self, Tensor? weights=None, int minlength=0)
  // -> Tensor
  torch::jit::Value *indices = node->input(0);
  torch::jit::Value *const maybe_weights = node->input(1);
  torch::jit::Value *const minlength = node->input(2);
  const int64_t num_bins = constantToLong(minlength->node());

  // Without explicit weights every element counts as 1
  const auto num_elements = shapeFromTensor(indices).front();
  torch::jit::Value *weights = maybe_weights;
  if (isNone(maybe_weights)) {
    weights = createConstantInt(graph, std::vector<int64_t>(num_elements, 1),
                                {num_elements})
                  ->output();
  }

  // Non-int inputs are floored and cast so they can be used as indices
  if (getNodeScalarType(indices) != c10::kInt) {
    torch::jit::Node *floored = createFloor(graph, {indices});
    indices = createCast(graph, floored->output(), c10::kInt)->output();
  }

  // Values that are not below minlength are replaced by (num_elements - 1)
  torch::jit::Value *const below_minlength =
      createLess(graph, {indices, minlength})->output();
  torch::jit::Value *const fallback_index =
      createConstantInt(graph, {num_elements - 1}, {1})->output();
  indices =
      createWhere(graph, {below_minlength, indices, fallback_index})->output();

  static constexpr bool enable_index_broadcast = false;
  static constexpr int64_t reduction_type =
      static_cast<std::int32_t>(ScatterReduction::Sum);
  static constexpr int64_t axis = 0;
  return createScatterreduce(graph, {weights, indices}, num_bins, axis,
                             enable_index_broadcast, reduction_type);
}

// Lowers aten::einsum by delegating to EinsumOp, which is built from the
// equation string and the operand list and then asked to emit nodes that
// produce the node's output shape.
torch::jit::Node *einsumHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  // aten::einsum(string equation, Tensor[] tensors) -> Tensor

  // Einstein summation convention equation
  const std::string equation = constantToString(node->input(0)->node());
  // Operands the equation refers to
  const std::vector<torch::jit::Value *> operands =
      handleTensorList(node->input(1)->node());

  const std::vector<std::int64_t> result_shape =
      shapeFromTensor(node->output());

  EinsumOp op(equation, operands);
  return op.create(graph, result_shape);
}

// Lowers aten::meshgrid: each 1D input is reshaped so that only its own
// dimension is non-singleton, then expanded to the full grid shape. The grids
// are returned as a prim::ListConstruct node.
torch::jit::Node *meshgridHandler(torch::jit::Graph *graph,
                                  torch::jit::Node *node) {
  // aten::meshgrid(Tensor[] tensors) -> Tensor[]
  const std::vector<torch::jit::Value *> inputs =
      handleTensorList(node->input(0)->node());
  const std::size_t rank = inputs.size();

  // Each tensor is 1D, so the grid shape is the list of their first dims
  std::vector<std::int64_t> grid_shape;
  grid_shape.reserve(rank);
  for (std::size_t dim = 0; dim < rank; ++dim) {
    grid_shape.push_back(shapeFromTensor(inputs[dim])[0]);
  }

  std::vector<torch::jit::Value *> grids;
  for (std::size_t dim = 0; dim < rank; ++dim) {
    // Rank-N view of the 1D tensor in which all but dimension `dim` are
    // singletons
    std::vector<std::int64_t> singleton_shape(rank, 1);
    singleton_shape[dim] = -1;
    torch::jit::Node *reshaped =
        createReshape(graph, inputs[dim], singleton_shape);

    // Expand over the dimensions of all other tensors
    torch::jit::Value *target_shape = intVectorToIrConstant(graph, grid_shape);
    torch::jit::Node *expanded =
        createExpand(graph, {reshaped->output(), target_shape});
    grids.push_back(expanded->output());
  }

  return createAndInsertNode(graph, at::prim::ListConstruct, grids);
}

// Lowers aten::cartesian_prod: builds the meshgrid of the inputs, flattens
// every grid to 1D and stacks the results along dimension 1. A single input
// is returned unchanged.
torch::jit::Node *cartesianProdHandler(torch::jit::Graph *graph,
                                       torch::jit::Node *node) {
  // aten::cartesian_prod(Tensor[] tensors) -> Tensor

  const std::vector<torch::jit::Value *> tensors =
      handleTensorList(node->input(0)->node());

  // Nothing to combine: hand back the node producing the only input
  if (tensors.size() == 1) {
    return tensors[0]->node();
  }

  // Only the stack handler needs a registry lookup; meshgridHandler is
  // defined in this file and is passed directly below.
  auto stack_handler = getHandler(c10::aten::stack);

  torch::jit::Node *grids =
      createHandlerOperation(graph, meshgridHandler, {node->input(0)});

  std::vector<torch::jit::Value *> grids_vector = handleTensorList(grids);

  for (torch::jit::Value *&grid : grids_vector) {
    // Flatten into 1 x N
    torch::jit::Node *flatten = createFlatten(graph, {grid}, 0);
    // Squeeze the first dimension
    flatten = createSqueeze(graph, {flatten->output()}, {0});
    grid = flatten->output();
  }

  torch::jit::Node *grid_list =
      createAndInsertNode(graph, at::prim::ListConstruct, grids_vector);

  // Stack 1D tensors along dimension 1
  return createHandlerOperation(
      graph, stack_handler, {grid_list->output(), wrapInConstant1D(graph, 1)});
}

// Lowers aten::tensordot to a matmul: x1 is permuted so its reduced dims come
// last, x2 so its reduced dims come first, both are reshaped to matrices,
// multiplied, and the result is reshaped to the concatenation of the
// unreduced dims of x1 and x2.
torch::jit::Node *tensordotHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  // aten::tensordot(Tensor self, Tensor other, int[] dims_self,
  //                 int[] dims_other) -> Tensor

  torch::jit::Value *x1 = node->input(0);
  torch::jit::Value *x2 = node->input(1);
  std::vector<std::int64_t> rdims_x1 =
      constantToLongVec(node->input(2)->node());
  std::vector<std::int64_t> rdims_x2 =
      constantToLongVec(node->input(3)->node());

  // rdims_prod (default = 1 with no reduction)
  std::int64_t rdims_prod = 1;

  const std::vector<std::int64_t> shape_x1 = shapeFromTensor(x1);
  const std::vector<std::int64_t> shape_x2 = shapeFromTensor(x2);

  // Original permutation
  std::vector<std::int64_t> p1 = shape_x1;
  std::iota(p1.begin(), p1.end(), 0);
  std::vector<std::int64_t> p2 = shape_x2;
  std::iota(p2.begin(), p2.end(), 0);

  const std::size_t n_dims_x1 = p1.size();
  const std::size_t n_dims_x2 = p2.size();
  const std::size_t n_rdims = rdims_x1.size();

  // Negative (relative) indexing -> absolute indexing. This must be done for
  // both operands: the dims are used below as indices into per-tensor
  // vectors, so a negative entry would index out of bounds.
  const auto fn_wrap_dims = [](std::vector<std::int64_t> &dims,
                               std::size_t rank) {
    for (std::int64_t &dim : dims) {
      if (dim < 0) {
        dim += static_cast<std::int64_t>(rank);
      }
    }
  };
  fn_wrap_dims(rdims_x1, n_dims_x1);
  fn_wrap_dims(rdims_x2, n_dims_x2);

  // Flags marking which dims of each operand are reduced
  std::vector<bool> rdims_x1_bs(n_dims_x1);
  std::vector<bool> rdims_x2_bs(n_dims_x2);
  for (std::size_t i = 0; i < n_rdims; i++) {
    rdims_x1_bs[rdims_x1[i]] = true;
    rdims_x2_bs[rdims_x2[i]] = true;
    // prod(rdims_x1) == prod(rdims_x2) so just use x1
    rdims_prod *= shape_x1[rdims_x1[i]];
  }

  // Permutes x according to existing permutation vector p and bitset bs. If
  // should_partition_front == true, elements of p are moved to the front
  // if the corresponding bool in bs == true. Otherwise, they are moved to
  // the back. The relative order of other elements must not change.
  const auto fn_partition_permute = [&](torch::jit::Value *x, auto &p,
                                        const auto &bs,
                                        bool should_partition_front) {
    std::stable_partition(p.begin(), p.end(), [&](std::int64_t n) {
      return bs[n] == should_partition_front;
    });
    return createTranspose(graph, {x}, p);
  };

  // Permute x1 so that rdims_x1 are the last dims
  torch::jit::Node *p_x1 = fn_partition_permute(x1, p1, rdims_x1_bs, false);

  // Reshape to (-1, rdims_prod(rdims))
  torch::jit::Node *p_x1_mat =
      createReshape(graph, p_x1->output(), {-1, rdims_prod});

  // Permute x2 so that rdims_x2 are the first dims
  torch::jit::Node *p_x2 = fn_partition_permute(x2, p2, rdims_x2_bs, true);

  // Reshape to (rdims_prod(rdims), -1)
  torch::jit::Node *p_x2_mat =
      createReshape(graph, p_x2->output(), {rdims_prod, -1});

  // Matmul -> (unreduced_x1, unreduced_x2)
  torch::jit::Node *mm =
      createMatmul(graph, {p_x1_mat->output(), p_x2_mat->output()});

  // Output shape: unreduced dims of x1 followed by unreduced dims of x2
  std::vector<std::int64_t> new_shape;
  new_shape.reserve(n_dims_x1 + n_dims_x2);
  for (std::size_t i = 0; i < n_dims_x1; i++) {
    if (!rdims_x1_bs[i]) {
      new_shape.push_back(shape_x1[i]);
    }
  }
  for (std::size_t i = 0; i < n_dims_x2; i++) {
    if (!rdims_x2_bs[i]) {
      new_shape.push_back(shape_x2[i]);
    }
  }

  // Restore flattened dims
  return createReshape(graph, mm->output(), new_shape);
}

// Returns true when `node` carries a non-zero integer attribute named
// "enable_index_broadcast". A node without that attribute yields false.
bool isIndexBroadcastEnabled(torch::jit::Node *node) {
  static const auto attr_name = c10::Symbol::attr("enable_index_broadcast");

  if (!node->hasAttribute(attr_name)) {
    return false;
  }
  return node->i(attr_name) != 0;
}

// Lowers a scatter_add node to a scatterreduce op using the Sum reduction.
//
// Inputs are read positionally: 0 = tensor being accumulated into, 1 = dim,
// 2 = index, 3 = src. When input 0 is a tensor constant whose elements are all
// zero, it is marked for deletion and the scatterreduce is created without an
// initial tensor.
torch::jit::Node *scatterAddHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  static constexpr std::int32_t sum_reduce =
      static_cast<std::int32_t>(ScatterReduction::Sum);

  torch::jit::Value *self = node->input(0);
  torch::jit::Value *index = node->input(2);
  torch::jit::Value *src = node->input(3);

  const auto src_type = src->type()->expect<c10::TensorType>();
  const auto axis = handleDimensionParam(node->input(1), src_type);
  // The size of the scattered axis is taken from the node's output shape.
  const auto out_shape = shapeFromTensor(node->output());
  const auto axissize = out_shape.at(axis);
  const bool broadcast_index = isIndexBroadcastEnabled(node);

  torch::jit::Node *self_node = self->node();
  bool self_is_zero_constant = false;
  if (isTensorConstant(self_node)) {
    // The accumulator may have been generated by calling zeros(...); at this
    // point in the canonicalization it is represented as a tensor constant.
    auto self_tensor = getNodeTensorAttrValue(self_node);
    const auto zero = at::zeros(1, self_tensor.dtype());
    self_is_zero_constant = at::all(self_tensor.eq(zero)).item().toBool();
  }

  if (!self_is_zero_constant) {
    return createScatterreduce(graph, {src, index, self}, axissize, axis,
                               broadcast_index, sum_reduce);
  }

  logging::trace("Removing zeros output to scatter_add: {}",
                 nodeToString(self_node));
  markNodeForDeletion(self_node);
  return createScatterreduce(graph, {src, index}, axissize, axis,
                             broadcast_index, sum_reduce);
}

// Builds the `mean` scatter reduction as a Sum scatterreduce divided by a
// per-element count, where the count is itself produced by scattering ones.
//
// With include_self == false, the positions of `self` addressed by `index` are
// first overwritten with zeros (None reduction) so that the original values at
// those positions do not contribute to the sum, and the count starts from zero
// at those positions (and from one everywhere else).
torch::jit::Node *
meanScatterReduceHandler(torch::jit::Graph *graph, torch::jit::Value *self,
                         torch::jit::Value *index, torch::jit::Value *src,
                         const std::int64_t axis, const std::int64_t axissize,
                         const bool include_self,
                         const bool enable_index_broadcast) {
  static constexpr std::int32_t sum_reduce =
      static_cast<std::int32_t>(ScatterReduction::Sum);
  static constexpr std::int32_t none_reduce =
      static_cast<std::int32_t>(ScatterReduction::None);

  // Scatter `updates` into `base` along `axis` with the given reduction.
  const auto scatter = [&](torch::jit::Value *updates, torch::jit::Value *base,
                           std::int32_t reduction) {
    return createScatterreduce(graph, {updates, index, base}, axissize, axis,
                               enable_index_broadcast, reduction);
  };

  torch::jit::Node *self_ones =
      createConstantFloatLike(graph, src, {1.0}, shapeFromTensor(self));
  torch::jit::Node *src_ones =
      createConstantFloatLike(graph, src, {1.0}, shapeFromTensor(src));

  torch::jit::Node *count = nullptr;
  if (!include_self) {
    torch::jit::Node *src_zeros =
        createConstantFloatLike(graph, src, {0.0}, shapeFromTensor(src));

    // Zeros at the positions addressed by `index`, ones everywhere else.
    torch::jit::Node *count_base =
        scatter(src_zeros->output(), self_ones->output(), none_reduce);

    // Number of elements reduced into each position.
    count = scatter(src_ones->output(), count_base->output(), sum_reduce);

    // Zero the positions of `self` addressed by `index` so the original
    // values there are excluded from the reduction (include_self=False).
    torch::jit::Node *self_masked =
        scatter(src_zeros->output(), self, none_reduce);
    self = self_masked->output();
  } else {
    // Number of elements reduced into each position, counting `self` itself.
    count = scatter(src_ones->output(), self_ones->output(), sum_reduce);
  }

  // mean = sum / count
  torch::jit::Node *total = scatter(src, self, sum_reduce);
  return createDiv(graph, {total->output(), count->output()});
}

// Shared lowering for scatter_reduce-style nodes.
//
// Node inputs are read positionally:
//   0: self, 1: dim, 2: index, 3: src, 4: reduce, 5: include_self
//
// The Mean reduction is delegated to meanScatterReduceHandler. For every other
// reduction, include_self == false is implemented by first overwriting the
// positions of `self` addressed by `index` with the reduction's initial value.
torch::jit::Node *scatterReduce(torch::jit::Graph *graph,
                                torch::jit::Node *node,
                                const bool enable_index_broadcast) {
  torch::jit::Value *self = node->input(0);
  torch::jit::Value *dim = node->input(1);
  torch::jit::Value *index = node->input(2);
  torch::jit::Value *src = node->input(3);
  const auto reduce = getReductionMethod(node->input(4)->node());
  const bool include_self = constantToBool(node->input(5)->node());

  const auto src_type = src->type()->expect<c10::TensorType>();
  const auto axis = handleDimensionParam(dim, src_type);
  // The size of the scattered axis is taken from the node's output shape.
  const auto out_shape = shapeFromTensor(node->output(0));
  const auto axissize = out_shape.at(axis);

  if (reduce == static_cast<std::int32_t>(ScatterReduction::Mean)) {
    // `Mean` is decomposed into scatter_reduce sums followed by a division.
    return meanScatterReduceHandler(graph, self, index, src, axis, axissize,
                                    include_self, enable_index_broadcast);
  }

  torch::jit::Value *base = self;
  if (!include_self) {
    static constexpr std::int32_t none_reduce =
        static_cast<std::int32_t>(ScatterReduction::None);
    // Replace the entries of `self` selected by `index` with the reduction's
    // initial value.
    torch::jit::Node *init = createConstantFloatLike(
        graph, src, {getReductionInitValue(reduce)}, shapeFromTensor(src));
    torch::jit::Node *masked_self =
        createScatterreduce(graph, {init->output(), index, self}, axissize,
                            axis, enable_index_broadcast, none_reduce);
    base = masked_self->output();
  }

  return createScatterreduce(graph, {src, index, base}, axissize, axis,
                             enable_index_broadcast, reduce);
}

// Handler for scatter_reduce: index broadcasting is driven by the node's
// "enable_index_broadcast" attribute.
torch::jit::Node *scatterReduceHandler(torch::jit::Graph *graph,
                                       torch::jit::Node *node) {
  return scatterReduce(graph, node, isIndexBroadcastEnabled(node));
}

// Handler for index_reduce: same lowering as scatter_reduce, with index
// broadcasting always enabled.
torch::jit::Node *indexReduceHandler(torch::jit::Graph *graph,
                                     torch::jit::Node *node) {
  return scatterReduce(graph, node, /*enable_index_broadcast=*/true);
}

// Lowers aten::_weight_norm(Tensor v, Tensor g, int dim) -> Tensor to
//   v * (g / norm(v))
// where the order-2 norm is taken over every axis except `dim` (or over all
// axes when dim == -1) with the reduced dimensions kept.
torch::jit::Node *weightNormHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  torch::jit::Value *v = node->input(0);
  torch::jit::Value *g = node->input(1);
  const auto v_shape = shapeFromTensor(v);
  const std::size_t rank = v_shape.size();

  auto dim = constantToLong(node->input(2)->node());
  // Wrap negative indices, except -1: PyTorch treats dim == -1 specially and
  // computes the norm over all dimensions.
  if (dim < -1) {
    dim += rank;
  }

  // Start from every axis and drop `dim`, unless the norm spans all axes.
  std::vector<std::int64_t> norm_axes(rank);
  std::iota(norm_axes.begin(), norm_axes.end(), 0);
  if (dim != -1) {
    norm_axes.erase(norm_axes.begin() + dim);
  }

  // The norm handler expects the axes as a ListConstruct of constants.
  std::vector<torch::jit::Value *> axis_values;
  axis_values.reserve(norm_axes.size());
  for (const auto axis : norm_axes) {
    axis_values.push_back(wrapInConstant1D(graph, axis));
  }
  torch::jit::Node *axes_node =
      createAndInsertNode(graph, c10::prim::ListConstruct, axis_values);
  torch::jit::Value *axes_list = axes_node->output();

  // Order 2 norm.
  auto *order = wrapInConstant1D(graph, 2);
  // Keep the normalised dims so the result broadcasts against v.
  auto *keep_dims = wrapInConstant1D(graph, 1);

  auto norm_handler = getHandler(c10::aten::norm);
  // PyTorch defines the weight as w = g * v / norm(v). It is computed here as
  // w = v * (g / norm(v)), which avoids expanding g to be broadcastable
  // with v.
  auto *v_norm = createHandlerOperation(graph, norm_handler,
                                        {v, order, axes_list, keep_dims});
  auto *scale = createDiv(graph, {g, v_norm->output()});

  return createMul(graph, {v, scale->output()});
}

// poptorch::set_available_memory(Tensor, float) -> Tensor
// Reads the constant float from input 1 and forwards it, together with the
// tensor in input 0, to createSetAvailableMemory.
torch::jit::Node *setAvailableMemoryHandler(torch::jit::Graph *graph,
                                            torch::jit::Node *node) {
  torch::jit::Value *tensor = node->input(0);
  torch::jit::Node *proportion_node = node->input(1)->node();
  return createSetAvailableMemory(graph, tensor,
                                  constantToFloat(proportion_node));
}

// Lowers randint to an Int random-uniform op followed by a cast to the
// node's output scalar type. Input 0 is read as `low`, input 1 as `high`.
torch::jit::Node *randintHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  torch::jit::Value *result = node->output(0);
  const auto result_shape = shapeFromTensor(result);
  const auto result_dtype = getNodeScalarType(result);

  // The popart range is closed whereas the pytorch range is expected to be
  // half open, hence the upper bound is decremented by one.
  const auto upper = constantToFloat(node->input(1)->node()) - 1.0f;
  const auto lower = constantToFloat(node->input(0)->node());

  torch::jit::Node *samples = createRandomUniform(
      graph, result, result_shape, upper, lower, c10::ScalarType::Int);
  return createCast(graph, samples->output(0), result_dtype);
}

// Lowers random_ to an Int random-uniform op followed by a cast to the scalar
// type of input 0. Shape and dtype come from input 0; input 1 is read as
// `low` and input 2 as `high`.
torch::jit::Node *randomHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  torch::jit::Value *target = node->input(0);
  const auto target_shape = shapeFromTensor(target);
  const auto target_dtype = getNodeScalarType(target);

  // The popart range is closed whereas the pytorch range is expected to be
  // half open, hence the upper bound is decremented by one.
  const auto upper = constantToFloat(node->input(2)->node()) - 1.0f;
  const auto lower = constantToFloat(node->input(1)->node());

  torch::jit::Node *samples = createRandomUniform(
      graph, target, target_shape, upper, lower, c10::ScalarType::Int);
  return createCast(graph, samples->output(0), target_dtype);
}
} // namespace

__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::bincount, bincountHandler);
  registerHandler(c10::aten::bucketize, bucketizeHandler);
  registerHandler(c10::aten::einsum, einsumHandler);
  registerHandler(c10::aten::meshgrid, meshgridHandler);
  registerHandler(c10::aten::cartesian_prod, cartesianProdHandler);
  registerHandler(c10::aten::tensordot, tensordotHandler);
  registerHandler(c10::aten::scatter_add, scatterAddHandler);
  registerHandler(c10::aten::scatter_reduce, scatterReduceHandler);
  registerHandler(c10::aten::index_reduce, indexReduceHandler);
  registerHandler(c10::aten::_weight_norm, weightNormHandler);
  registerHandler(c10::aten::randint, randintHandler);
  registerHandler(c10::aten::random_, randomHandler);
  registerHandler(symbols::poptorch::set_available_memory,
                  setAvailableMemoryHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/PoolingOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <spdlog/fmt/fmt.h>
#include <spdlog/fmt/ostr.h>

#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace {
// Lowers the max/avg pooling family to maxpool / averagepool ops.
//
// Schemas handled (inputs are read positionally):
//   aten::max_pool2d(Tensor self, int[] kernel_size, int[] stride,
//                    int[] padding, int[] dilation, bool ceil_mode) -> Tensor
//   aten::avg_pool2d(Tensor self, int[] kernel_size, int[] stride,
//                    int[] padding, bool ceil_mode, bool count_include_pad,
//                    int? divisor_override) -> Tensor
//   aten::max_pool2d_with_indices(Tensor self, int[2] kernel_size,
//                    int[2] stride=[], int[2] padding=[0, 0],
//                    int[2] dilation=[1, 1], bool ceil_mode=False)
//                    -> (Tensor, Tensor)
// together with their 1d/3d counterparts registered for this handler.
torch::jit::Node *poolingHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  const torch::jit::Symbol kind = node->kind();

  torch::jit::Value *input = node->input(0);
  const auto kernel_size = constantToLongVec(node->input(1)->node());
  const auto stride = constantToLongVec(node->input(2)->node());
  auto padding = constantToLongVec(node->input(3)->node());
  auto input_shape = shapeFromTensor(input);

  // The torch input might be missing the batch dimension, in which case a
  // leading dimension of size one is added: (C, *in) -> (1, C, *in).
  const bool batch_dim_added = input_shape.size() != stride.size() + 2;
  if (batch_dim_added) {
    input_shape.insert(input_shape.begin(), std::int64_t{1});
    input = createReshape(graph, input, input_shape)->output();
  }

  // When a batch dimension was added the pooled result is (1, C, *out) but
  // torch expects (C, *out), so reshape back to the node's output shape.
  const auto restore_rank =
      [&](torch::jit::Node *pooled) -> torch::jit::Node * {
    if (!batch_dim_added) {
      return pooled;
    }
    return createReshape(graph, pooled->output(),
                         shapeFromTensor(node->output()));
  };

  // PyTorch gives one padding amount per axis, applied on both sides. PopART
  // takes two values per axis in the form (Axis0Left, ..., AxisNLeft,
  // Axis0Right, ..., AxisNRight), so the list is appended to itself.
  const std::vector<std::int64_t> one_sided(padding.begin(), padding.end());
  padding.insert(padding.end(), one_sided.begin(), one_sided.end());

  const bool is_plain_max_pool = kind == c10::aten::max_pool1d ||
                                 kind == c10::aten::max_pool2d ||
                                 kind == c10::aten::max_pool3d;
  const bool is_indexed_max_pool = kind == c10::aten::max_pool1d_with_indices ||
                                   kind == c10::aten::max_pool2d_with_indices ||
                                   kind == c10::aten::max_pool3d_with_indices;

  if (is_plain_max_pool || is_indexed_max_pool) {
    const auto dilations = constantToLongVec(node->input(4)->node());
    const auto ceil_mode = constantToLong(node->input(5)->node());

    torch::jit::Node *pooled =
        createMaxpool(graph, {input}, 1, kernel_size, ceil_mode, dilations,
                      padding, 0, stride);
    return restore_rank(pooled);
  }

  // Average pooling. divisor_override is ignored for now because it is not
  // supported directly in popart.
  const auto ceil_mode = constantToLong(node->input(4)->node());
  const bool count_include_pad = constantToBool(node->input(5)->node());

  // count_include_pad isn't supported in PopART, so when the average must
  // include the padding the input is zero-padded explicitly instead.
  if (count_include_pad) {
    input = createConstantPad(graph, input, padding, 0.f)->output();
    // Ensure that padding isn't added twice.
    padding.clear();
  }

  // popart only supports float types for avgpool: any other type is cast to
  // float for the pooling and cast back afterwards.
  const auto input_type = getNodeScalarType(input);
  const bool needs_cast = input_type != c10::kFloat;
  if (needs_cast) {
    input = createCast(graph, input, c10::kFloat)->output();
  }

  torch::jit::Node *pooled = createAveragepool(graph, {input}, kernel_size,
                                               ceil_mode, 0, padding, stride);
  if (!needs_cast) {
    return restore_rank(pooled);
  }

  torch::jit::Node *restored_type =
      createCast(graph, pooled->output(), input_type);
  return restore_rank(restored_type);
}

// Lowers adaptive average pooling to a fixed-kernel averagepool op.
//
//   aten::adaptive_avg_pool1d(Tensor self, int[] output_size) -> Tensor
//   aten::adaptive_avg_pool2d(Tensor self, int[] output_size) -> Tensor
//   aten::adaptive_avg_pool3d(Tensor self, int[] output_size) -> Tensor
//
// Raises an error when a pooled input dimension is not divisible by the
// requested output dimension.
torch::jit::Node *adaptivePoolingHandler(torch::jit::Graph *graph,
                                         torch::jit::Node *node) {
  torch::jit::Value *input = node->input(0);
  const std::vector<std::int64_t> out_sizes =
      constantToLongVec(node->input(1)->node());
  const std::size_t n_pooled_dims = out_sizes.size();

  const std::vector<std::int64_t> in_sizes = shapeFromTensor(input);
  // The pooled dimensions are the trailing dimensions of the input.
  const std::size_t first_pooled_dim = in_sizes.size() - n_pooled_dims;

  std::vector<std::int64_t> strides;
  std::vector<std::int64_t> kernel_sizes;
  strides.reserve(n_pooled_dims);
  kernel_sizes.reserve(n_pooled_dims);

  for (std::size_t i = 0; i < n_pooled_dims; i++) {
    const std::int64_t in_dim = in_sizes[first_pooled_dim + i];
    const std::int64_t out_dim = out_sizes[i];
    // The stride/kernel derivation below matches PyTorch's implementation
    // only when each input dim is divisible by the corresponding output dim;
    // otherwise the shape would be right but the values would differ.
    if (in_dim % out_dim != 0) {
      const auto message =
          fmt::format("Input dim {} ({}) is not divisible by the corresponding "
                      "output dim ({}). The results will differ numerically "
                      "from PyTorch's implementation.",
                      i, in_dim, out_dim);
      ERROR(message);
    }
    const std::int64_t step = in_dim / out_dim;
    strides.push_back(step);
    kernel_sizes.push_back(in_dim - (out_dim - 1) * step);
  }

  // No padding: two zero entries per pooled dimension.
  const std::vector<std::int64_t> no_padding(n_pooled_dims * 2, 0);
  return createAveragepool(graph, {input}, kernel_sizes, 0, 0, no_padding,
                           strides);
}

} // namespace

__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::max_pool1d, poolingHandler);
  registerHandler(c10::aten::avg_pool1d, poolingHandler);
  registerHandler(c10::aten::max_pool2d, poolingHandler);
  registerHandler(c10::aten::avg_pool2d, poolingHandler);
  registerHandler(c10::aten::max_pool3d, poolingHandler);
  registerHandler(c10::aten::avg_pool3d, poolingHandler);

  registerHandler(c10::aten::max_pool1d_with_indices, poolingHandler);
  registerHandler(c10::aten::max_pool2d_with_indices, poolingHandler);
  registerHandler(c10::aten::max_pool3d_with_indices, poolingHandler);

  registerHandler(c10::aten::adaptive_avg_pool1d, adaptivePoolingHandler);
  registerHandler(c10::aten::adaptive_avg_pool2d, adaptivePoolingHandler);
  registerHandler(c10::aten::adaptive_avg_pool3d, adaptivePoolingHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/PopartCanonicalizationUtils.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

#include <functional>
#include <numeric>
#include <unordered_map>

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"

#include "../PoptorchSymbols.hpp"

#include "PopartCanonicalizationUtils.hpp"

namespace poptorch {

namespace {

// Attribute used to flag nodes for deletion (see markNodeForDeletion and
// isMarkedForDeletion in this file).
const c10::Symbol delete_node_attr = c10::Symbol::attr("delete_node");

// Returns the registry mapping a symbol to its handler.
// The map is a function-local static so that it is constructed on first use,
// which avoids the static initialisation order fiasco.
std::unordered_map<c10::Symbol, SymbolHandler> &symbolHandlers() {
  static std::unordered_map<c10::Symbol, SymbolHandler> symbol_handlers;
  return symbol_handlers;
}
} // namespace

// Adds `handler` to the registry under `symbol`.
// Raises an error if a handler is already registered for that symbol;
// otherwise returns true.
bool registerHandler(c10::Symbol symbol, const SymbolHandler &handler) {
  logging::trace("Registering handler for symbol {}", symbol.toDisplayString());

  auto &registry = symbolHandlers();
  const auto insertion = registry.emplace(symbol, handler);
  // emplace leaves the map untouched when the key already exists.
  const bool new_handler = insertion.second;
  ERROR_ON_MSG(!new_handler, "Symbol " << symbol.toDisplayString()
                                       << " already has a handler registered");
  return new_handler;
}

// Looks up the handler registered for `kind`.
// Returns a copy of the handler, or an empty std::function when nothing is
// registered for this kind of node.
SymbolHandler getHandler(torch::jit::NodeKind kind) {
  const auto &registry = symbolHandlers();
  const auto found = registry.find(kind);
  if (found == registry.cend()) {
    return {};
  }
  return found->second;
}

// Returns true if every input of `node` (other than the one at index
// `ignore_input`) is a tensor whose scalar type equals `type`.
// Errors if a checked input is not a tensor or has no scalar type.
bool allInputsOfType(torch::jit::Node *node, at::ScalarType type,
                     int ignore_input) {
  int position = -1;
  for (torch::jit::Value *input : node->inputs()) {
    ++position;
    if (position == ignore_input) {
      continue;
    }

    const auto tensor_type = input->type()->cast<c10::TensorType>();
    ERROR_ON(!tensor_type);
    ERROR_ON(!tensor_type->scalarType());

    const at::ScalarType input_scalar_type = *tensor_type->scalarType();
    if (input_scalar_type != type) {
      return false;
    }
  }
  return true;
}

// Returns true if every input of `node` (other than `ignore_input`) is a Bool
// tensor.
bool allInputsBool(torch::jit::Node *node, int ignore_input) {
  constexpr at::ScalarType bool_type = at::ScalarType::Bool;
  return allInputsOfType(node, bool_type, ignore_input);
}

// Returns true if every input of `node` (other than the one at index
// `ignore_input`) is a tensor of integral type; Bool is not counted as
// integral. Errors if a checked input is not a tensor or has no scalar type.
bool allInputsInteger(torch::jit::Node *node, int ignore_input) {
  int position = -1;
  for (torch::jit::Value *input : node->inputs()) {
    ++position;
    if (position == ignore_input) {
      continue;
    }

    const auto tensor = input->type()->cast<c10::TensorType>();
    ERROR_ON(!tensor);
    ERROR_ON(!tensor->scalarType());

    const bool is_integral = isIntegralType(*tensor->scalarType(), false);
    if (!is_integral) {
      return false;
    }
  }
  return true;
}

// Returns the inputs of `node` copied, in order, into a vector.
std::vector<torch::jit::Value *> handleTensorList(torch::jit::Node *node) {
  const auto inputs = node->inputs();

  std::vector<torch::jit::Value *> values;
  values.reserve(inputs.size());
  for (torch::jit::Value *input : inputs) {
    values.push_back(input);
  }
  return values;
}

// Adds `ints` to the IR as a one-dimensional integer constant whose single
// dimension is ints.size(), and returns the constant's output value.
torch::jit::Value *
intVectorToIrConstant(torch::jit::Graph *graph,
                      const std::vector<std::int64_t> &ints) {
  const auto length = static_cast<std::int64_t>(ints.size());
  const std::vector<std::int64_t> shape_1d(1, length);

  torch::jit::Node *constant = createConstantInt(graph, ints, shape_1d);
  return constant->output();
}

// Reads the shape of `value` from the PyTorch IR and adds it to the graph as
// an integer constant (one entry per dimension).
torch::jit::Value *shapeFromTensorAsIR(torch::jit::Graph *graph,
                                       torch::jit::Value *value) {
  return intVectorToIrConstant(graph, shapeFromTensor(value));
}

// Get the scalar type of a given tensor value.
//
// Errors if `tensor` is not of tensor type (via expect<>) or if its tensor
// type does not carry a scalar type, instead of dereferencing an empty
// optional.
at::ScalarType getNodeScalarType(const torch::jit::Value *tensor) {
  // The value must be a tensor.
  const c10::TensorTypePtr tensor_type =
      tensor->type()->expect<c10::TensorType>();

  // The scalar type is optional in the IR: check it before unwrapping, as the
  // other helpers in this file do.
  const auto scalar_type = tensor_type->scalarType();
  ERROR_ON_MSG(!scalar_type, "Tensor %" << tensor->debugName()
                                        << " does not have a scalar type");
  return *scalar_type;
}

// Returns true if the tensor attribute of the node producing `value` holds
// exactly one element and that element, converted to float, equals 1.
bool hasUnityValue(torch::jit::Value *value) {
  const auto constant = getNodeTensorAttrValue(value->node());
  if (constant.numel() == 1) {
    return constant.to(at::ScalarType::Float).item<float>() == 1.0;
  }
  return false;
}

// Returns true if `node` is a prim::Constant that has no `value` attribute.
bool isNone(torch::jit::Node *node) {
  const bool is_prim_constant = node->kind() == c10::prim::Constant;
  return is_prim_constant && !node->hasAttribute(c10::attr::value);
}

// Returns true if the type of `value` is NoneType.
bool isNone(const torch::jit::Value *value) {
  const auto none_type = value->type()->cast<c10::NoneType>();
  return static_cast<bool>(none_type);
}

// Reads a constant dimension index from `value` and normalises it against the
// rank of `as_tensor`.
//
// Non-negative dimensions are returned unchanged. A negative dimension counts
// from the end, so the tensor rank is added to it (e.g. -1 becomes rank - 1).
// Errors if a negative dimension is given but the rank of `as_tensor` is not
// known, instead of dereferencing an empty optional.
std::int64_t handleDimensionParam(torch::jit::Value *value,
                                  const c10::TensorTypePtr &as_tensor) {
  // Extract the dim.
  const std::int64_t dim = constantToLong(value->node());
  if (dim >= 0) {
    return dim;
  }

  // The rank is optional in the IR: check it before unwrapping.
  const auto rank = as_tensor->sizes().size();
  ERROR_ON_MSG(!rank, "Cannot resolve negative dimension "
                          << dim << ": the rank of the tensor is unknown");

  return static_cast<std::int64_t>(*rank) + dim;
}

// Returns true if `node` is a prim::Constant or one of the poptorch tensor
// constant kinds.
bool isAnyConstant(torch::jit::Node *node) {
  if (node->kind() == c10::prim::Constant) {
    return true;
  }
  return isTensorConstant(node);
}

// Returns true if the output of `node` is floating point: either a tensor
// with a floating scalar type, or a number constant holding a floating point
// scalar. Errors if the output is neither a tensor nor a number.
bool isFloatingPointConstant(torch::jit::Node *node) {
  const auto tensor_type = node->output()->type()->cast<c10::TensorType>();
  if (!tensor_type) {
    ERROR_ON(!node->output()->type()->isSubtypeOf(c10::NumberType::get()));
    const auto scalar = torch::jit::constant_as<at::Scalar>(node->output());
    return scalar.value().isFloatingPoint();
  }

  return c10::isFloatingType(*tensor_type->scalarType());
}

// Returns true if `node` is a poptorch tensor_constant or
// host_side_tensor_constant.
bool isTensorConstant(torch::jit::Node *node) {
  const auto kind = node->kind();
  if (kind == symbols::poptorch::tensor_constant) {
    return true;
  }
  return kind == symbols::poptorch::host_side_tensor_constant;
}

// Returns true if `input` is produced by a tensor constant node and its shape
// describes exactly one element (e.g. shape {} or {1, 1}).
bool isConstantScalar(torch::jit::Value *input) {
  if (!isTensorConstant(input->node())) {
    return false;
  }

  const std::vector<std::int64_t> shape = shapeFromTensor(input);
  // The initial value fixes the accumulator type of std::accumulate, so it
  // must be a 64-bit integer: a plain `1` would accumulate the product in
  // `int` and truncate large element counts.
  const std::int64_t numel =
      std::accumulate(shape.begin(), shape.end(), std::int64_t{1},
                      std::multiplies<std::int64_t>());

  return numel == 1;
}

// Extracts a float from a tensor constant node.
// Errors if `node` is not a tensor constant, or if its output is neither a
// tensor nor a number.
float constantToFloat(torch::jit::Node *node) {
  ERROR_ON_MSG(!isTensorConstant(node), "Cannot force a non-constant '"
                                            << node->kind().toQualString()
                                            << "' node to a float");

  const bool holds_tensor =
      static_cast<bool>(node->output()->type()->cast<c10::TensorType>());
  if (!holds_tensor) {
    ERROR_ON(!node->output()->type()->isSubtypeOf(c10::NumberType::get()));
    const auto scalar = torch::jit::constant_as<at::Scalar>(node->output());
    return scalar.value().toFloat();
  }

  const auto as_float = getNodeTensorAttrValue(node).to(at::ScalarType::Float);
  return as_float.item<float>();
}

// Converts the tensor stored on a tensor constant node to Long in place,
// refreshes the output type from the converted tensor, and returns the same
// node. Errors if `node` is not a tensor constant or its output is not a
// tensor.
torch::jit::Node *constantToLongConstant(torch::jit::Node *node) {
  ERROR_ON_MSG(!isTensorConstant(node), "Cannot force a non-constant '"
                                            << node->kind().toQualString()
                                            << "' node to a long constant");

  ERROR_ON(!node->output()->type()->cast<c10::TensorType>());

  const auto as_long = getNodeTensorAttrValue(node).to(at::ScalarType::Long);
  setNodeTensorAttrValue(node, as_long);

  // Re-read the stored attribute so the output type reflects what the node
  // now holds.
  torch::jit::Value *result = node->output();
  result->inferTypeFrom(getNodeTensorAttrValue(node));
  return node;
}

// Extracts a 32-bit integer from a tensor constant node.
// Errors if `node` is not a tensor constant, or if its output is neither a
// tensor nor a number.
std::int32_t constantToInt(torch::jit::Node *node) {
  ERROR_ON_MSG(!isTensorConstant(node), "Cannot force a non-constant '"
                                            << node->kind().toQualString()
                                            << "' node to an int");

  const bool holds_tensor =
      static_cast<bool>(node->output()->type()->cast<c10::TensorType>());
  if (!holds_tensor) {
    ERROR_ON(!node->output()->type()->isSubtypeOf(c10::NumberType::get()));
    const auto scalar = torch::jit::constant_as<at::Scalar>(node->output());
    return scalar.value().toInt();
  }

  const auto as_int = getNodeTensorAttrValue(node).to(at::ScalarType::Int);
  return as_int.item<std::int32_t>();
}

// Extracts a 64-bit integer from a tensor constant node.
// Errors if `node` is not a tensor constant, or if its output is neither a
// tensor nor a number. On the number path only, a value equal to INT_MAX is
// widened to LONG_MAX.
std::int64_t constantToLong(torch::jit::Node *node) {
  ERROR_ON_MSG(!isTensorConstant(node), "Cannot force a non-constant '"
                                            << node->kind().toQualString()
                                            << "' node to a long");

  const bool holds_tensor =
      static_cast<bool>(node->output()->type()->cast<c10::TensorType>());
  if (holds_tensor) {
    const auto as_long = getNodeTensorAttrValue(node).to(at::ScalarType::Long);
    return as_long.item<std::int64_t>();
  }

  ERROR_ON(!node->output()->type()->isSubtypeOf(c10::NumberType::get()));
  const auto scalar = torch::jit::constant_as<at::Scalar>(node->output());
  const std::int64_t val = scalar.value().toLong();

  if (val == INT_MAX) {
    return LONG_MAX;
  }
  return val;
}

// Converts a constant list node to a vector of 64-bit integers, converting
// each element with constantToLong.
std::vector<std::int64_t> constantToLongVec(torch::jit::Node *node) {
  auto values = constantListToVec<std::int64_t>(node, constantToLong);
  return values;
}

// Converts a constant list node to a vector of floats, converting each
// element with constantToFloat.
std::vector<float> constantToFloatVec(torch::jit::Node *node) {
  auto values = constantListToVec<float>(node, constantToFloat);
  return values;
}

// Extracts a bool from a tensor constant node: the constant is read as an
// int and any non-zero value is true. Errors if `node` is not a tensor
// constant.
bool constantToBool(torch::jit::Node *node) {
  ERROR_ON_MSG(!isTensorConstant(node),
               "Cannot force a non-constant node to a bool");

  const std::int32_t as_int = constantToInt(node);
  return as_int != 0;
}

// Extracts a string from a tensor constant node by reading the tensor's raw
// data as characters; the string length is the size of dimension 0.
// Errors if `node` is not a tensor constant or the tensor is not contiguous.
std::string constantToString(torch::jit::Node *node) {
  ERROR_ON_MSG(!isTensorConstant(node),
               "Cannot force a non-constant node to a string");

  auto &&t = getNodeTensorAttrValue(node);
  ERROR_ON(!t.is_contiguous());

  const auto n_chars = t.sizes().at(0);
  const char *chars = reinterpret_cast<char *>(t.data_ptr());
  return std::string(chars, n_chars);
}

// Interprets an integer constant node as a torch scalar type (dtype).
//
// Errors if the value falls outside [0, at::NumScalarTypes). NumScalarTypes
// is a count of enum entries, so the value equal to it is itself out of range
// and must be rejected too.
at::ScalarType constantToScalarType(torch::jit::Node *node) {
  const auto as_num = constantToInt(node);
  ERROR_ON_MSG(as_num < 0 || as_num >= at::NumScalarTypes,
               "Node has a value (" << as_num
                                    << ") which is not "
                                       "representable as a torch dtype");

  return static_cast<at::ScalarType>(as_num);
}

// Translates a PyTorch reduction code into the PopART one.
//   Popart:  Sum = 0, Mean = 1, NoReduction = 2
//   Pytorch: Sum = 2, Mean = 1, NoReduction = 0
// Errors on any other input value.
std::int32_t convertReduceToPopart(std::int32_t pytorchReduce) {
  switch (pytorchReduce) {
  case 0: // NoReduction
    return 2;
  case 1: // Mean
    return 1;
  case 2: // Sum
    return 0;
  default:
    break;
  }

  ERROR("Unsupported pytorch reduce");
}

// Flags `node` for deletion by setting its integer `delete_node` attribute
// to 1.
void markNodeForDeletion(torch::jit::Node *node) {
  constexpr std::int64_t marked = 1;
  node->i_(delete_node_attr, marked);
}

// Returns true if `node` has the `delete_node` attribute set to a positive
// value.
bool isMarkedForDeletion(torch::jit::Node *node) {
  if (!node->hasAttribute(delete_node_attr)) {
    return false;
  }
  return node->i(delete_node_attr) > 0;
}

// Redirects every use of `old_val` to `new_val`, after giving `new_val` the
// type of `old_val`.
void replaceOutputUse(torch::jit::Value *old_val, torch::jit::Value *new_val) {
  // The replacement must carry the same type as the value it replaces.
  const auto original_type = old_val->type();
  new_val->setType(original_type);

  old_val->replaceAllUsesWith(new_val);
}

// Redirects every use of output `outputIdx` of `oldNode` to the output with
// the same index on `new_node`.
void replaceOutputUse(torch::jit::Node *oldNode, torch::jit::Node *new_node,
                      std::uint64_t outputIdx) {
  torch::jit::Value *old_val = oldNode->output(outputIdx);
  logging::trace("Replacing node output %{} with that of {}",
                 old_val->debugName(), *new_node);

  replaceOutputUse(old_val, new_node->output(outputIdx));
}

// Helper for torch.max(tensor) and torch.min(tensor): returns the list of all
// dimension indices of `value`, i.e. a 4D tensor yields (0, 1, 2, 3).
std::vector<std::int64_t>
reduceHelperDimensionCreator(torch::jit::Value *value) {
  // Read the rank from the tensor type in the pytorch IR.
  const c10::TensorTypePtr tensor_type =
      value->type()->expect<c10::TensorType>();
  const auto rank = tensor_type->sizes().sizes()->size();

  // 0, 1, ..., rank - 1
  std::vector<std::int64_t> axes(rank);
  std::iota(axes.begin(), axes.end(), 0);
  return axes;
}

// Returns true if both nodes have the attribute `attr`, with the same
// attribute kind and equal values.
//
// Tensor attributes (t / ts) are compared with at::Tensor::equal; every other
// kind uses operator== on the stored values. Returns false if either node
// lacks the attribute or the kinds differ. Errors if the attribute kind is
// not one of the kinds handled below.
bool attributeEqual(torch::jit::Node *a, torch::jit::Node *b,
                    c10::Symbol attr) {
  if (!a->hasAttribute(attr) || !b->hasAttribute(attr)) {
    return false;
  }

  const auto attr_kind = a->kindOf(attr);
  if (b->kindOf(attr) != attr_kind) {
    return false;
  }

  switch (attr_kind) {
  case torch::jit::AttributeKind::f:
    return a->f(attr) == b->f(attr);
  case torch::jit::AttributeKind::fs:
    return a->fs(attr) == b->fs(attr);
  case torch::jit::AttributeKind::s:
    return a->s(attr) == b->s(attr);
  case torch::jit::AttributeKind::ss:
    return a->ss(attr) == b->ss(attr);
  case torch::jit::AttributeKind::i:
    return a->i(attr) == b->i(attr);
  case torch::jit::AttributeKind::is:
    return a->is(attr) == b->is(attr);
  case torch::jit::AttributeKind::t:
    return a->t(attr).equal(b->t(attr));
  case torch::jit::AttributeKind::ts: {
    // Fetch each tensor list once instead of on every loop iteration.
    const auto &a_tensors = a->ts(attr);
    const auto &b_tensors = b->ts(attr);
    if (a_tensors.size() != b_tensors.size()) {
      return false;
    }
    for (std::size_t idx = 0; idx < a_tensors.size(); ++idx) {
      if (!a_tensors[idx].equal(b_tensors[idx])) {
        return false;
      }
    }
    return true;
  }
  case torch::jit::AttributeKind::g:
    return a->g(attr) == b->g(attr);
  case torch::jit::AttributeKind::gs:
    return a->gs(attr) == b->gs(attr);
  case torch::jit::AttributeKind::c:
    return a->c(attr) == b->c(attr);
  case torch::jit::AttributeKind::cs:
    return a->cs(attr) == b->cs(attr);
  case torch::jit::AttributeKind::ty:
    return a->ty(attr) == b->ty(attr);
  case torch::jit::AttributeKind::tys:
    return a->tys(attr) == b->tys(attr);
  case torch::jit::AttributeKind::ival:
    return a->ival(attr) == b->ival(attr);
  }

  // Only reachable when attr_kind holds a value outside the enumerators
  // above. The message names this function so the failure can be located.
  ERROR("Invalid type in attributeEqual.");
}

// Returns `tensor` unchanged when it already has scalar type `promoteType`;
// otherwise inserts a cast to `promoteType` and returns the cast's output.
torch::jit::Value *castToPromoteType(torch::jit::Graph *graph,
                                     torch::jit::Value *tensor,
                                     c10::ScalarType promoteType) {
  const bool already_promoted = getNodeScalarType(tensor) == promoteType;
  if (already_promoted) {
    return tensor;
  }

  return createCast(graph, tensor, promoteType)->output();
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/PopartCanonicalizationUtils.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef SOURCE_POPART_CANONICALIZATION_UTILS_H
#define SOURCE_POPART_CANONICALIZATION_UTILS_H
#include <torch/csrc/jit/ir/ir.h>

#include <functional>
#include <string>
#include <vector>

#include "poptorch_logging/Error.hpp"

namespace poptorch {

using SymbolHandler =
    std::function<torch::jit::Node *(torch::jit::Graph *, torch::jit::Node *)>;

bool registerHandler(c10::Symbol symbol, const SymbolHandler &handler);

std::vector<std::int64_t>
reduceHelperDimensionCreator(torch::jit::Value *value);

// Overload taking explicit axes: a non-empty `axes` is returned unchanged,
// while an empty `axes` defers to the single-argument overload for `value`.
inline std::vector<std::int64_t>
reduceHelperDimensionCreator(torch::jit::Value *value,
                             const std::vector<std::int64_t> &axes) {
  if (axes.empty()) {
    return reduceHelperDimensionCreator(value);
  }
  return axes;
}

// Return the handler registered for this kind of node, or an empty
// std::function if no handler is registered.
SymbolHandler getHandler(torch::jit::NodeKind kind);

// Returns true if all inputs to `node` are of the given `type`.
//
// \param ignore_input Index of an input to not check.
//
// \note Errors if a not-ignored input is not a tensor, or doesn't have a scalar
//       type.
bool allInputsOfType(torch::jit::Node *node, at::ScalarType type,
                     int ignore_input = -1);

// Returns true if all inputs are Bools
bool allInputsBool(torch::jit::Node *node, int ignore_input = -1);

// Returns true if all inputs are of integral type, compatible with
// c10::isIntegralType.
bool allInputsInteger(torch::jit::Node *node, int ignore_input = -1);

// Get the tensor shape and add it to the IR as a constant primitive.
torch::jit::Value *shapeFromTensorAsIR(torch::jit::Graph *graph,
                                       torch::jit::Value *value);

// Get the scalar type of this tensor.
at::ScalarType getNodeScalarType(const torch::jit::Value *tensor);

torch::jit::Value *intVectorToIrConstant(torch::jit::Graph *graph,
                                         const std::vector<std::int64_t> &ints);

std::vector<torch::jit::Value *> handleTensorList(torch::jit::Node *node);

// Returns true if the value is a constant of exactly unity (1)
bool hasUnityValue(torch::jit::Value *value);

// Some operations take in an optional tensor. A "none" constant is passed in to
// mark a tensor which is not there.
bool isNone(torch::jit::Node *node);
bool isNone(const torch::jit::Value *value);

std::int64_t handleDimensionParam(torch::jit::Value *value,
                                  const c10::TensorTypePtr &as_tensor);

bool isAnyConstant(torch::jit::Node *node);

bool isFloatingPointConstant(torch::jit::Node *node);

bool isTensorConstant(torch::jit::Node *node);

// Does the given value (tensor) represent a single, scalar value?
//
// Useful in cases of broadcasting.
bool isConstantScalar(torch::jit::Value *input);

// Force a constant to be a float: this is appropriate if required for popart
// (onnx); e.g. Gemm alpha and beta are always floats
float constantToFloat(torch::jit::Node *node);

// Force a constant to be a long constant by casting.
// This is appropriate if required for popart (onnx)
// e.g. TopK takes int64 indices as a tensor.
torch::jit::Node *constantToLongConstant(torch::jit::Node *node);

// Force a constant to be an int: this is appropriate if required for popart
// (onnx)
std::int32_t constantToInt(torch::jit::Node *node);

// Force a constant to be a long: this is appropriate if required for popart
// (onnx) e.g. Slice takes int64 indices
std::int64_t constantToLong(torch::jit::Node *node);

// Forces a ListConstruct to be a vector of int64_ts
std::vector<std::int64_t> constantToLongVec(torch::jit::Node *node);

// Forces a ListConstruct to be a vector of floats
std::vector<float> constantToFloatVec(torch::jit::Node *node);

// Extract a boolean from a constant containing one (encoded as an int32_t)
bool constantToBool(torch::jit::Node *node);

// Extracts a string from a constant containing a string
std::string constantToString(torch::jit::Node *node);

// Extract a `at::ScalarType` from a constant containing a number that
// represents one.
at::ScalarType constantToScalarType(torch::jit::Node *node);

// Converts a ListConstruct node into a vector of `T`.
//
// `constantExtractFunc` is called once per list element, in order, with the
// node producing that element; its result is appended to the returned vector.
// Errors if `node` is not a prim::ListConstruct.
template <typename T, typename ExtractFunc>
std::vector<T> constantListToVec(torch::jit::Node *node,
                                 ExtractFunc &&constantExtractFunc) {
  ERROR_ON(node->kind() != c10::prim::ListConstruct);

  const auto list_elements = node->inputs();

  std::vector<T> extracted;
  extracted.reserve(list_elements.size());
  for (torch::jit::Value *element : list_elements) {
    torch::jit::Node *producer = element->node();
    extracted.push_back(constantExtractFunc(producer));
  }
  return extracted;
}

// Both pytorch and popart represent reduce as an enum but with different
// values.
std::int32_t convertReduceToPopart(std::int32_t pytorchReduce);

void markNodeForDeletion(torch::jit::Node *node);
bool isMarkedForDeletion(torch::jit::Node *node);

void replaceOutputUse(torch::jit::Value *old_val, torch::jit::Value *new_val);
void replaceOutputUse(torch::jit::Node *oldNode, torch::jit::Node *new_node,
                      std::uint64_t outputIdx);

bool attributeEqual(torch::jit::Node *a, torch::jit::Node *b,
                    c10::Symbol attrb);

// Promotes two or more scalar types to a single common type.
//
// With exactly two types this is c10::promoteTypes(a, b). With more, the
// first two are promoted and the result is recursively promoted with the
// remaining types.
template <typename... Tail>
c10::ScalarType promoteTypes(const c10::ScalarType &a, const c10::ScalarType &b,
                             Tail &&...tail) {
  if constexpr (sizeof...(tail) != 0) {
    // Fold the leading pair, then carry on with the rest of the pack.
    return promoteTypes(promoteTypes(a, b), std::forward<Tail>(tail)...);
  } else {
    return c10::promoteTypes(a, b);
  }
}

// Value overload: looks up the scalar type of every given value with
// getNodeScalarType() and promotes those types to a single common type.
template <typename... Tail>
c10::ScalarType promoteTypes(const torch::jit::Value *a,
                             const torch::jit::Value *b, Tail &&...tail) {
  const c10::ScalarType type_a = getNodeScalarType(a);
  const c10::ScalarType type_b = getNodeScalarType(b);
  return promoteTypes(type_a, type_b,
                      getNodeScalarType(std::forward<Tail>(tail))...);
}

torch::jit::Value *castToPromoteType(torch::jit::Graph *graph,
                                     torch::jit::Value *tensor,
                                     c10::ScalarType promoteType);

// Casts every given tensor to the common promoted type of all of them.
//
// The promoted type is computed with promoteTypes(); each tensor is then
// passed through castToPromoteType() in argument order (tensor_a, tensor_b,
// then the tail). The results are returned in that same order.
template <typename... Tail>
std::vector<torch::jit::Value *>
promoteTensors(torch::jit::Graph *graph, torch::jit::Value *tensor_a,
               torch::jit::Value *tensor_b, Tail &&...tail) {
  const c10::ScalarType target_type =
      promoteTypes(tensor_a, tensor_b, std::forward<Tail>(tail)...);

  std::vector<torch::jit::Value *> promoted;
  promoted.reserve(2 + sizeof...(tail));
  promoted.push_back(castToPromoteType(graph, tensor_a, target_type));
  promoted.push_back(castToPromoteType(graph, tensor_b, target_type));
  // Comma fold: evaluated left to right, so the tail keeps its order.
  (promoted.push_back(
       castToPromoteType(graph, std::forward<Tail>(tail), target_type)),
   ...);
  return promoted;
}

} // namespace poptorch

#endif // SOURCE_POPART_CANONICALIZATION_UTILS_H


================================================
FILE: poptorch/source/popart_canonicalization/PoptorchHandlers.gen.cpp
================================================
// DO NOT EDIT! Generated by PopTorchHandlers.py
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.

#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {

namespace {

// Reads the three inputs of `node` as constant longs and forwards them, in
// input order, to createBeginIpuBlock().
torch::jit::Node *beginIpuBlockHandler(torch::jit::Graph *graph,
                                       torch::jit::Node *node) {
  const auto first = constantToLong(node->input(0)->node());
  const auto second = constantToLong(node->input(1)->node());
  const auto third = constantToLong(node->input(2)->node());
  // beginIpuBlock(clong(x), clong(y), clong(z))
  return createBeginIpuBlock(graph, first, second, third);
}

// Pass-through handler: neither argument is used and no replacement node is
// created; it always returns nullptr.
torch::jit::Node *beginMultiConvHandler(torch::jit::Graph * /*graph*/,
                                        torch::jit::Node * /*node*/) {
  // <pass through>
  return nullptr;
}

// Input 0 is handled as a tensor list and input 1 as a constant string; both
// are passed, together with the original node, to createCallCpuOp().
torch::jit::Node *callCpuOpHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  torch::jit::Node *list_node = node->input(0)->node();
  torch::jit::Node *name_node = node->input(1)->node();

  auto tensors = handleTensorList(list_node);
  auto name = constantToString(name_node);
  // callCpuOp(TensorList(x), cstr(s), original_node)
  return createCallCpuOp(graph, tensors, name, node);
}

// Pass-through handler: neither argument is used and no replacement node is
// created; it always returns nullptr.
torch::jit::Node *endCpuOpHandler(torch::jit::Graph * /*graph*/,
                                  torch::jit::Node * /*node*/) {
  // <pass through>
  return nullptr;
}

// Forwards inputs 0 and 1 unchanged and input 2 as a constant long to
// createEndForLoop().
torch::jit::Node *endForLoopHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  torch::jit::Value *loop_output = node->input(0);
  torch::jit::Value *loop_inputs = node->input(1);
  const auto iterations = constantToLong(node->input(2)->node());
  // endForLoop(output, inputs, clong(trip_count))
  return createEndForLoop(graph, loop_output, loop_inputs, iterations);
}

// Pass-through handler: neither argument is used and no replacement node is
// created; it always returns nullptr.
torch::jit::Node *endIpuBlockHandler(torch::jit::Graph * /*graph*/,
                                     torch::jit::Node * /*node*/) {
  // <pass through>
  return nullptr;
}

// Input 0 is forwarded unchanged; input 1 is read as a constant int. Both are
// passed to createIdentityloss().
torch::jit::Node *identityLossHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  torch::jit::Value *loss_input = node->input(0);
  const auto reduction = constantToInt(node->input(1)->node());
  // identityloss(x, cint(r))
  return createIdentityloss(graph, {loss_input}, reduction);
}

// Input 0 is forwarded unchanged; input 1 is read as a constant string. Both
// are passed to createInternalCast().
torch::jit::Node *internalCastHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  torch::jit::Value *source = node->input(0);
  auto dtype_name = constantToString(node->input(1)->node());
  // internalCast(tensor, cstr(dtype))
  return createInternalCast(graph, source, dtype_name);
}

// Wraps input 0 of `node` in a single-element list and passes it to
// createNop().
torch::jit::Node *nopHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *operand = node->input(0);
  // nop(args)
  return createNop(graph, {operand});
}

// Input 0 is read as a constant long and input 1 is handled as a tensor list;
// both are passed to createOptimizerGroup().
torch::jit::Node *optimizerGroupHandler(torch::jit::Graph *graph,
                                        torch::jit::Node *node) {
  torch::jit::Node *group_node = node->input(0)->node();
  torch::jit::Node *list_node = node->input(1)->node();

  const auto group = constantToLong(group_node);
  auto tensors = handleTensorList(list_node);
  // optimizerGroup(clong(x), TensorList(l))
  return createOptimizerGroup(graph, group, tensors);
}

// Pass-through handler: neither argument is used and no replacement node is
// created; it always returns nullptr.
torch::jit::Node *popNameScopeHandler(torch::jit::Graph * /*graph*/,
                                      torch::jit::Node * /*node*/) {
  // <pass through>
  return nullptr;
}

// Forwards input 0 of `node` to createRecomputationCheckpoint().
torch::jit::Node *recomputationCheckpointHandler(torch::jit::Graph *graph,
                                                 torch::jit::Node *node) {
  torch::jit::Value *checkpointed = node->input(0);
  // recomputationCheckpoint(i0)
  return createRecomputationCheckpoint(graph, checkpointed);
}

// Input 0 is forwarded unchanged. Inputs 1, 2 and 3 are read as a constant
// string, a constant long and a constant int respectively; the int is turned
// into a bool (non-zero -> true) before calling
// createSetMatMulSerialization().
torch::jit::Node *setMatmulSerializationHandler(torch::jit::Graph *graph,
                                                torch::jit::Node *node) {
  torch::jit::Value *matmul = node->input(0);
  auto mode = constantToString(node->input(1)->node());
  const auto factor = constantToLong(node->input(2)->node());
  const auto flag_as_int = constantToInt(node->input(3)->node());
  const bool flag = flag_as_int != 0;
  // setMatMulSerialization(x, cstr(s), clong(a), cint(b))
  return createSetMatMulSerialization(graph, matmul, mode, factor, flag);
}

// Forwards input 0 of `node` to createStartForLoop().
torch::jit::Node *startForLoopHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  torch::jit::Value *loop_inputs = node->input(0);
  // startForLoop(inputs)
  return createStartForLoop(graph, loop_inputs);
}

// Forwards input 0 of `node` to createStartIfBlock().
torch::jit::Node *startIfBlockHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  torch::jit::Value *predicate = node->input(0);
  // startIfBlock(condition)
  return createStartIfBlock(graph, predicate);
}

// Forwards input 0 of `node` to createStartElseBlock().
torch::jit::Node *startElseBlockHandler(torch::jit::Graph *graph,
                                        torch::jit::Node *node) {
  torch::jit::Value *then_outputs = node->input(0);
  // startElseBlock(outputs_then)
  return createStartElseBlock(graph, then_outputs);
}

// Forwards inputs 0 and 1 of `node`, in that order, to createEndIfBlock().
torch::jit::Node *endIfBlockHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  torch::jit::Value *else_outputs = node->input(0);
  torch::jit::Value *predicate = node->input(1);
  // endIfBlock(outputs_else, condition)
  return createEndIfBlock(graph, else_outputs, predicate);
}

// Passes inputs 0 and 1 of `node`, in that order, to createCopyvarupdate().
torch::jit::Node *updateParamInplaceHandler(torch::jit::Graph *graph,
                                            torch::jit::Node *node) {
  torch::jit::Value *first = node->input(0);
  torch::jit::Value *second = node->input(1);
  // copyvarupdate(i0, i1)
  return createCopyvarupdate(graph, {first, second});
}

} // namespace

// Registers every handler defined above for its poptorch symbol.
//
// The constructor attribute makes this function run automatically before
// main(), with priority HANDLER_INIT_PRIORITY.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(symbols::poptorch::begin_ipu_block, beginIpuBlockHandler);
  registerHandler(symbols::poptorch::begin_multi_conv, beginMultiConvHandler);
  registerHandler(symbols::poptorch::call_cpu_op, callCpuOpHandler);
  registerHandler(symbols::poptorch::end_cpu_op, endCpuOpHandler);
  registerHandler(symbols::poptorch::end_for_loop, endForLoopHandler);
  registerHandler(symbols::poptorch::end_ipu_block, endIpuBlockHandler);
  registerHandler(symbols::poptorch::identity_loss, identityLossHandler);
  registerHandler(symbols::poptorch::internal_cast, internalCastHandler);
  registerHandler(symbols::poptorch::nop, nopHandler);
  registerHandler(symbols::poptorch::optimizer_group, optimizerGroupHandler);
  registerHandler(symbols::poptorch::pop_name_scope, popNameScopeHandler);
  registerHandler(symbols::poptorch::recomputation_checkpoint,
                  recomputationCheckpointHandler);
  registerHandler(symbols::poptorch::set_matmul_serialization,
                  setMatmulSerializationHandler);
  registerHandler(symbols::poptorch::start_for_loop, startForLoopHandler);
  registerHandler(symbols::poptorch::start_if_block, startIfBlockHandler);
  registerHandler(symbols::poptorch::start_else_block, startElseBlockHandler);
  registerHandler(symbols::poptorch::end_if_block, endIfBlockHandler);
  registerHandler(symbols::poptorch::update_param_inplace,
                  updateParamInplaceHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/PyGTorchScatterOps.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "ScatterReduction.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace {
// Maps the kind of a torch_scatter node (scatter_max, scatter_min or
// scatter_mul) to the matching ScatterReduction value, returned as an
// std::int32_t. Any other node kind is reported through ERROR.
std::int32_t getReductionMethod(const torch::jit::Node *node) {
  const auto as_int32 = [](ScatterReduction reduction) {
    return static_cast<std::int32_t>(reduction);
  };

  const auto node_kind = node->kind();
  if (node_kind == torch_scatter::scatter_max) {
    return as_int32(ScatterReduction::Max);
  }
  if (node_kind == torch_scatter::scatter_min) {
    return as_int32(ScatterReduction::Min);
  }
  if (node_kind == torch_scatter::scatter_mul) {
    return as_int32(ScatterReduction::Mul);
  }

  ERROR("Unsupported reduction for node: " << nodeToString(node));
}

// Handler shared by torch_scatter::scatter_max, scatter_min and scatter_mul.
//
// Always creates a scatterreduce node whose reduction is picked by
// getReductionMethod(node) and returns it. What happens to the original node
// depends on its outputs:
//  - one output: nothing else is done;
//  - two outputs, second one unused: the second output is erased;
//  - two outputs, second one used: an extra sub-graph computing the indices
//    is built, both outputs are redirected and the node is marked for
//    deletion.
torch::jit::Node *torchScatterHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  static constexpr bool enable_index_broadcast = true;

  // Signatures for scatter_max, scatter_min, scatter_mul:
  // (Tensor src, Tensor index, int dim, Tensor? out, int? dim_size)
  auto *src = node->input(0);
  auto *index = node->input(1);
  const auto src_type = src->type()->expect<c10::TensorType>();
  const auto axis = handleDimensionParam(node->input(2), src_type);

  auto *opt_out = node->input(3);

  // The optional `out` tensor is only passed on when it is not None.
  std::vector<torch::jit::Value *> args{src, index};
  if (!isNone(opt_out)) {
    args.push_back(opt_out);
  }

  // Default axis size: size of the first output along `axis`.
  auto shape = shapeFromTensor(node->output(0));
  auto axis_size = shape.at(axis);

  // An explicit dim_size overrides the size read from the output shape.
  auto *opt_axis_size = node->input(4);
  if (!isNone(opt_axis_size)) {
    axis_size = constantToInt(opt_axis_size->node());
  }

  auto *result =
      createScatterreduce(graph, args, axis_size, axis, enable_index_broadcast,
                          getReductionMethod(node));

  if (node->outputs().size() == 1) {
    return result;
  }

  // Both scatter_max and scatter_min return two outputs where the second one
  // is the index but most often this second output is simply ignored.
  if (!node->output(1)->hasUses()) {
    // the indices output is unused so is safe to delete
    node->eraseOutput(1);
    return result;
  }

  // Calculate the indices of the max/min
  const auto ishape = shapeFromTensor(src);
  // Shape that is 1 everywhere except along `axis`, where it matches src.
  std::vector<int64_t> index_range_shape(ishape.size(), 1);
  index_range_shape[axis] = ishape[axis];

  // Gather the reduced values back using `index` (through the registered
  // aten::gather handler) so they can be compared against src below.
  const auto gather_handler = getHandler(c10::aten::gather);
  result->output()->setType(src_type->withSizes(shape));
  auto *gather =
      createHandlerOperation(graph, gather_handler,
                             {result->output(), node->input(2), index})
          ->output();

  // true if the scatter chose this location in src, false if we didn't
  auto *mask = createEqual(graph, {gather, src})->output();
  // vals = 1, 2, ..., ishape[axis]
  // NOTE(review): std::iota is declared in <numeric>, which this file does not
  // include directly.
  std::vector<std::int64_t> vals(ishape[axis]);
  std::iota(std::begin(vals), std::end(vals), 1);
  auto *index_range =
      createConstantInt(graph, vals, index_range_shape)->output();
  auto *not_chosen =
      createConstantInt(graph, {ishape[axis] + 1}, {1})->output();
  // The 1-based index in src if this location was chosen, ishape[axis] + 1 if
  // it wasn't
  auto *index_of_result =
      createWhere(graph, {mask, index_range, not_chosen})->output();
  // Apply the same scattering to our index tensor as we did to the input tensor
  static constexpr std::int32_t min_reduce =
      static_cast<std::int32_t>(ScatterReduction::Min);
  auto *arg_scatter =
      createScatterreduce(graph, {index_of_result, index}, axis_size, axis,
                          enable_index_broadcast, min_reduce)
          ->output();
  // Now we've got a tensor of 1-based indices, with zeroes where no index
  // was scattered. We need to transform this to zero-based indices, with
  // ishape[axis] where no index was scattered.
  auto *one = createConstantInt(graph, {1}, {1})->output();
  arg_scatter = createSub(graph, {arg_scatter, one})->output();
  arg_scatter = createRemainder(graph, {arg_scatter, not_chosen})->output();

  // Redirect both outputs of the original node and schedule it for removal.
  replaceOutputUse(node->output(0), result->output());
  replaceOutputUse(node->output(1), arg_scatter);
  markNodeForDeletion(node);
  return result;
}

} // namespace

// Registers torchScatterHandler for the three torch_scatter symbols it
// supports.
//
// The constructor attribute makes this function run automatically before
// main(), with priority HANDLER_INIT_PRIORITY.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(torch_scatter::scatter_max, torchScatterHandler);
  registerHandler(torch_scatter::scatter_min, torchScatterHandler);
  registerHandler(torch_scatter::scatter_mul, torchScatterHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/PyGTorchSplineConvOps.cpp
================================================
// Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace {

// Handler for torch_spline_conv::spline_basis.
//
// Signature of spline_basis:
// (Tensor pseudo, Tensor kernelSize, Tensor isOpenSpline, int degree)
//
// The three tensors are forwarded unchanged and `degree` is read as a
// constant int; the node created by createSplinebasis() is returned.
torch::jit::Node *torchSplineBasisHandler(torch::jit::Graph *graph,
                                          torch::jit::Node *node) {
  torch::jit::Value *pseudo = node->input(0);
  torch::jit::Value *kernel_size = node->input(1);
  torch::jit::Value *is_open_spline = node->input(2);
  const std::vector<torch::jit::Value *> tensors{pseudo, kernel_size,
                                                 is_open_spline};

  const std::int32_t degree = constantToInt(node->input(3)->node());

  return createSplinebasis(graph, tensors, degree);
}

// Handler for torch_spline_conv::spline_weighting.
//
// Signature of spline_weighting:
// (Tensor input, Tensor weight, Tensor basis, Tensor weightIndex)
//
// All four tensors are forwarded unchanged, in order, to
// createSplineweighting(), whose node is returned.
torch::jit::Node *torchSplineWeightingHandler(torch::jit::Graph *graph,
                                              torch::jit::Node *node) {
  torch::jit::Value *input = node->input(0);
  torch::jit::Value *weight = node->input(1);
  torch::jit::Value *basis = node->input(2);
  torch::jit::Value *weight_index = node->input(3);
  const std::vector<torch::jit::Value *> tensors{input, weight, basis,
                                                 weight_index};

  return createSplineweighting(graph, tensors);
}

} // namespace

// Registers the two torch_spline_conv handlers defined above.
//
// The constructor attribute makes this function run automatically before
// main(), with priority HANDLER_INIT_PRIORITY.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(torch_spline_conv::spline_basis, torchSplineBasisHandler);
  registerHandler(torch_spline_conv::spline_weighting,
                  torchSplineWeightingHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/RNNOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace {

// Reshapes `tensor` so that its shape gains a leading dimension of size 1,
// and returns the output of the reshape.
torch::jit::Value *prependDimension(torch::jit::Graph *graph,
                                    torch::jit::Value *tensor) {
  const auto old_shape = shapeFromTensor(tensor);

  std::vector<std::int64_t> new_shape;
  new_shape.reserve(old_shape.size() + 1);
  new_shape.push_back(1);
  new_shape.insert(new_shape.end(), old_shape.begin(), old_shape.end());

  return createReshape(graph, tensor, new_shape)->output();
}

// Splits `tensor` along axis 1 into three slices of `slice_size` elements,
// optionally transposes each slice with permutation {0, 2, 1}, optionally
// swaps the first two slices, and concatenates the slices back along axis 1.
//
// \param transpose If true, every slice is transposed before concatenation.
// \param swap      If true, slices 0 and 1 trade places.
//
// \returns The output of the final concat.
torch::jit::Value *reshapeWeights(torch::jit::Graph *graph,
                                  torch::jit::Value *tensor,
                                  int64_t slice_size, bool transpose = false,
                                  bool swap = true) {
  constexpr unsigned num_slices = 3;

  torch::jit::Node *split = createSplit(graph, {tensor}, num_slices, 1,
                                        {slice_size, slice_size, slice_size});

  std::vector<torch::jit::Value *> slices;
  slices.reserve(num_slices);
  for (unsigned idx = 0; idx < num_slices; ++idx) {
    torch::jit::Value *piece = split->output(idx);
    if (!transpose) {
      slices.push_back(piece);
      continue;
    }
    slices.push_back(createTranspose(graph, {piece}, {0, 2, 1})->output());
  }

  if (swap) {
    std::swap(slices[0], slices[1]);
  }

  return createConcat(graph, slices, 1)->output();
}

// Handler for aten::gru.
//
// Inputs read from `node`: 0 = input, 1 = hx, 2 = list of parameters,
// 3 = bias flag, 4 = num_layers, 5 = dropout, 7 = bidirectional,
// 8 = batch_first.
//
// Only single-layer, unidirectional GRUs with dropout == 0 are accepted;
// anything else raises an error. Both outputs of the original node are
// redirected to the newly created nodes, the original node is marked for
// deletion and nullptr is returned.
torch::jit::Node *gruHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *input = node->input(0);
  auto *hx = node->input(1);
  auto params = node->input(2)->node()->inputs();

  bool bias = constantToBool(node->input(3)->node());
  // Keep the full 64-bit value so the "!= 1" check cannot be fooled by a
  // truncating conversion.
  std::int64_t num_layers = constantToLong(node->input(4)->node());
  float dropout = constantToFloat(node->input(5)->node());
  bool bidirectional = constantToBool(node->input(7)->node());
  bool batch_first = constantToBool(node->input(8)->node());

  ERROR_ON_MSG(num_layers != 1, "Only GRU with 1 layer supported");
  ERROR_ON_MSG(dropout != 0.0f, "GRU only supports dropout = 0.0");
  ERROR_ON_MSG(bidirectional, "bidirectional GRU not supported");

  auto *gate_weights = prependDimension(graph, params[0]);
  auto *recur_weights = prependDimension(graph, params[1]);

  // The shape is read before the optional transpose below, so with
  // batch_first the sequence length is dimension 1 and the batch size is
  // dimension 0 (same convention as rnnHandler).
  auto input_shape = shapeFromTensor(input);
  const std::int64_t seq_length =
      batch_first ? input_shape.at(1) : input_shape.at(0);
  const std::int64_t batch_size =
      batch_first ? input_shape.at(0) : input_shape.at(1);

  auto recur_shape = shapeFromTensor(recur_weights);
  int64_t hidden_size = recur_shape[2];

  gate_weights = reshapeWeights(graph, gate_weights, hidden_size);
  recur_weights = reshapeWeights(graph, recur_weights, hidden_size);
  torch::jit::Value *biases;

  if (bias) {
    auto *gate_biases = prependDimension(graph, params[2]);
    auto *recur_biases = prependDimension(graph, params[3]);

    gate_biases = reshapeWeights(graph, gate_biases, hidden_size);
    recur_biases = reshapeWeights(graph, recur_biases, hidden_size);

    biases = createConcat(graph, {gate_biases, recur_biases}, 1)->output();
  } else {
    // No biases supplied: use an all-zero tensor of shape [1, 6 * hidden_size].
    biases = createConstantFloatLike(graph, input, {0.}, {1l, 6l * hidden_size})
                 ->output();
  }

  // TODO(T54563)
  auto *seq_lens =
      createConstantInt(graph, {seq_length}, {batch_size})->output();

  if (batch_first) {
    input = createTranspose(graph, {input}, {1, 0, 2})->output();
  }

  auto *gru = createGru(
      graph, {input, gate_weights, recur_weights, biases, seq_lens, hx},
      hidden_size);

  auto *output = createSqueeze(graph, {gru->output(0)}, {1})->output();

  if (batch_first) {
    output = createTranspose(graph, {output}, {1, 0, 2})->output();
  }

  replaceOutputUse(node->output(0), output);
  replaceOutputUse(node->output(1), gru->output(1));

  markNodeForDeletion(node);
  return nullptr;
}

// Handler for aten::lstm.
//
// Only single-layer, unidirectional LSTMs with biases and dropout == 0 are
// accepted; anything else raises an error. When the original node has uses,
// its three outputs are redirected to the newly created nodes. The original
// node is always marked for deletion and nullptr is returned.
torch::jit::Node *lstmHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  // aten::lstm(Tensor self, Tensor[] hx, Tensor[] weights, bool bias,
  // int num_layers, float dropout, bool training, bool bidirectional,
  // bool batch_first) -> Tensor, (Tensor, Tensor)

  torch::jit::Value *input = node->input(0);

  torch::jit::ArrayRef<torch::jit::Value *> hidden_layers =
      node->input(1)->node()->inputs();
  torch::jit::ArrayRef<torch::jit::Value *> weights_list =
      node->input(2)->node()->inputs();

  bool use_bias = constantToBool(node->input(3)->node());
  ERROR_ON_MSG(!use_bias, "LSTM without biases not supported");

  std::int64_t num_layers = constantToLong(node->input(4)->node());
  ERROR_ON_MSG(num_layers != 1, "Only LSTM with 1 layer supported");

  float dropout = constantToFloat(node->input(5)->node());
  ERROR_ON_MSG(dropout != 0.0f, "LSTM only supports dropout = 0.0");

  bool bidirectional = constantToBool(node->input(7)->node());
  ERROR_ON_MSG(bidirectional, "bidirectional LSTM not supported");

  bool batch_first = constantToBool(node->input(8)->node());

  // An LSTM state is made of 4 values
  constexpr std::uint64_t state_size = 4;
  // First dimension of the first weight tensor; it must be divisible by 4.
  const std::int64_t num_weights =
      *weights_list[0]->type()->expect<c10::TensorType>()->sizes()[0];
  ERROR_ON(num_weights % state_size != 0);
  // Size of each of the 4 slices the weights and biases are split into.
  // NOTE(review): despite its name this looks like the hidden size rather
  // than a number of layers - confirm.
  const std::int64_t num_hidden_layers = num_weights / state_size;

  // def reshape_weights(onnx_weights):
  //    ws = builder.aiOnnx.split([w], 4, 1, [hidden_size] * 4)
  //    ws = [builder.aiOnnx.transpose([i], [0, 2, 1]) for i in ws]
  //    ws = builder.aiOnnx.concat([ws[i] for i in (2, 0, 3, 1)], 0)
  //    return ws
  //
  // Note: onnx weights are in IOFC order while Torch uses IFCO
  //
  // NOTE(review): the code below concatenates the slices in order
  // (1, 0, 2, 3), not (2, 0, 3, 1) as in the pseudo-code above - confirm which
  // one is the intended ordering.
  //
  // Biases don't need to be transposed
  auto reshape_tensor = [&](torch::jit::Value *values, bool areWeights) {
    const std::uint64_t num_dims_without_batch = areWeights ? 2 : 1;
    std::vector<std::int64_t> shape = shapeFromTensor(values);
    if (shape.size() == num_dims_without_batch) {
      // Add a batch dimension
      shape.insert(shape.begin(), 1);
      torch::jit::Node *reshape = createReshape(graph, values, shape);
      values = reshape->output();
    }
    torch::jit::Node *states =
        createSplit(graph, {values}, state_size, 1,
                    {num_hidden_layers, num_hidden_layers, num_hidden_layers,
                     num_hidden_layers});
    std::vector<torch::jit::Value *> slices;
    for (std::uint64_t i = 0; i < state_size; ++i) {
      if (areWeights) {
        // Weights also need to be transposed
        torch::jit::Node *transposed =
            createTranspose(graph, {states->output(i)}, {0, 2, 1});
        slices.push_back(transposed->output());
      } else {
        slices.push_back(states->output(i));
      }
    }
    torch::jit::Node *concat =
        createConcat(graph, {slices[1], slices[0], slices[2], slices[3]}, 0);
    return concat->output();
  };

  torch::jit::Node *concat_weights =
      createConcat(graph,
                   {reshape_tensor(weights_list[0], true),
                    reshape_tensor(weights_list[1], true)},
                   1);

  // The two bias tensors are reshaped the same way and then added together.
  torch::jit::Node *combine_biases =
      createAddNotInPlace(graph, reshape_tensor(weights_list[2], false),
                          reshape_tensor(weights_list[3], false));

  torch::jit::Node *concat_states =
      createConcat(graph, {hidden_layers[0], hidden_layers[1]}, 0);

  // Shape of the input as received, i.e. before the optional transpose below.
  std::vector<std::int64_t> input_shape = shapeFromTensor(input);
  // NOTE(review): despite its name, this is used below as the index of the
  // sequence dimension in `input_shape` (0 for SBF, 1 for BSF) - confirm.
  std::int64_t batch_dim = 0;
  // Transpose input BSF -> SBF
  if (batch_first) {
    torch::jit::Node *transpose = createTranspose(graph, {input}, {1, 0, 2});
    input = transpose->output();
    batch_dim = 1;
  }
  std::vector<torch::jit::Value *> args;
  args.push_back(input);
  args.push_back(concat_weights->output()); // input weights + output_weights
  args.push_back(combine_biases->output()); // biases
  args.push_back(concat_states->output());  // init_states

  torch::jit::Node *lstm = createLstm(graph, args, 1);

  // Keep the last slice from Y `[seq_length, num_directions, batch_size,
  // hidden_size]
  torch::jit::Node *y_h = createSlice(graph, {lstm->output(0)}, {INT_MAX},
                                      {input_shape[batch_dim] - 1}, {0});

  torch::jit::Value *output = lstm->output(0);
  // Transpose output SBF -> BSF
  if (batch_first) {
    torch::jit::Node *transpose = createTranspose(graph, {output}, {1, 0, 2});
    output = transpose->output();
  }

  // The shape of y_c returned by PopART has shape (batch_size, hidden_size).
  // Torch's c_n output has shape
  // (num_directions * num_layers, batch_size, hidden_size), but since we don't
  // support bidirectional or > 1 layers, this dimension is always 1 so we just
  // need to prepend a single dim
  auto *y_c = createUnsqueeze(graph, {lstm->output(1)}, {0});

  ERROR_ON(node->outputs().size() != 3);
  // Outputs are only redirected when the original node is actually used.
  if (node->hasUses()) {
    replaceOutputUse(node->output(0), output);
    replaceOutputUse(node->output(1), y_h->output());
    replaceOutputUse(node->output(2), y_c->output());
  }

  markNodeForDeletion(node);
  return nullptr;
}

// Shared implementation of the aten::rnn_tanh and aten::rnn_relu handlers.
//
// \param nonlinearity Name of the activation passed on to createRnn().
//
// Only single-layer, unidirectional RNNs with biases and dropout == 0 are
// accepted; anything else raises an error. Both outputs of the original node
// are redirected to the newly created nodes, the original node is marked for
// deletion and nullptr is returned.
torch::jit::Node *rnnHandler(torch::jit::Graph *graph, torch::jit::Node *node,
                             const std::string &nonlinearity) {
  // rnn_{tanh/relu}.input(Tensor input, Tensor hx, Tensor[] params,
  // bool has_biases, int num_layers, float dropout, bool train,
  // bool bidirectional, bool batch_first) -> (Tensor, Tensor)
  torch::jit::Value *input = node->input(0);
  torch::jit::Value *hx = node->input(1);

  // Validate the configuration before touching `params`: the bias entries
  // params[2] and params[3] are only read once has_biases has been checked,
  // so an unsupported bias-less RNN is reported through the error below
  // instead of indexing the parameter list first.
  bool has_biases = constantToBool(node->input(3)->node());
  ERROR_ON_MSG(!has_biases, "RNN without biases is not supported");

  int num_layers = constantToInt(node->input(4)->node());
  ERROR_ON_MSG(num_layers != 1, "Only RNN with 1 layer is supported");

  float dropout = constantToFloat(node->input(5)->node());
  ERROR_ON_MSG(dropout != 0.0f, "RNN only supports dropout = 0.0");

  bool bidirectional = constantToBool(node->input(7)->node());
  ERROR_ON_MSG(bidirectional, "Bidirectional RNN is not supported");

  bool batch_first = constantToBool(node->input(8)->node());

  torch::jit::ArrayRef<torch::jit::Value *> params =
      node->input(2)->node()->inputs();
  torch::jit::Value *w_ih = params[0]; // input-hidden weights
  torch::jit::Value *w_hh = params[1]; // hidden-hidden weights
  torch::jit::Value *b_ih = params[2]; // input-hidden bias
  torch::jit::Value *b_hh = params[3]; // hidden-hidden bias

  auto input_shape = shapeFromTensor(input);
  int64_t sequence_length;
  int64_t batch_size;

  if (batch_first) {
    // N, L, H_in -> L, N, H_in
    input = createTranspose(graph, {input}, {1, 0, 2})->output();
    sequence_length = input_shape.at(1);
    batch_size = input_shape.at(0);
  } else {
    sequence_length = input_shape.at(0);
    batch_size = input_shape.at(1);
  }

  auto *b = createConcat(graph, {b_ih, b_hh}, 0)->output();
  // Fix concat result shape so that we can use prependDimension().
  auto hidden_size = shapeFromTensor(b_ih).front();
  b->setType(
      b_ih->type()->expect<c10::TensorType>()->withSizes({2 * hidden_size}));

  // TODO(T54563)
  auto *sequence_lens =
      createConstantInt(graph, {sequence_length}, {batch_size})->output();

  std::vector<torch::jit::Value *> args = {
      // [seq_length, batch_size, input_size]
      input,
      // [num_directions, hidden_size, input_size]
      prependDimension(graph, w_ih),
      // [num_directions, hidden_size, hidden_size]
      prependDimension(graph, w_hh),
      // [num_directions, 2*hidden_size]
      prependDimension(graph, b),
      // [batch_size]
      sequence_lens,
      // [num_directions, batch_size, hidden_size]
      hx,
  };

  auto *rnn = createRnn(graph, args, {nonlinearity});
  auto *output_0 = createReshape(graph, rnn->output(0),
                                 {sequence_length, batch_size, hidden_size})
                       ->output();

  if (batch_first) {
    // L, N, H_out -> N, L, H_out
    output_0 = createTranspose(graph, {output_0}, {1, 0, 2})->output();
  }

  replaceOutputUse(node->output(0), output_0);
  replaceOutputUse(node->output(1), rnn->output(1));
  markNodeForDeletion(node);

  return nullptr;
}

// Handler for aten::rnn_tanh: delegates to rnnHandler() with the "Tanh"
// nonlinearity.
torch::jit::Node *rnnTanhHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  const std::string activation{"Tanh"};
  return rnnHandler(graph, node, activation);
}

// Handler for aten::rnn_relu: delegates to rnnHandler() with the "Relu"
// nonlinearity.
torch::jit::Node *rnnReluHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  const std::string activation{"Relu"};
  return rnnHandler(graph, node, activation);
}

} // namespace

// Registers the GRU, LSTM and RNN (tanh / relu) handlers defined above.
//
// The constructor attribute makes this function run automatically before
// main(), with priority HANDLER_INIT_PRIORITY.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::gru, gruHandler);
  registerHandler(c10::aten::lstm, lstmHandler);
  registerHandler(c10::aten::rnn_tanh, rnnTanhHandler);
  registerHandler(c10::aten::rnn_relu, rnnReluHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/RandomSamplingOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <limits>

#include "PopartCanonicalizationUtils.hpp"

#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace {

torch::jit::Node *normalHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  // aten::normal has four overloads, distinguished by whether mean / std are
  // scalars or tensors:
  //   1) aten::normal(float mean, float std, int[] size, Generator?,
  //      int? dtype, int? layout, Device? device, bool? pin_memory) -> Tensor
  //   2) aten::normal(Tensor mean, float std, Tensor? out)
  //   3) aten::normal(float mean, Tensor std, Tensor? out)
  //   4) aten::normal(Tensor mean, Tensor std, Tensor? out)
  torch::jit::Value *mean = node->input(0);
  torch::jit::Value *std_dev = node->input(1);
  std::vector<int64_t> shape = shapeFromTensor(node->output());

  const bool mean_is_scalar = isConstantScalar(mean);
  const bool std_is_scalar = isConstantScalar(std_dev);

  // Overload 1: both parameters are compile-time constants, so they can be
  // handed directly to the random-normal op.
  if (mean_is_scalar && std_is_scalar) {
    float mean_constant = constantToFloat(mean->node());
    float std_constant = constantToFloat(std_dev->node());

    return createRandomNormal(graph, {mean, std_dev}, shape, mean_constant,
                              std_constant);
  }

  // Overloads 2-4: at least one parameter is a tensor. Sample from the
  // standard normal distribution and apply the affine transform
  //
  //   normal(mean=0, std=1) * std + mean
  //
  // relying on broadcasting to expand a scalar operand to the output shape.
  // When exactly one operand is a scalar and the element types disagree, the
  // scalar is cast to the tensor operand's type.
  const auto mean_type = getNodeScalarType(mean);
  const auto std_type = getNodeScalarType(std_dev);
  if (mean_type != std_type) {
    if (mean_is_scalar && !std_is_scalar) {
      mean = createCast(graph, mean, std_type)->output();
    } else if (!mean_is_scalar && std_is_scalar) {
      std_dev = createCast(graph, std_dev, mean_type)->output();
    }
  }

  torch::jit::Node *standard_normal =
      createRandomNormal(graph, {mean, std_dev}, shape, 0.0f, 1.0f);
  torch::jit::Node *scaled =
      poptorch::createMul(graph, {standard_normal->output(), std_dev});
  return poptorch::createAdd(graph, {scaled->output(), mean});
}

torch::jit::Node *bernoulliHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  // aten::bernoulli(Tensor self, float? probability)
  //
  // The probability is the second input when it is provided; when it is None
  // the input tensor itself holds the probabilities.
  torch::jit::Value *self = node->input(0);
  torch::jit::Value *explicit_prob = node->input(1);
  torch::jit::Value *prob = isNone(explicit_prob) ? self : explicit_prob;

  std::vector<int64_t> shape = shapeFromTensor(node->output());
  c10::ScalarType dtype = getNodeScalarType(self);

  // Draw uniform samples (bounds 1.0 and 0.0), compare them against the
  // probability and cast the boolean comparison result back to the input's
  // element type.
  torch::jit::Node *uniform =
      createRandomUniform(graph, nullptr, shape, 1.0, 0.0, dtype);
  torch::jit::Node *below_prob =
      createLess(graph, {uniform->output(), prob});
  return createCast(graph, below_prob->output(), dtype);
}

torch::jit::Node *exponentialHandler(torch::jit::Graph *graph,
                                     torch::jit::Node *node) {
  // aten::exponential_(Tensor self, double lambda)
  //
  // Builds -log(x) / lambda where x is a uniform sample, then casts the
  // result to the element type of self.
  torch::jit::Value *self = node->input(0);
  torch::jit::Value *lambda = node->input(1);

  std::vector<int64_t> shape = shapeFromTensor(node->output());
  c10::ScalarType dtype = getNodeScalarType(self);
  // The uniform samples are always generated in float.
  c10::ScalarType dtype_rng = c10::ScalarType::Float;

  // Use the smallest positive normalised float as the lower bound to prevent
  // the possibility of log(0) with minimal bias on the sampling distribution.
  const float low = std::numeric_limits<float>::min();
  torch::jit::Node *uniform =
      createRandomUniform(graph, nullptr, shape, 1.0, low, dtype_rng);

  torch::jit::Node *log_x = createLog(graph, {uniform->output()});
  torch::jit::Node *neg_log_x = createNeg(graph, {log_x->output()});
  torch::jit::Node *scaled = createDiv(graph, {neg_log_x->output(), lambda});
  return createCast(graph, scaled->output(), dtype);
}

} // namespace

// Static constructor (priority HANDLER_INIT_PRIORITY) that associates the
// random-sampling aten symbols with their canonicalisation handlers.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::normal, normalHandler);
  registerHandler(c10::aten::bernoulli, bernoulliHandler);
  registerHandler(c10::aten::exponential_, exponentialHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/ReduceOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <limits>

#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch_logging/Error.hpp"

#include "../PoptorchSymbols.hpp"
#include "poptorch/Utils.hpp"

#include <ATen/ATen.h>

namespace poptorch {
namespace {

torch::jit::Node *reduceHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  // Shared handler for prod / mean / sum / logsumexp / any / all.
  //
  // Three overload families are handled:
  //   1) aten::mean(Tensor self, int[] dim, int keepdim, Tensor? out))
  //   2) aten::mean(Tensor self, int? dtype))
  //   3) aten::all(Tensor self)              (boolean reductions)
  const torch::jit::Symbol kind = node->kind();
  torch::jit::Value *input = node->input(0);

  // PyTorch allows sum and prod on bool tensors, so reduce over an Int copy.
  const auto tensor_type = input->type()->expect<c10::TensorType>();
  if (tensor_type->scalarType() == at::ScalarType::Bool) {
    input = createCast(graph, input, c10::ScalarType::Int)->output();
  }

  std::vector<std::int64_t> axes;
  std::int64_t keepdim = 0;

  // Overloads 2 and 3 are recognised by their input count: one input for
  // any/all, two for the other reductions.
  const bool is_bool_reduction =
      kind == c10::aten::any || kind == c10::aten::all;
  const size_t full_reduction_arity = is_bool_reduction ? 1 : 2;
  bool flatten = node->inputs().size() == full_reduction_arity;

  if (!flatten) {
    // Overload 1: the dimension is either a single int constant or a list.
    torch::jit::Node *dim_node = node->input(1)->node();
    if (dim_node->kind() == symbols::poptorch::tensor_constant) {
      axes.push_back(constantToLong(dim_node));
    } else {
      axes = constantToLongVec(dim_node);
      // An empty list means no dimension was specified: reduce everything.
      flatten = axes.empty();
    }
    keepdim = constantToLong(node->input(2)->node());
  }

  if (flatten) {
    // Reshape to 1D rather than using "Flatten", which produces a 2D output.
    const auto numel = tensor_type->numel();
    ERROR_ON(!numel);
    input = createReshape(graph, input, {static_cast<int64_t>(*numel)})
                ->output();
    axes = {0};
    keepdim = 0;
  }

  // Emit the reduction matching the original op.
  if (kind == c10::aten::prod) {
    return createReduceprod(graph, {input}, axes, keepdim);
  }
  if (kind == c10::aten::mean) {
    return createReducemean(graph, {input}, axes, keepdim);
  }
  if (kind == c10::aten::sum) {
    return createReducesum(graph, {input}, axes, keepdim);
  }
  if (kind == c10::aten::logsumexp) {
    return createReducelogsumexp(graph, {input}, axes, keepdim);
  }
  if (is_bool_reduction) {
    // all -> min(abs(x)) and any -> max(abs(x)), each cast to bool.
    auto *magnitude = createAbs(graph, {input})->output();
    torch::jit::Node *extremum =
        (kind == c10::aten::all)
            ? createReducemin(graph, {magnitude}, axes, keepdim)
            : createReducemax(graph, {magnitude}, axes, keepdim);
    return createCast(graph, extremum->output(), at::ScalarType::Bool);
  }
  ERROR("Popart Canonicalisation: UNREACHABLE reached in reductions.");
}

torch::jit::Node *reduceMedianHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  auto *input = node->input(0);
  std::vector<std::int64_t> axes;
  std::int64_t keepdim = 0;

  if (node->inputs().size() == 1) {
    // aten::median(Tensor self) -> Tensor
    // Reduce over the axes supplied by reduceHelperDimensionCreator and drop
    // the second output, since this overload only returns one tensor.
    axes = reduceHelperDimensionCreator(input);
    torch::jit::Node *values_only =
        createReducemedian(graph, {input}, axes, keepdim);
    values_only->eraseOutput(1);
    return values_only;
  }

  // aten::median(Tensor self, int dim, bool keepdim)
  //             -> (Tensor values, Tensor indices)
  axes.push_back(constantToLong(node->input(1)->node()));
  keepdim = constantToLong(node->input(2)->node());
  return createReducemedian(graph, {input}, axes, keepdim);
}

torch::jit::Node *aMinMaxHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  // Registered for aten::amax and aten::amin:
  //   (Tensor self, int[] dim, int keepdim)
  torch::jit::Value *source = node->input(0);
  auto axes = constantToLongVec(node->input(1)->node());
  const auto keepdim = constantToLong(node->input(2)->node());

  // No axes given: flatten the input and reduce along axis 1 of the result.
  if (axes.empty()) {
    source = createFlatten(graph, {source}, 0)->output();
    axes = {1};
  }

  const bool is_amax = node->kind() == c10::aten::amax;
  return is_amax ? createReducemax(graph, {source}, axes, keepdim)
                 : createReducemin(graph, {source}, axes, keepdim);
}

torch::jit::Node *argMinMaxHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  //  aten::argmin(Tensor in, int? dim, int keep_dims) -> Tensor
  //  aten::argmax(Tensor in, int? dim, int keep_dims) -> Tensor
  // dim (int) - the dimension to reduce. If None, the argmax/argmin of the
  //             flattened input is returned.
  torch::jit::Value *input = node->input(0);
  torch::jit::Node *dim_node = node->input(1)->node();
  const std::int64_t keep_dim = constantToLong(node->input(2)->node());

  // Axis 1 is the default used together with the flattened input below.
  std::int64_t axis = 1;
  if (dim_node->kind() == symbols::poptorch::tensor_constant) {
    // An explicit dimension was supplied.
    axis = constantToLong(dim_node);
  } else {
    // dim is None: operate on the flattened input instead.
    input = createFlatten(graph, {input}, 0)->output();
  }

  // Create the actual argmax/argmin.
  const bool is_argmax = node->kind() == c10::aten::argmax;
  torch::jit::Node *indices = is_argmax
                                  ? createArgmax(graph, {input}, axis, keep_dim)
                                  : createArgmin(graph, {input}, axis, keep_dim);

  // Note: these ops return int64, so we need to cast them to int
  return createCast(graph, indices->output(), c10::ScalarType::Int);
}

torch::jit::Node *argsortHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  // Implements argsort as a top-k over the whole sorted dimension: input 1 is
  // the dimension and input 2 the "descending" flag.
  auto *self = node->input(0);
  auto self_type = self->type()->expect<c10::TensorType>();
  const std::vector<std::int64_t> shape = shapeFromTensor(node->input(0));
  const auto dim = handleDimensionParam(node->input(1), self_type);

  // k equals the full length of the sorted dimension.
  auto *k = createConstantLong(graph, {shape[dim]}, {1})->output();
  auto *topk =
      createTopk(graph, {self, k}, dim, true /*largest*/, true /*sorted*/);

  // Onnx will output the indices long, so use a cast to revert the type.
  // PopART will remove it as an identity when topk resolves to output an int.
  auto *indices =
      createCast(graph, topk->output(1), c10::ScalarType::Int)->output();

  // Top-k was requested with largest=true, so the indices can be returned
  // unchanged for a descending sort.
  if (constantToBool(node->input(2)->node())) {
    return indices->node();
  }

  // Ascending sort: reverse the indices along the sorted dimension.
  const std::vector<int64_t> reversed_dims{dim};
  return createReverse(graph, {indices}, reversed_dims);
}

torch::jit::Node *minMaxWithIndicesHandler(torch::jit::Graph *graph,
                                           torch::jit::Node *node) {
  // Handles the min/max overload that produces both values and indices along
  // one dimension. Both outputs of the original node are rewired and the
  // node is marked for deletion, hence the nullptr return.
  auto *self = node->input(0);
  auto self_type = self->type()->expect<c10::TensorType>();
  const std::vector<std::int64_t> shape = shapeFromTensor(self);

  torch::jit::Value *values = nullptr;
  torch::jit::Value *indices = nullptr;

  if (!shape.empty()) {
    const auto dim = handleDimensionParam(node->input(1), self_type);
    const auto keepdim = constantToBool(node->input(2)->node());
    // min is computed as the negated max of the negated input.
    const bool is_min = node->kind() == c10::aten::min;

    if (is_min) {
      self = createNeg(graph, {self})->output();
    }

    // Top-1 along dim gives the extremum and its position.
    auto *k = tensorToConstant(graph, at::tensor(1L))->output();
    auto *top1 =
        createTopk(graph, {self, k}, dim, true /*largest*/, true /*sorted*/);
    values = top1->output(0);
    // TopK returns UINT32 indices, but torch doesn't have unsigned
    // 32 bit integer tensor types so we need to cast back to INT32
    indices =
        createCast(graph, top1->output(1), c10::ScalarType::Int)->output();

    if (is_min) {
      values = createNeg(graph, {values})->output();
    }

    if (!keepdim) {
      // Squeeze out the singleton-dim left by topk
      values = createSqueeze(graph, {values}, {dim})->output();
      indices = createSqueeze(graph, {indices}, {dim})->output();
    }
  } else {
    // Zero-dimensional input: the value is the input itself, at index 0.
    values = createIdentity(graph, {self})->output();
    indices = createConstantInt(graph, {0}, {})->output();
  }

  replaceOutputUse(node->output(0), values);
  replaceOutputUse(node->output(1), indices);

  markNodeForDeletion(node);
  return nullptr;
}

// Dispatches a min/max node on its number of inputs:
//   1 input  -> reduction over the axes given by
//               reduceHelperDimensionCreator (reduceFunc, keepdim 0),
//   2 inputs -> binary op on the two inputs (extremaFunc),
//   other    -> the values-and-indices form (minMaxWithIndicesHandler).
template <typename ReduceFunc, typename ExtremaFunc>
torch::jit::Node *minMaxHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node, ReduceFunc &&reduceFunc,
                                ExtremaFunc &&extremaFunc) {
  switch (node->inputs().size()) {
  case 1: {
    auto *self = node->input(0);
    auto axes = reduceHelperDimensionCreator(self);
    return reduceFunc(graph, {self}, axes, 0);
  }
  case 2: {
    auto *lhs = node->input(0);
    auto *rhs = node->input(1);
    return extremaFunc(graph, {lhs, rhs});
  }
  default:
    return minMaxWithIndicesHandler(graph, node);
  }
}

// Handler registered for aten::min and aten::minimum: dispatches through
// minMaxHandler with createReducemin as the reduction builder and createMin
// as the two-input builder.
torch::jit::Node *minHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  return minMaxHandler(graph, node, createReducemin, createMin);
}

// Handler registered for aten::max and aten::maximum: dispatches through
// minMaxHandler with createReducemax as the reduction builder and createMax
// as the two-input builder.
torch::jit::Node *maxHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  return minMaxHandler(graph, node, createReducemax, createMax);
}

torch::jit::Node *tensorNormHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  // Registered for aten::norm and aten::linalg_vector_norm.
  //
  // Two-input forms (reduce over everything):
  //   aten::norm(Tensor in, int p) -> Tensor
  //   aten::norm(Tensor in, float p) -> Tensor
  //
  // Forms carrying dim / keepdim (and optionally dtype / out):
  //   aten::norm(Tensor in, int p, int[] dim, int keepdim) -> Tensor
  //   aten::norm(Tensor in, float p, int[] dim, int keepdim) -> Tensor
  //   aten::norm(Tensor self, Scalar? p, int[1] dim, bool keepdim, *,
  //              ScalarType dtype) -> Tensor
  //   aten::norm(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *,
  //              ScalarType dtype) -> Tensor
  //   aten::norm(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *,
  //              Tensor(a!) out) -> Tensor(a!)
  //   aten::norm(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False, *,
  //              Tensor(a!) out) -> Tensor(a!)
  //   aten::norm(Tensor self, Scalar? p, int[1] dim, bool keepdim, *,
  //              ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
  //   aten::norm(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *,
  //              ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
  //
  //   aten::linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None,
  //                     bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
  //   aten::linalg_norm(Tensor self, str ord, int[1]? dim=None,
  //                     bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
  //   aten::linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None,
  //                     bool keepdim=False, *, ScalarType? dtype=None,
  //                     Tensor(a!) out) -> Tensor(a!)
  //   aten::linalg_norm(Tensor self, str ord, int[1]? dim=None,
  //                     bool keepdim=False, *, ScalarType? dtype=None,
  //                     Tensor(a!) out) -> Tensor(a!)

  torch::jit::Value *input = node->input(0);
  torch::jit::Value *p_val = node->input(1);

  std::vector<std::int64_t> axes{};
  std::int64_t keepdim = 0;

  if (node->inputs().size() == 2) {
    // No dim given: flatten and reduce along axis 1 of the flattened tensor.
    torch::jit::Node *flatten = createFlatten(graph, {input}, 0);
    input = flatten->output();
    axes = {1};
  } else {
    auto *axes_val = node->input(2);
    if (!isNone(axes_val)) {
      axes = constantToLongVec(node->input(2)->node());
    }
    keepdim = constantToLong(node->input(3)->node());
    const auto shape = shapeFromTensor(input);
    const auto rank = static_cast<std::int64_t>(shape.size());
    // Empty axes array means reduce over all axes in PyTorch, but means
    // do nothing in PopART
    if (axes.empty()) {
      axes.resize(shape.size());
      std::iota(std::begin(axes), std::end(axes), 0);
    }
    // handle optional dtype
    if (node->inputs().size() >= 5) {
      auto *input_4 = node->input(4);
      const bool is_scalar_type =
          input_4->type()->kind() == c10::TypeKind::ScalarTypeType;
      if (is_scalar_type) {
        if (auto *opt_dtype = input_4; opt_dtype->mustNotBeNone()) {
          const auto &opt_dtype_tensors =
              opt_dtype->node()->ts(c10::attr::value);
          ERROR_ON(opt_dtype_tensors.empty());
          if (!opt_dtype_tensors.front().is_floating_point()) {
            input = createCast(graph, input,
                               constantToScalarType(opt_dtype->node()))
                        ->output();
          }
        }
      }
    }
    // If we're reducing over singleton dims and keeping them, the
    // behaviour of PopART reduce ops is to do nothing, but PyTorch will
    // still take the absolute value of the tensor, so we need to
    // do the same.
    //
    // The axes come straight from the node and may be negative (counting
    // from the last dimension), so wrap them before indexing into shape;
    // indexing a std::vector with a negative value is out of bounds.
    if ((keepdim != 0) &&
        std::all_of(axes.begin(), axes.end(), [&](std::int64_t axis) {
          const std::int64_t wrapped = axis < 0 ? axis + rank : axis;
          return shape[wrapped] == 1;
        })) {
      return createAbs(graph, {input});
    }
  }

  constexpr float pos_inf = std::numeric_limits<float>::infinity();
  constexpr float neg_inf = -std::numeric_limits<float>::infinity();
  const float p = constantToFloat(node->input(1)->node());

  // p == 1 and p == 2 map directly onto dedicated reductions.
  if (p == 1.0) {
    return createReducel1(graph, {input}, axes, keepdim);
  }
  if (p == 2.0) {
    return createReducel2(graph, {input}, axes, keepdim);
  }
  if (p == pos_inf || p == neg_inf) {
    // max/min(abs(x))
    torch::jit::Node *abs = createAbs(graph, {input});
    input = abs->output();

    if (p == pos_inf) {
      return createReducemax(graph, {input}, axes, keepdim);
    }
    return createReducemin(graph, {input}, axes, keepdim);
  }

  // General case: sum(abs(x)**p)**(1./p)
  torch::jit::Node *abs = createAbs(graph, {input});

  torch::jit::Node *pow = createPow(graph, {abs->output(), p_val});
  torch::jit::Node *sum =
      createReducesum(graph, {pow->output()}, axes, keepdim);

  at::ScalarType const p_type = getNodeScalarType(p_val);

  if (p_type == c10::ScalarType::Int || p_type == c10::ScalarType::Long) {
    // Cast int to float before reciprocal
    torch::jit::Node *to_float = createCast(graph, p_val, c10::kFloat);
    p_val = to_float->output();
  }

  torch::jit::Node *one_over_p = createReciprocal(graph, {p_val});
  return createPow(graph, {sum->output(), one_over_p->output()});
}

torch::jit::Node *frobeniusnormHandler(torch::jit::Graph *graph,
                                       torch::jit::Node *node) {
  // Accepts either one operand (the tensor) or three operands
  // (tensor, dimension list, keepdim); any other count is an error.
  const auto num_inputs = node->inputs().size();

  if (num_inputs == 1) {
    // L2 reduction over the axes given by reduceHelperDimensionCreator,
    // without keeping the reduced dimensions.
    auto *self = node->input(0);
    auto all_axes = reduceHelperDimensionCreator(self);
    return createReducel2(graph, {self}, all_axes, 0);
  }

  if (num_inputs == 3) {
    auto *self = node->input(0);
    const auto requested_dims = constantToLongVec(node->input(1)->node());
    const auto axes = reduceHelperDimensionCreator(self, requested_dims);
    const auto keepdim = constantToLong(node->input(2)->node());
    const auto shape = shapeFromTensor(self);

    // If we're reducing over singleton dims and keeping them, the
    // behaviour of PopART reduce ops is to do nothing, but PyTorch will
    // still take the absolute value of the tensor, so we need to
    // do the same
    const bool only_singleton_axes =
        std::all_of(axes.begin(), axes.end(),
                    [&](std::int64_t axis) { return shape[axis] == 1; });
    if ((keepdim != 0) && only_singleton_axes) {
      return createAbs(graph, {self});
    }
    return createReducel2(graph, {self}, axes, keepdim);
  }

  ERROR("Incorrect number of arguments for operator "
        << "c10::aten::frobenius_norm. "
        << "Expecting 1 or 3 operands, "
        << "got " << node->inputs().size() << " operand(s).");
  return nullptr;
}

// count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor
//
// Converts the input to a bool mask, maps it to 1 / 0 with a where, and sums
// the result over the requested dimensions (all of them when dim is empty).
torch::jit::Node *countNonzeroHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  auto *self = node->input(0);

  auto reduce_dims = constantToLongVec(node->input(1)->node());
  if (reduce_dims.empty()) {
    // No dims given: enumerate every axis of the input, 0 .. rank-1.
    reduce_dims.resize(shapeFromTensor(self).size());
    std::iota(reduce_dims.begin(), reduce_dims.end(), 0);
  }

  // Only insert a cast when the input is not already boolean.
  auto *mask = (getNodeScalarType(self) != c10::ScalarType::Bool)
                   ? createCast(graph, self, c10::ScalarType::Bool)->output()
                   : self;

  auto *one = wrapInConstant1D(graph, 1);
  auto *zero = wrapInConstant1D(graph, 0);
  auto *ones_where_nonzero = createWhere(graph, {mask, one, zero});

  return createReducesum(graph, {ones_where_nonzero->output()}, reduce_dims,
                         /*keepdims=*/0);
}

torch::jit::Node *nanSumHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  // Pipeline: isNan -> where (NaN replaced by 0) -> sum -> optional cast.
  // Inputs: 0 = tensor, 1 = dim list or None, 2 = keepdim, 3 = dtype or None.
  torch::jit::Value *self = node->input(0);

  // Replace every NaN element by zero so it does not contribute to the sum.
  auto *nan_mask = createIsnan(graph, {self});
  auto *zeros = createConstantFloatLike(graph, self, {0},
                                        shapeFromTensor(self));
  auto *nan_free =
      createWhere(graph, {nan_mask->output(0), zeros->output(0), self});

  // Work out which dimensions to reduce.
  torch::jit::Value *dim = node->input(1);
  torch::jit::Node *dim_node = dim->node();
  std::vector<int64_t> dims;
  if (dim_node->kind() == c10::prim::ListConstruct) {
    dims = constantToLongVec(dim_node);
  } else if (isNone(dim)) {
    // `dim` was not provided, so perform the sum over all the dimensions.
    dims.resize(shapeFromTensor(self).size());
    std::iota(dims.begin(), dims.end(), 0);
  } else {
    ERROR("Popart Canonicalisation: UNREACHABLE reached in nansum handler.");
  }

  const auto keepdim = constantToLong(node->input(2)->node());
  auto *sum = createReducesum(graph, {nan_free->output(0)}, dims, keepdim);

  // Without an explicit dtype the sum is returned as is.
  auto *dtype = node->input(3);
  if (isNone(dtype)) {
    return sum;
  }
  const auto target_type = constantToScalarType(dtype->node());
  return createCast(graph, sum->output(0), target_type);
}

} // namespace

// Static constructor (priority HANDLER_INIT_PRIORITY) that associates the
// reduction-style aten symbols with their canonicalisation handlers. Several
// symbols share one handler, which then branches on the node kind or on the
// number of inputs.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::amax, aMinMaxHandler);
  registerHandler(c10::aten::amin, aMinMaxHandler);
  registerHandler(c10::aten::argmax, argMinMaxHandler);
  registerHandler(c10::aten::argmin, argMinMaxHandler);
  registerHandler(c10::aten::argsort, argsortHandler);
  registerHandler(c10::aten::prod, reduceHandler);
  registerHandler(c10::aten::mean, reduceHandler);
  registerHandler(c10::aten::median, reduceMedianHandler);
  registerHandler(c10::aten::sum, reduceHandler);
  registerHandler(c10::aten::logsumexp, reduceHandler);
  registerHandler(c10::aten::norm, tensorNormHandler);
  registerHandler(c10::aten::linalg_vector_norm, tensorNormHandler);
  registerHandler(c10::aten::frobenius_norm, frobeniusnormHandler);
  registerHandler(c10::aten::min, minHandler);
  registerHandler(c10::aten::minimum, minHandler);
  registerHandler(c10::aten::max, maxHandler);
  registerHandler(c10::aten::maximum, maxHandler);
  registerHandler(c10::aten::any, reduceHandler);
  registerHandler(c10::aten::all, reduceHandler);
  registerHandler(c10::aten::count_nonzero, countNonzeroHandler);
  registerHandler(c10::aten::nansum, nanSumHandler);
}
} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/ReshapeOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/ir.h>

#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "ScatterReduction.hpp"

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "../PoptorchSymbols.hpp"

#include <ATen/ATen.h>

namespace poptorch {
namespace {

torch::jit::Node *expandHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  // aten::broadcast_to(Tensor(a) self, int[] size) -> Tensor(a)
  // aten::expand(Tensor self, int[] size)  -> Tensor
  // NB signature in source has an, apparently unused boolean:
  // aten::expand(Tensor(a) self, int[] size, *, bool implicit=False) ->
  // Tensor(a)
  //
  // Emits a reshape when the element count is unchanged and an expand
  // otherwise.
  torch::jit::Node *new_node;

  // Keep the explicit check that the first input is a tensor.
  (void)node->input(0)->type()->expect<c10::TensorType>();

  // Old shape
  const std::vector<std::int64_t> old_shape = shapeFromTensor(node->input(0));

  // Count the elems in the old shape. The initial value is an int64 so the
  // product is accumulated in 64 bits: std::accumulate uses the type of the
  // initial value as its accumulator type.
  const std::int64_t old_elem_count =
      std::accumulate(old_shape.begin(), old_shape.end(), std::int64_t{1},
                      std::multiplies<std::int64_t>());

  // Get the target size for the expand.
  std::vector<std::int64_t> new_shape =
      constantToLongVec(node->input(1)->node());

  ERROR_ON_MSG(new_shape.size() < old_shape.size(),
               "The desired shape passed to expand should have at least as "
               "many dimensions as the input tensor (required at least "
                   << old_shape.size() << ", got " << new_shape.size() << ")");

  // A new shape element of -1 means that dimension should not change
  for (size_t i = 0; i < old_shape.size(); i++) {
    // If you give more dimensions in the desired shape than there are in the
    // input tensor, they'll get *pre*pended -- so to turn the -1s into lengths
    // from the input, work backwards.
    const auto input_idx = old_shape.size() - (i + 1);
    const auto input_len = old_shape[input_idx];
    const auto desired_idx = new_shape.size() - (i + 1);
    const auto desired_len = new_shape[desired_idx];

    if (desired_len == -1) {
      new_shape[desired_idx] = input_len;
    } else if (desired_len != input_len && input_len != 1) {
      ERROR("Can only expand dimensions of size 1; however, trying "
            "to expand dimension "
            << input_idx << " of size " << input_len << " to " << desired_len);
    }
  }

  // Count the number of elements in the target shape (64-bit accumulator, as
  // above).
  const std::int64_t new_elem_count =
      std::accumulate(new_shape.begin(), new_shape.end(), std::int64_t{1},
                      std::multiplies<std::int64_t>());

  // Elements don't change so just a reshape.
  if (new_elem_count == old_elem_count) {
    new_node = createReshape(graph, node->input(0), new_shape);
  } else {
    // Otherwise we are expanding the original tensor: the target shape is
    // materialised as a constant, cast to long and fed to the expand op.
    new_node = createConstantInt(graph, new_shape,
                                 {static_cast<int64_t>(new_shape.size())});
    new_node = createCast(graph, new_node->output(), c10::kLong);
    new_node = createExpand(graph, {node->input(0), new_node->output()});
  }
  return new_node;
}

torch::jit::Node *flattenHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  // flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) ->
  // Tensor(a)
  //
  // Implemented as a reshape in which the dimensions start_dim..end_dim
  // (inclusive) are collapsed into a single dimension.

  std::int64_t start_dim = constantToLong(node->input(1)->node());
  std::int64_t end_dim = constantToLong(node->input(2)->node());

  c10::TensorTypePtr const self_tensor =
      node->input(0)->type()->expect<c10::TensorType>();
  c10::VaryingShape const self_dims = self_tensor->sizes();
  const auto rank = static_cast<std::int64_t>((*self_dims.sizes()).size());

  // Flattening a zero-dimensional tensor yields a one-dimensional tensor with
  // a single element in PyTorch. The loop below would never run for such an
  // input and would produce an empty target shape, so handle it explicitly.
  if (rank == 0) {
    const std::vector<std::int64_t> single_element_shape{1};
    return createReshape(graph, node->input(0), single_element_shape);
  }

  // Respect PyTorch negative dimensions
  if (end_dim < 0) {
    end_dim += rank;
  }

  if (start_dim < 0) {
    start_dim += rank;
  }

  std::vector<std::int64_t> new_shape;

  std::int64_t dim = 0;
  std::int64_t flattened_dims = 1;

  // Flatten the selected dimensions: sizes outside [start_dim, end_dim] are
  // copied through, sizes inside are multiplied together and emitted once
  // end_dim is reached.
  for (auto optional_int : *self_dims.sizes()) {
    if (dim < start_dim || dim > end_dim) {
      new_shape.push_back(*optional_int);
    } else {
      flattened_dims *= *optional_int;
    }

    if (dim == end_dim) {
      new_shape.push_back(flattened_dims);
    }

    dim++;
  }

  return createReshape(graph, node->input(0), new_shape);
}

// Handler for aten::as_strided. It never produces a node: it unconditionally
// raises an error, because the op is expected to have been intercepted before
// this handler is reached (see the explanation in the body).
torch::jit::Node *asStridedHandler(torch::jit::Graph * /*graph*/,
                                   torch::jit::Node * /*node*/) {
  // as_strided(Tensor(a) self, int[] size, int[] stride, int?
  // storage_offset=None) -> Tensor(a)

  // as_strided is very generic and as a result complex and expensive to handle.
  // However it is always generated as part of a decomposition so we should
  // catch whichever op is getting decomposed rather than deal with as_strided.
  ERROR(
      "InternalError: aten::as_strided should have been intercepted earlier.");
  return nullptr;
}

torch::jit::Node *reshapeHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  // aten::view(Tensor(a) self, int[] size) -> (Tensor(a))
  // aten::_unsafe_view(Tensor self, SymInt[] size) -> Tensor
  //
  // The target shape is read from the node's output rather than from the
  // size argument, and the input is reshaped into it.
  torch::jit::Value *self = node->input(0);
  return createReshape(graph, self, shapeFromTensor(node->output()));
}

torch::jit::Node *expandAsHandler(torch::jit::Graph *graph,
                                  torch::jit::Node *node) {
  // aten::expand_as(Tensor self, Tensor other) -> Tensor
  //
  // Expands self to the shape of other. Emits a reshape when both shapes
  // hold the same number of elements and an expand otherwise.
  torch::jit::Node *new_node;

  // Extract the type from the pytorch IR.
  c10::TensorTypePtr const self_tensor =
      node->input(0)->type()->expect<c10::TensorType>();
  c10::VaryingShape const self_dims = self_tensor->sizes();

  // Number of elements in self: the product of its dimensions. Summing the
  // dimensions instead would treat shapes such as [2] and [0, 2] as holding
  // the same number of elements.
  std::int64_t old_elem_count = 1;
  for (auto optional_int : *self_dims.sizes()) {
    old_elem_count *= *optional_int;
  }

  // Extract the type from the pytorch IR.
  c10::TensorTypePtr const as_tensor =
      node->input(1)->type()->expect<c10::TensorType>();
  c10::VaryingShape const dims = as_tensor->sizes();

  // Convert that IR type into a C++ vector of ints, counting the target
  // elements along the way.
  std::vector<std::int64_t> new_shape;
  std::int64_t new_elem_count = 1;

  for (auto optional_int : *dims.sizes()) {
    new_shape.push_back(*optional_int);
    new_elem_count *= *optional_int;
  }

  // Elements don't change so just a reshape.
  if (new_elem_count == old_elem_count) {
    new_node = createReshape(graph, node->input(0), new_shape);
  } else {
    // The target shape is materialised as a constant, cast to long and fed to
    // the expand op.
    new_node = createConstantInt(graph, new_shape,
                                 {static_cast<int64_t>(new_shape.size())});

    new_node = createCast(graph, new_node->output(), c10::kLong);

    new_node = createExpand(graph, {node->input(0), new_node->output()});
  }
  return new_node;
}

torch::jit::Node *selectHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  // aten::select(Tensor self, int dim, int index) -> Tensor
  //
  // Note: there is also this overload which is not supported at the moment
  // aten::select(Tensor[] list, int idx) -> Tensor
  //
  // Takes a slice of length one along dim and reshapes it to the shape of
  // the node's output.
  auto *self = node->input(0);
  std::int64_t dim = constantToLong(node->input(1)->node());
  const auto self_shape = shapeFromTensor(self);
  // Wrap a negative dimension around the rank of the input.
  if (dim < 0) {
    dim += self_shape.size();
  }

  auto *index_node = node->input(2)->node();

  torch::jit::Node *slice_node;
  if (isTensorConstant(index_node)) {
    // Static index: slice [index, index + 1) along dim, wrapping a negative
    // index around the length of that dimension.
    std::int64_t index = constantToLong(index_node);
    if (index < 0) {
      index += self_shape.at(dim);
    }
    slice_node = createSlice(graph, {self}, {index + 1}, {index}, {dim});
  } else {
    // Dynamic index: the index is only known at runtime.
    slice_node =
        createDynamicslice(graph, {self, index_node->output()}, {dim}, {1}, 1);
  }

  // Reshape to remove the singleton dimension left in by slice
  const auto result_shape = shapeFromTensor(node->output());
  return createReshape(graph, slice_node->output(), result_shape);
}

torch::jit::Node *contiguousHandler(torch::jit::Graph * /*graph*/,
                                    torch::jit::Node *node) {
  // aten::contiguous(Tensor self, *, MemoryFormat
  // memory_format=contiguous_format) -> Tensor
  //
  // In PyTorch this returns the tensor laid out in contiguous memory. Here
  // the op is treated as a no-op: every use of its output is redirected to
  // its input and the node is marked for deletion, so no replacement node is
  // returned.
  torch::jit::Value *passthrough = node->input(0);
  node->output()->replaceAllUsesWith(passthrough);
  markNodeForDeletion(node);
  return nullptr;
}

torch::jit::Node *permuteHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  // aten::permute(Tensor self, int[] dims) -> Tensor
  //
  // Emitted as a transpose whose permutation has every negative entry
  // wrapped around the rank of the input.

  std::vector<std::int64_t> permutation =
      constantToLongVec(node->input(1)->node());

  // expect<> (rather than cast<>, which yields a null pointer on a type
  // mismatch) matches how the other handlers in this file obtain the tensor
  // type, and avoids dereferencing a null pointer below.
  c10::TensorTypePtr const as_tensor =
      node->input(0)->type()->expect<c10::TensorType>();
  const auto rank = as_tensor->sizes().size();

  for (std::int64_t &axis : permutation) {
    if (axis < 0) {
      // The rank is optional in the IR type; it is only needed (and only
      // required to be known) when a negative axis has to be resolved.
      ERROR_ON_MSG(!rank, "Cannot resolve negative dimension "
                              << axis
                              << " in permute: the rank of the input tensor "
                                 "is unknown");
      axis = *rank + axis;
    }
  }

  return createTranspose(graph, {node->input(0)}, permutation);
}

// Get the indices for im2col.
//
// For every element of the im2col output (ordered kernel position major,
// spatial position minor) compute the index of the element it is copied
// from, as if the two spatial dimensions of the padded input had been
// flattened into one.
//
// The padded input is taken to have (orig_rows + 2 * padding_y) rows of
// (orig_cols + 2 * padding_x + extra_padding) columns; extra_padding is the
// additional right-hand padding the caller appended to each row.
//
// Raises an error if any computed co-ordinate falls outside the padded
// input.
std::vector<int64_t> getGatherIndices(int64_t orig_rows, int64_t orig_cols,
                                      int64_t kernel_size_x,
                                      int64_t kernel_size_y, int64_t dilation_x,
                                      int64_t dilation_y, int64_t padding_x,
                                      int64_t padding_y, int64_t extra_padding,
                                      int64_t stride_x, int64_t stride_y) {
  // Extent of the padded input the indices refer to.
  const auto padded_rows = orig_rows + 2 * padding_y;
  const auto padded_cols = orig_cols + 2 * padding_x + extra_padding;

  // Number of kernel placements along each spatial axis.
  const auto spatial_rows =
      (orig_rows + 2 * padding_y - dilation_y * (kernel_size_y - 1) - 1) /
          stride_y +
      1;
  const auto spatial_cols =
      (orig_cols + 2 * padding_x - dilation_x * (kernel_size_x - 1) - 1) /
          stride_x +
      1;

  const auto spatial_row_cols_product = spatial_rows * spatial_cols;

  const auto numel = spatial_row_cols_product * kernel_size_x * kernel_size_y;

  std::vector<int64_t> indices;
  indices.reserve(numel);

  for (int64_t idx = 0; idx < numel; idx++) {
    // Which element of the kernel window this output element belongs to.
    const auto kernel_offset = idx / spatial_row_cols_product;
    const auto kernel_x_offset = (kernel_offset % kernel_size_x) * dilation_x;
    const auto kernel_y_offset = (kernel_offset / kernel_size_x) * dilation_y;

    // Which placement of the kernel window it belongs to.
    const auto spatial_offset = idx % spatial_row_cols_product;
    const auto spatial_x_offset = (spatial_offset % spatial_cols) * stride_x;
    const auto spatial_y_offset = (spatial_offset / spatial_cols) * stride_y;

    const auto actual_x = spatial_x_offset + kernel_x_offset;
    const auto actual_y = spatial_y_offset + kernel_y_offset;

    if (actual_x < 0 || actual_y < 0) {
      ERROR("Out of range too low");
    }

    // Validate against the real padded extent (the row width includes
    // extra_padding) so that an index which would read past the end of a
    // row or of the tensor is reported instead of silently accepted.
    if (actual_x >= padded_cols || actual_y >= padded_rows) {
      ERROR("Out of range");
    }

    indices.push_back(actual_y * padded_cols + actual_x);
  }
  return indices;
}

// Regroup the last (flattened spatial) dimension of the padded im2col input
// so that elements whose flat position shares the same remainder modulo
// `stride` become contiguous; this lets the caller use fewer, longer slices.
//
// `indices` is rewritten in place so that each entry addresses the same
// element in the reordered tensor. Only data_shape[0] and data_shape[1] are
// read. Returns the concat node producing the reordered tensor.
torch::jit::Node *reorderBasedOnStride(torch::jit::Graph *graph,
                                       torch::jit::Value *padded,
                                       const std::vector<int64_t> &data_shape,
                                       int64_t stride, int64_t last_dim_size,
                                       std::vector<int64_t> *indices) {
  const int64_t batch = data_shape[0];
  const int64_t channels = data_shape[1];

  // View the data as (..., position / stride, position % stride)
  auto *grouped = createReshape(graph, padded, {batch, channels, -1, stride});

  // One flattened plane per remainder value, concatenated in remainder order
  std::vector<torch::jit::Value *> residue_planes;
  residue_planes.reserve(stride);

  for (int64_t residue = 0; residue < stride; ++residue) {
    auto *plane =
        createSlice(graph, {grouped->output()}, {residue + 1}, {residue}, {3});
    auto *flat_plane =
        createReshape(graph, plane->output(), {batch, channels, -1});
    residue_planes.push_back(flat_plane->output());
  }

  auto *reordered = createConcat(graph, residue_planes, 2);

  // Remap every index to its position after the reordering
  for (auto &entry : *indices) {
    const uint64_t old_idx = entry;
    entry = (old_idx % stride) * (last_dim_size / stride) + old_idx / stride;
  }

  return reordered;
}

// Collapse a list of indices into half-open (start, end) slices: each run of
// consecutive, increasing-by-one indices becomes a single pair. The input
// must not be empty.
std::vector<std::pair<int64_t, int64_t>>
indicesToSlices(const std::vector<int64_t> &indices) {
  ERROR_ON(indices.empty());

  std::vector<std::pair<int64_t, int64_t>> runs;

  int64_t run_begin = indices.front();
  int64_t last_seen = run_begin;

  for (std::size_t pos = 1; pos < indices.size(); ++pos) {
    const int64_t value = indices[pos];

    // A gap (or a step backwards) closes the current run and opens a new one
    if (value != last_seen + 1) {
      runs.emplace_back(run_begin, last_seen + 1);
      run_begin = value;
    }
    last_seen = value;
  }

  // The run still open when the input is exhausted
  runs.emplace_back(run_begin, last_seen + 1);

  return runs;
}

torch::jit::Node *im2colHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  // aten::im2col(Tensor self, int[2] kernel_size, int[2] dilation,
  //              int[2] padding, int[2] stride) -> Tensor
  //
  // Implemented as: zero-pad the input, reorder its flattened spatial
  // dimension by index modulo the x stride, then slice out and concatenate
  // the elements each output position needs, and finally reshape.
  // Only 4D inputs and 2-element kernel/dilation/padding/stride lists are
  // accepted; anything else raises an error.

  torch::jit::Value *data = node->input(0);
  const std::vector<int64_t> data_shape = shapeFromTensor(data);
  ERROR_ON(data_shape.size() != 4);

  const std::vector<std::int64_t> kernel_shape =
      constantToLongVec(node->input(1)->node());
  ERROR_ON(kernel_shape.size() != 2);

  const std::vector<std::int64_t> dilation =
      constantToLongVec(node->input(2)->node());
  ERROR_ON(dilation.size() != 2);

  const std::vector<std::int64_t> padding =
      constantToLongVec(node->input(3)->node());
  ERROR_ON(padding.size() != 2);

  const std::vector<std::int64_t> strides =
      constantToLongVec(node->input(4)->node());
  ERROR_ON(strides.size() != 2);

  // First zero-pad the input
  // Pytorch gives the padding as being the amount to pad in both
  // directions. Popart has two arguments for each axis, the amount to pad in
  // each direction along that axis. In the form (Axis0Left, AxisNLeft...,
  // Axis0Right, AxisNRight) where left and right refer to the direction
  // along the axis to add zeros to.
  std::vector<std::int64_t> popart_padding{0, 0, padding[0], padding[1],
                                           0, 0, padding[0], padding[1]};

  // Increase RHS padding to ensure that the number of cols divides by the
  // x stride value (the final modulo makes the extra padding zero when the
  // width is already a multiple of the stride)
  auto current_width = data_shape[3] + padding[1] * 2;
  auto extra_padding = strides[1] - (current_width % strides[1]);
  extra_padding = extra_padding % strides[1];
  popart_padding.back() += extra_padding;
  current_width += extra_padding;

  auto *padded =
      createConstantPad(graph, node->input(0), popart_padding, 0., true);
  // NOTE(review): padded_shape is not read anywhere below.
  auto padded_shape = shapeFromTensor(padded->output());

  // Get the indices as if the spatial dimensions had been flattened.
  // The handler's (row, col) ordered lists are passed as x (col) first,
  // then y (row).
  auto indices =
      getGatherIndices(data_shape[2], data_shape[3], kernel_shape[1],
                       kernel_shape[0], dilation[1], dilation[0], padding[1],
                       padding[0], extra_padding, strides[1], strides[0]);

  // Calculate the last dim size as if it was flattened
  const auto last_dim_size = current_width * (data_shape[2] + padding[0] * 2);

  // Reorder so that fewer slices are needed than if each index became its
  // own slice; indices is updated in place to match the reordered data
  auto *rearranged = reorderBasedOnStride(graph, padded->output(), data_shape,
                                          strides[1], last_dim_size, &indices);
  const auto slices_start_end = indicesToSlices(indices);

  // Slice and concat for the reordering
  std::vector<torch::jit::Value *> sliced;
  sliced.reserve(slices_start_end.size());
  for (auto slice_start_end : slices_start_end) {
    sliced.push_back(createSlice(graph, {rearranged->output()},
                                 {slice_start_end.second},
                                 {slice_start_end.first}, {2})
                         ->output());
  }

  auto *concat = createConcat(graph, sliced, 2);

  // Finally reshape to match PyTorch's expectation
  return createReshape(
      graph, concat->output(),
      {data_shape[0], data_shape[1] * kernel_shape[0] * kernel_shape[1], -1});
}
// Build the scatter-reduce indices for col2im: for each of the num_cols
// column entries, the flattened (row * orig_cols + col) position in the
// output image that the entry contributes to. Entries that fall in the
// padding get the out of range index orig_rows * orig_cols.
// The result is an Int tensor of shape (1, 1, num_cols).
at::Tensor getScatterReduceIndices(int64_t num_cols, int64_t orig_rows,
                                   int64_t orig_cols, int64_t kernel_size_x,
                                   int64_t kernel_size_y, int64_t dilation_x,
                                   int64_t dilation_y, int64_t padding_x,
                                   int64_t padding_y, int64_t stride_x,
                                   int64_t stride_y) {
  // Leading unity dimensions stand for batch and channel so that the caller
  // can tile the tensor later
  auto indices = at::empty({1, 1, num_cols},
                           at::dtype(at::ScalarType::Int)
                               .memory_format(c10::MemoryFormat::Contiguous));

  auto *out = indices.data_ptr<std::int32_t>();

  // Number of kernel placements along the x axis
  const auto spatial_cols =
      ((orig_cols + 2 * padding_x - dilation_x * (kernel_size_x - 1) - 1) /
       stride_x) +
      1;

  // Number of kernel placements overall (rows * cols), derived from
  // num_cols rather than recomputing the number of placement rows
  const auto kernel_area = kernel_size_x * kernel_size_y;
  const auto placements = num_cols / kernel_area;

  // Sentinel used for positions that land in the padding
  const auto out_of_range = orig_rows * orig_cols;

  for (int64_t col = 0; col < num_cols; ++col) {
    // Entries are ordered kernel element major, placement minor
    const auto kernel_pos = col / placements;
    const auto kernel_dx = (kernel_pos % kernel_size_x) * dilation_x;
    const auto kernel_dy = (kernel_pos / kernel_size_x) * dilation_y;

    const auto placement = col % placements;
    const auto origin_x = (placement % spatial_cols) * stride_x;
    const auto origin_y = (placement / spatial_cols) * stride_y;

    // Co-ordinate in the unpadded output image
    const auto x = origin_x + kernel_dx - padding_x;
    const auto y = origin_y + kernel_dy - padding_y;

    const bool inside = x >= 0 && y >= 0 && x < orig_cols && y < orig_rows;

    // If out of range, use an out of range index. Poplar will skip this
    // index.
    const auto target = inside ? y * orig_cols + x : out_of_range;
    out[col] = static_cast<int32_t>(target); // NOLINT
  }

  return indices;
}

torch::jit::Node *col2imHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  // aten::col2im(Tensor self, int[2] output_size, int[2] kernel_size,
  //              int[2] dilation, int[2] padding, int[2] stride) -> Tensor

  // This is somewhat of an inverse to im2col:
  // col2im(im2col(input)) == divisor * input with divisor as a tensor
  // im2col and col2im were used to speed up convolutions via GEMM.
  //
  // Implemented as a sum scatter-reduce of the column entries onto the
  // flattened output image, followed by a reshape to 4D. Accepts 3D
  // (batched) or 2D (unbatched) inputs; other ranks raise an error.

  torch::jit::Value *data = node->input(0);
  std::vector<int64_t> data_shape = shapeFromTensor(data);
  ERROR_ON(data_shape.size() != 3 && data_shape.size() != 2);

  const std::vector<std::int64_t> output_size =
      constantToLongVec(node->input(1)->node());
  ERROR_ON(output_size.size() != 2);

  const std::vector<std::int64_t> kernel_shape =
      constantToLongVec(node->input(2)->node());
  ERROR_ON(kernel_shape.size() != 2);

  const std::vector<std::int64_t> dilation =
      constantToLongVec(node->input(3)->node());
  ERROR_ON(dilation.size() != 2);

  const std::vector<std::int64_t> padding =
      constantToLongVec(node->input(4)->node());
  ERROR_ON(padding.size() != 2);

  const std::vector<std::int64_t> stride =
      constantToLongVec(node->input(5)->node());
  ERROR_ON(stride.size() != 2);

  // We can be given an unbatched input, with one less dimension -- just give it
  // a dummy batch dim, to unify later processing.
  const bool unbatched_input = data_shape.size() == 2;
  if (unbatched_input) {
    data = createUnsqueeze(graph, {data}, {0})->output();
    data_shape.insert(data_shape.begin(), 1);
  }

  // The batch and original channel ordering is unaffected by im2col so we can
  // reshape to factor them out: dim 1 is divided by the kernel area and
  // dim 2 is multiplied by it.
  const auto out_channels = data_shape[1] / (kernel_shape[0] * kernel_shape[1]);
  const auto num_cols = data_shape[2] * (kernel_shape[0] * kernel_shape[1]);
  auto *reshaped =
      createReshape(graph, data, {data_shape[0], out_channels, num_cols});

  // Use scatter reduce to add across the relevant positions.
  // The handler's (row, col) ordered lists are passed as x (col) first,
  // then y (row).
  const auto indices = getScatterReduceIndices(
      num_cols, output_size[0], output_size[1], kernel_shape[1],
      kernel_shape[0], dilation[1], dilation[0], padding[1], padding[0],
      stride[1], stride[0]);
  auto *indices_const = tensorToConstant(graph, indices);

  // The indices are shape (1, 1, num_cols) but need to be tiled for the
  // scatterreduce: repeat data_shape[0] times along dim 0 and out_channels
  // times along dim 1, leaving the last dim as is.
  const auto repeats =
      at::ones({3}, at::dtype(at::ScalarType::Long)
                        .memory_format(c10::MemoryFormat::Contiguous));
  repeats[0] = data_shape[0];
  repeats[1] = out_channels;
  auto *repeats_const = tensorToConstant(graph, repeats);
  auto *indices_tiled =
      createTile(graph, {indices_const->output(), repeats_const->output()});

  static constexpr bool enable_index_broadcast = true;
  static constexpr std::int32_t sum_reduce =
      static_cast<std::int32_t>(ScatterReduction::Sum);
  static constexpr std::int32_t axis = 2;
  // Reduce along the last axis into output_size[0] * output_size[1]
  // positions, i.e. the flattened output image.
  auto *scatter_reduced =
      createScatterreduce(graph, {reshaped->output(), indices_tiled->output()},
                          output_size[0] * output_size[1], axis,
                          enable_index_broadcast, sum_reduce);

  // Unflatten the image dimension
  auto *res = createReshape(
      graph, scatter_reduced->output(),
      {data_shape[0], out_channels, output_size[0], output_size[1]});

  // If our input was unbatched, remove the dummy batch dim we added earlier.
  if (unbatched_input) {
    res = createSqueeze(graph, {res->output()}, {0});
  }

  return res;
}

torch::jit::Node *transposeHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  // aten::transpose(Tensor self, int dim0, int dim1) -> Tensor
  //
  // Raises an error if the input is not a tensor, if its rank is unknown or
  // if either dimension is outside [-rank, rank).
  std::int64_t dim0 = constantToLong(node->input(1)->node());
  std::int64_t dim1 = constantToLong(node->input(2)->node());

  c10::TensorTypePtr const as_tensor =
      node->input(0)->type()->cast<c10::TensorType>();
  // cast<>() yields a null pointer when the input is not a tensor.
  ERROR_ON_MSG(!as_tensor, "Transposing a non-tensor type is not supported.");
  c10::VaryingShape const dims = as_tensor->sizes();

  // Convert that IR type into a C++ vector of ints. In popart the
  // permutation includes all elements (rotate last two elements with [0, 1,
  // 3, 2]) whereas in pytorch you only need to specify the dimensions being
  // moved (same operation, [3, 2]). So we need to make sure the IR reflects
  // that.
  c10::optional<std::size_t> size = dims.size();
  ERROR_ON_MSG(!size, std::string("Number of dimensions for tensor %") +
                          node->input(0)->debugName() + " is undefined. " +
                          "About to read uninitialized memory," +
                          " unexpected behaviour happened before transpose.");
  const auto rank = static_cast<std::int64_t>(*size);

  // Allow for python array style access.
  if (dim0 < 0) {
    dim0 += rank;
  }
  if (dim1 < 0) {
    dim1 += rank;
  }

  // Both dimensions are used to index the permutation below, so reject
  // anything which would write outside of it.
  ERROR_ON_MSG(dim0 < 0 || dim0 >= rank || dim1 < 0 || dim1 >= rank,
               "Transpose dimensions out of range for tensor %"
                   << node->input(0)->debugName() << " of rank " << rank);

  // Start from the identity permutation and swap the two requested axes.
  std::vector<std::int64_t> permutation;
  permutation.reserve(*size);
  for (std::int64_t i = 0; i < rank; ++i) {
    permutation.push_back(i);
  }

  permutation[dim0] = dim1;
  permutation[dim1] = dim0;

  return createTranspose(graph, {node->input(0)}, permutation);
}

torch::jit::Node *numpyTHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  // Registered for aten::numpy_T: transpose with every dimension reversed.
  torch::jit::Value *self = node->input(0);
  const auto shape = shapeFromTensor(self);
  const std::size_t rank = shape.size();

  // Nothing to reverse for tensors with fewer than two dimensions: hand back
  // the node producing the input.
  if (rank < 2) {
    return self->node();
  }

  // rank - 1, rank - 2, ..., 0
  std::vector<std::int64_t> permutation;
  for (std::size_t remaining = rank; remaining > 0; --remaining) {
    permutation.push_back(static_cast<std::int64_t>(remaining - 1));
  }

  return createTranspose(graph, {self}, permutation);
}

torch::jit::Node *splitChunkHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  // aten::split(Tensor self, int[] split_sizes, int dim=0) -> Tensor[]
  // aten::split(Tensor self, int split_sizes, int dim=0) -> Tensor[]
  // aten::chunk(Tensor self, int chunks, int dim) -> Tensor[]
  // aten::unsafe_chunk(Tensor self, int chunks, int dim) -> Tensor[]
  //
  // All variants are canonicalised to a list of slice sizes along the split
  // axis, then lowered to one slice per entry gathered in a
  // prim::ListConstruct. The only user of the aten node must be a
  // prim::ListUnpack, whose output types are updated with the slice shapes.
  //
  // Raises an error on a non-positive chunk count or split size, or when a
  // chunk would be empty (zero-sized dimension).

  torch::jit::Symbol const kind = node->kind();
  // Get the shape of the input.
  c10::TensorTypePtr const as_tensor =
      node->input(0)->type()->expect<c10::TensorType>();
  c10::VaryingShape const dims = as_tensor->sizes();

  // Pythonic axis translation.
  const std::int64_t dim = constantToLong(node->input(2)->node());
  const std::int64_t axis = dim >= 0 ? dim : *dims.size() + dim;

  // Size of each split, including the remainder at the end if any.
  std::vector<std::int64_t> size_of_each_split;

  // Set when input 1 is a single integer: either the number of chunks
  // (chunk) or the size of each split (split). Left empty when input 1 is a
  // list of sizes.
  std::optional<std::int64_t> split_size;

  if (node->input(1)->node()->kind() == symbols::poptorch::tensor_constant) {
    ERROR_ON(getNodeScalarType(node->input(1)) != at::ScalarType::Int);
    split_size = constantToLong(node->input(1)->node());
  }

  if (kind == c10::aten::chunk || kind == c10::aten::unsafe_chunk) {
    // Chunk takes in the *number of chunks*. Canonicalise it to *size of
    // chunks*.
    const std::int64_t chunk_dim = *dims[axis];
    ERROR_ON_MSG(!split_size.has_value(),
                 "aten::chunk/aten::unsafe_chunk expect to receive "
                 "a single split_size");
    const std::int64_t n_chunks = *split_size;
    // Guards the division below.
    ERROR_ON_MSG(n_chunks <= 0,
                 "aten::chunk/aten::unsafe_chunk expect a strictly positive "
                 "number of chunks, got "
                     << n_chunks);

    // Integer division: (dim / n_chunks) with rounding up
    std::int64_t const slice_size = (chunk_dim + n_chunks - 1) / n_chunks;
    // A zero slice size (zero-sized dimension) would never terminate the
    // loop below.
    ERROR_ON_MSG(slice_size <= 0,
                 "aten::chunk/aten::unsafe_chunk cannot chunk a dimension of "
                 "size "
                     << chunk_dim);

    std::int64_t remaining_size = chunk_dim;
    while (remaining_size >= slice_size) {
      size_of_each_split.push_back(slice_size);
      remaining_size -= slice_size;
    }
    // If we can't divide into equal chunks, then divide such that all but
    // the last chunk are the same size, and the last chunk is smaller.
    // If such a division is not possible, then return one fewer
    // chunks than specified
    if (remaining_size > 0) {
      // Add an extra slice for the remainder.
      size_of_each_split.push_back(remaining_size);
    }
  } else if (split_size) {
    // Split takes in the size of each chunk.
    std::int64_t const slice_size = *split_size;
    // Guards the division and modulo below.
    ERROR_ON_MSG(slice_size <= 0,
                 "aten::split expects a strictly positive split size, got "
                     << slice_size);

    const std::int64_t axis_size = *dims[axis];
    size_of_each_split.assign(static_cast<std::size_t>(axis_size / slice_size),
                              slice_size);

    // Add an extra slice for the remainder.
    const std::int64_t remainder = axis_size % slice_size;
    if (remainder != 0) {
      size_of_each_split.push_back(remainder);
    }
  } else {
    // Explicit list of sizes: use it as is.
    size_of_each_split = constantToLongVec(node->input(1)->node());
  }

  // Rolling index to track where we are in the tensor.
  std::int64_t index = 0;

  // The result of each slice.
  std::vector<torch::jit::Value *> slices;
  slices.reserve(size_of_each_split.size());

  // Slice up according to the canonicalised split vector.
  for (std::int64_t const slice_size : size_of_each_split) {
    torch::jit::Node *slice = createSlice(
        graph, {node->input(0)}, {index + slice_size}, {index}, {axis});

    // Add the slice to the graph.
    slices.push_back(slice->output());

    // Move along in the vector dimension.
    index += slice_size;
  }

  auto *list_node = createAndInsertNode(graph, at::prim::ListConstruct, slices);
  ERROR_ON(node->output()->uses().size() != 1);
  auto *unpack = node->output()->uses()[0].user;
  ERROR_ON(unpack->kind() != c10::prim::ListUnpack);
  ERROR_ON(slices.size() != unpack->outputs().size());

  // Input shape, used as the template for the shape of each slice.
  std::vector<int64_t> v;
  for (std::uint64_t i = 0; i < *dims.size(); ++i) {
    v.push_back(*dims[i]);
  }
  // Propagate types: each unpacked output has the input shape with the
  // split axis replaced by the size of the corresponding slice.
  for (size_t i = 0; i < slices.size(); i++) {
    v[axis] = size_of_each_split[i];
    const auto type =
        slices[i]->type()->expect<c10::TensorType>()->withSizes(v);
    unpack->output(i)->setType(type);
  }

  return list_node;
}

torch::jit::Node *toHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  // aten::to(Tensor(a) self, Device? device, int? dtype=None, bool
  //          non_blocking=False, bool copy=False) -> Tensor(a|b)
  // aten::to(Tensor(a) self, int? dtype=None, bool non_blocking=False,
  //          bool copy=False) -> Tensor(a|b)
  // aten::to(Tensor(a) self, [args without dtype])
  //
  // Emits a cast when a target type is found and differs from the input
  // type; otherwise the node is dropped and its users read the input.
  auto tensor_type = node->input(0)->type()->cast<c10::TensorType>();
  ERROR_ON_MSG(!tensor_type,
               "Casting from a non-tensor type not supported, in an aten::to.");

  torch::jit::Value *self = node->input(0);

  // The target type is taken from the output only when the second argument
  // is a device or a tensor.
  const auto second_arg_type = node->input(1)->type();
  const bool dtype_from_output =
      second_arg_type->cast<c10::DeviceObjType>() ||
      second_arg_type->cast<c10::TensorType>();

  std::optional<c10::ScalarType> cast_to;
  if (dtype_from_output) {
    cast_to = getNodeScalarType(node->output(0));
  }
  if (cast_to) {
    // Avoid promoting to an unsupported type
    cast_to = coerceToSupportedType(*cast_to);
  }

  const bool is_noop = !cast_to || cast_to == *tensor_type->scalarType();
  if (!is_noop) {
    return createCast(graph, self, *cast_to);
  }

  // Reaching this point with a target type means it equals the input type.
  if (cast_to) {
    logging::trace("Ignoring type cast to same type, {}, {}", cast_to.value(),
                   *tensor_type->scalarType());
  }

  node->output()->replaceAllUsesWith(self);
  markNodeForDeletion(node);
  return nullptr;
}

torch::jit::Node *upsampleHandler(torch::jit::Graph *graph,
                                  torch::jit::Node *node) {
  // Lowers the nearest and bicubic upsample ops to a resize op.
  //
  // aten::upsample_nearest1d(Tensor self, int[] output_size, float? scales) ->
  // Tensor
  //
  // aten::upsample_nearest2d(Tensor self, int[] output_size, float?
  // scales_h, float? scales_w) -> Tensor
  //
  // aten::upsample_nearest3d(Tensor self, int[] output_size, float? scales_d,
  // float? scales_h, float? scales_w) -> Tensor

  // upsample_nearest1d.vec(Tensor input, int[]? output_size,
  // float[]? scale_factors)
  //
  // upsample_nearest2d.vec(Tensor input, int[]? output_size,
  // float[]? scale_factors) -> Tensor
  //
  // upsample_nearest3d.vec(Tensor input, int[]? output_size,
  // float[]? scale_factors) -> Tensor

  // upsample_bicubic2d(Tensor self, int[2] output_size, bool align_corners,
  // float? scales_h=None, float? scales_w=None) -> Tensor

  // upsample_bicubic2d.vec(Tensor input, int[]? output_size,
  // bool align_corners, float[]? scale_factors) -> Tensor
  //
  // Not supported by Popart yet:
  //
  // aten::upsample_linear1d(Tensor self, int[] output_size, bool align_corners,
  // float? scales) -> Tensor
  //
  // aten::upsample_trilinear3d(Tensor self, int[] output_size, bool
  // align_corners, float? scales_d, float? scales_h, float? scales_w) -> Tensor

  const auto num_inputs = node->inputs().size();
  torch::jit::Value *input = node->input(0);
  torch::jit::Value *output_size = node->input(1);
  const bool is_bicubic = node->kind() == c10::aten::upsample_bicubic2d;
  // Index of the first scale argument: 2 for the nearest variants, 3 for
  // bicubic because align_corners comes first.
  size_t scales_idx = 2;

  if (is_bicubic) {
    const auto align_corners = constantToBool(node->input(2)->node());
    ERROR_ON_MSG(align_corners, "Only support align_corners=False.");
    scales_idx++;
  }

  const auto output_rank = shapeFromTensor(node->output()).size();
  const auto input_shape = shapeFromTensor(input);
  const auto input_rank = input_shape.size();

  ERROR_ON_MSG(output_rank != input_rank,
               "Input / output rank mismatch: " << input_rank
                                                << " != " << output_rank);

  // Omit the leading batch and channel dims for computing the scale
  std::vector<double> scales{1.0, 1.0};

  if (num_inputs > scales_idx) {
    torch::jit::Value *scale1 = node->input(scales_idx);
    // Handling individual constants?
    if (isTensorConstant(scale1->node())) {
      // One scalar scale input per spatial dimension
      for (size_t i = 0; i < input_rank - 2; i++) {
        scales.push_back(constantToFloat(node->input(scales_idx + i)->node()));
      }
    } else {
      // Otherwise it's upsample_bicubic2d.vec, just copy the vector of scales
      const auto scale_list = handleTensorList(scale1->node());
      if (!scale_list.empty()) {
        for (auto *s : scale_list) {
          scales.push_back(constantToFloat(s->node()));
        }
      }
    }
  }
  // No explicit scales were found above: derive them from the requested
  // output size relative to the input's spatial dimensions.
  if (scales.size() == 2) {
    const auto output_shape = handleTensorList(output_size->node());
    for (size_t dim = 2; dim < input_rank; ++dim) {
      scales.push_back(constantToFloat(output_shape[dim - 2]->node()) /
                       input_shape[dim]);
    }
  }

  torch::jit::Node *scales_node = createConstantFloatLike(
      graph, input, scales, {static_cast<std::int64_t>(scales.size())});

  // Region of interest input of the resize: input_rank zeros
  torch::jit::Node *roi_node =
      createConstantFloat32(graph, std::vector<double>(input_rank, 0.0f),
                            {static_cast<int64_t>(input_rank)});

  const std::string resize_type = is_bicubic ? "cubic" : "nearest";

  static constexpr const char *coordinate_transformation_mode = "half_pixel";
  static constexpr float cubic_coeff_a = -0.75f;
  static constexpr int64_t exclude_outside = 0;
  static constexpr float extrapolation_value = 0.0f;
  static constexpr const char *nearest_mode = "pytorch";

  return createResize(graph, {input, roi_node->output(), scales_node->output()},
                      coordinate_transformation_mode, cubic_coeff_a,
                      exclude_outside, extrapolation_value, resize_type,
                      nearest_mode);
}

torch::jit::Node *upsampleBilinear2dHandler(torch::jit::Graph *graph,
                                            torch::jit::Node *node) {
  // Registered for aten::upsample_bilinear2d. Inputs are read as:
  // 0 = tensor, 1 = output size (may be None), 2 = align_corners,
  // 3 = scale factors. Lowered to the "UpsampleBilinear2d" custom op, which
  // receives a single scaling factor and the align_corners flag.
  auto *input = node->input(0);
  auto *output_size = node->input(1);
  auto *output_scale = node->input(3);

  const bool align_corners = constantToBool(node->input(2)->node());
  const auto scalar_type = getNodeScalarType(input);
  const auto output_rank = shapeFromTensor(node->output()).size();
  const auto input_shape = shapeFromTensor(input);
  const auto input_rank = input_shape.size();

  ERROR_ON_MSG(output_rank != input_rank,
               "Input / output rank mismatch: " << input_rank
                                                << " != " << output_rank);

  // Batch and channel dims are never scaled
  std::vector<double> scales{1.0, 1.0};

  if (isNone(output_size)) {
    // Explicit scale factors: they must be identical and whole numbers
    const auto scalesxy = constantToFloatVec(output_scale->node());

    ERROR_ON_MSG(scalesxy[0] != scalesxy[1],
                 "Non-uniform bilinear upsampling not supported");
    ERROR_ON_MSG(scalesxy[0] != floor(scalesxy[0]),
                 "Bilinear upsampling with non-integer factor not supported");

    scales.push_back(scalesxy[0]);
    scales.push_back(scalesxy[1]);
  } else {
    // Derive the factors from the requested size of each spatial dim
    const auto requested = constantToLongVec(output_size->node());
    for (size_t dim = 2; dim < input_rank; ++dim) {
      const auto wanted = static_cast<double>(requested[dim - 2]);
      scales.push_back(wanted / input_shape[dim]);
    }
  }

  // Only the factor of the first spatial dim is forwarded to the custom op
  std::string const attributes(
      "{\"scaling_factor\":" + std::to_string(scales[2]) + ", " +
      "\"align_corners\":" + std::to_string(static_cast<int>(align_corners)) +
      "}");

  std::string const name = "UpsampleBilinear2d";
  std::string const domain = "poptorch.custom_ops";
  const std::vector<torch::jit::Value *> inputs = {input};

  auto *custom_op =
      createCustomOperation(graph, inputs, name, domain, 1, 1, attributes);

  // The output keeps the input's scalar type; everything else is left
  // unspecified
  custom_op->output(0)->setType(c10::TensorType::create(
      scalar_type, c10::nullopt, c10::nullopt, c10::nullopt));
  return custom_op;
}

torch::jit::Node *unsupportedUpsampleHandler(torch::jit::Graph *graph,
                                             torch::jit::Node *node) {
  // Handler registered for the upsample variants which cannot be lowered:
  // always raises an error naming the offending op.
  UNUSED(graph);

  const auto kind = node->kind();
  ERROR("Unsupported upsample mode "
        << kind.toQualString() << ": currently only 'nearest' is supported");

  // Not reached in practice; present so every path yields a value.
  return nullptr;
}

torch::jit::Node *stackHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  // Registered for aten::stack: give every tensor of the input list a new
  // dimension at `dim`, then concatenate the results along that dimension.
  const std::int64_t dim = constantToLong(node->input(1)->node());

  const std::vector<torch::jit::Value *> tensors =
      handleTensorList(node->input(0)->node());

  std::vector<torch::jit::Value *> expanded;
  expanded.reserve(tensors.size());

  for (std::size_t i = 0; i < tensors.size(); ++i) {
    auto *unsqueezed = createUnsqueeze(graph, {tensors[i]}, {dim});
    expanded.push_back(unsqueezed->output());
  }

  return createConcat(graph, expanded, dim);
}

torch::jit::Node *intHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  // Registered for aten::Int: cast the operand to the Int scalar type.
  torch::jit::Value *operand = node->input(0);
  return createCast(graph, operand, at::ScalarType::Int);
}

} // namespace

// Associates each aten symbol handled by this file with its canonicalisation
// handler. Declared with the constructor attribute so that it runs
// automatically at load time, with priority HANDLER_INIT_PRIORITY.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  // Expand / broadcast
  registerHandler(c10::aten::broadcast_to, expandHandler);
  registerHandler(c10::aten::expand, expandHandler);
  registerHandler(c10::aten::expand_as, expandAsHandler);
  // View, reshape and flatten (squeeze / unsqueeze share the reshape handler)
  registerHandler(c10::aten::view, reshapeHandler);
  registerHandler(c10::aten::_unsafe_view, reshapeHandler);
  registerHandler(c10::aten::unsqueeze, reshapeHandler);
  registerHandler(c10::aten::flatten, flattenHandler);
  registerHandler(c10::aten::reshape, reshapeHandler);
  registerHandler(c10::aten::_reshape_alias, reshapeHandler);
  // Select, split and chunk
  registerHandler(c10::aten::select, selectHandler);
  registerHandler(c10::aten::split, splitChunkHandler);
  registerHandler(c10::aten::split_with_sizes, splitChunkHandler);
  registerHandler(c10::aten::chunk, splitChunkHandler);
  registerHandler(c10::aten::unsafe_chunk, splitChunkHandler);
  // Layout: contiguous, permute, transpose
  registerHandler(c10::aten::contiguous, contiguousHandler);
  registerHandler(c10::aten::permute, permuteHandler);
  registerHandler(c10::aten::transpose, transposeHandler);
  registerHandler(c10::aten::transpose_, transposeHandler);
  // im2col / col2im
  registerHandler(c10::aten::col2im, col2imHandler);
  registerHandler(c10::aten::im2col, im2colHandler);
  registerHandler(c10::aten::numpy_T, numpyTHandler);
  // Type conversion
  registerHandler(c10::aten::to, toHandler);
  registerHandler(c10::aten::type_as, toHandler);
  // Upsampling: linear1d and trilinear3d map to the handler which raises an
  // "unsupported" error
  registerHandler(c10::aten::upsample_nearest1d, upsampleHandler);
  registerHandler(c10::aten::upsample_nearest2d, upsampleHandler);
  registerHandler(c10::aten::upsample_nearest3d, upsampleHandler);
  registerHandler(c10::aten::upsample_linear1d, unsupportedUpsampleHandler);
  registerHandler(c10::aten::upsample_bilinear2d, upsampleBilinear2dHandler);
  registerHandler(c10::aten::upsample_trilinear3d, unsupportedUpsampleHandler);
  registerHandler(c10::aten::upsample_bicubic2d, upsampleHandler);
  registerHandler(c10::aten::squeeze, reshapeHandler);
  registerHandler(c10::aten::as_strided, asStridedHandler);
  registerHandler(c10::aten::stack, stackHandler);
  registerHandler(c10::aten::Int, intHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/ScatterReduction.cpp
================================================
// Copyright (c) 2023 Graphcore Ltd. All rights reserved.

#include "ScatterReduction.hpp"
#include "PopartCanonicalizationUtils.hpp"

namespace poptorch {

// Translate the reduction name held by the constant `node` into the integer
// value of the matching ScatterReduction enumerator. Accepted names:
// "sum"/"add", "amax", "amin", "mean" and "prod"/"multiply". Any other name
// raises an error.
std::int32_t getReductionMethod(torch::jit::Node *node) {
  const auto reduce = constantToString(node);

  const auto as_id = [](ScatterReduction method) {
    return static_cast<std::int32_t>(method);
  };

  if (reduce == "sum" || reduce == "add") {
    return as_id(ScatterReduction::Sum);
  }
  if (reduce == "amax") {
    return as_id(ScatterReduction::Max);
  }
  if (reduce == "amin") {
    return as_id(ScatterReduction::Min);
  }
  if (reduce == "mean") {
    return as_id(ScatterReduction::Mean);
  }
  if (reduce == "prod" || reduce == "multiply") {
    return as_id(ScatterReduction::Mul);
  }

  ERROR("Unsupported reduction type for scatter_reduce: " << reduce);
}

float getReductionInitValue(std::int32_t reduce) {
  float init_val;
  switch (reduce) {
  case static_cast<std::int32_t>(ScatterReduction::Sum):
  case static_cast<std::int32_t>(ScatterReduction::Mean):
    init_val = 0.0;
    break;
  case static_cast<std::int32_t>(ScatterReduction::Mul):
    init_val = 1.0;
    break;
  case static_cast<std::int32_t>(ScatterReduction::Max):
    init_val = -std::numeric_limits<float>::infinity();
    break;
  case static_cast<std::int32_t>(ScatterReduction::Min):
    init_val = std::numeric_limits<float>::infinity();
    break;
  default:
    ERROR("Unsupported reduction type for scatter_reduce: " << reduce);
    break;
  }
  return init_val;
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/ScatterReduction.hpp
================================================
// Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#ifndef SCATTER_REDUCTION_H
#define SCATTER_REDUCTION_H

#include <cstdint>

namespace torch {
namespace jit {
class Node;
} // namespace jit
} // namespace torch

namespace poptorch {

// Reduction kinds understood by the scatter_reduce canonicalisation. The
// enumerators are numbered consecutively from Sum = 0 and are passed around as
// std::int32_t by the helpers below, so their order must not be changed
// without reviewing every consumer of those integer values.
enum class ScatterReduction { Sum = 0, Max, Min, Mul, None, Mean };

// Converts the reduction name held by the constant `node` into the integer
// value of the matching ScatterReduction enumerator.
std::int32_t getReductionMethod(torch::jit::Node *node);
// Returns the initial (identity) value for the reduction identified by the
// integer `reduce`, as produced by getReductionMethod.
float getReductionInitValue(std::int32_t reduce);

} // namespace poptorch

#endif


================================================
FILE: poptorch/source/popart_canonicalization/SliceOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/ir.h>

#include <algorithm>
#include <vector>

#include "poptorch_logging/Logging.hpp"

#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"

namespace poptorch {
namespace {

// Shared error text used by the dynamic slice helpers in this file whenever
// the size of a slice cannot be shown to be a compile-time constant.
const char *fail_msg = "The size of the sliced tensor must be a constant for "
                       "each execution of the model when running on the IPU.";

// Reads the constant operand of a popart add/sub node and folds it into
// *size. An add increases *size and a sub decreases it; passing negate = true
// flips that direction. Errors if the node is neither add nor sub, does not
// have exactly two inputs, or if the constant is a floating point value.
void extractAddSubtractConstant(torch::jit::Node *node, std::int64_t *size,
                                bool negate) {
  const bool is_sub = node->kind() == symbols::popart::sub;
  const bool is_add = node->kind() == symbols::popart::add;
  ERROR_ON_MSG(!is_add && !is_sub, fail_msg);
  ERROR_ON(node->inputs().size() != 2);

  // Prefer the first operand when it is a constant, otherwise use the second.
  torch::jit::Node *first_operand = node->input(0)->node();
  torch::jit::Node *constant =
      isAnyConstant(first_operand) ? first_operand : node->input(1)->node();

  // A subtraction reverses whatever direction the caller asked for.
  const bool subtract_from_size = is_sub ? !negate : negate;

  if (isFloatingPointConstant(constant)) {
    ERROR(fail_msg << " In this case, there is a float added to the slice "
                   << "indices meaning it may change between runs.");
  }

  if (subtract_from_size) {
    (*size) -= constantToLong(constant);
  } else {
    (*size) += constantToLong(constant);
  }
}

// Finds the single non-constant producer feeding `node`.
// Returns nullptr when every input is a constant, and raises an error when
// more than one input is non-constant.
torch::jit::Node *getOnlyNonConstantInput(torch::jit::Node *node) {
  torch::jit::Node *found = nullptr;

  for (auto *input : node->inputs()) {
    torch::jit::Node *producer = input->node();
    if (isAnyConstant(producer)) {
      continue;
    }

    // A second non-constant producer means the chain is ambiguous.
    if (found != nullptr) {
      logging::trace("dynamicSliceHandler failed due to a node with multiple "
                     "non constant inputs when seeking a shared ancestor "
                     "node. Offending node: {}",
                     *node);
      ERROR(fail_msg);
    }
    found = producer;
  }

  return found;
}

// Structural equality test: returns true when `a` and `b` have the same kind,
// neither is nondeterministic, their inputs are (recursively) equivalent and
// all of their attributes compare equal.
bool nodesAlwaysSameOutput(torch::jit::Node *a, torch::jit::Node *b) {
  // Different operations can never be assumed to match.
  if (a->kind() != b->kind()) {
    return false;
  }

  // Random nodes may differ between evaluations even when identical.
  if (isNondeterministic(*a) || isNondeterministic(*b)) {
    return false;
  }

  // Inputs must match pairwise, in order.
  const auto a_inputs = a->inputs();
  const auto b_inputs = b->inputs();
  if (a_inputs.size() != b_inputs.size()) {
    return false;
  }
  for (std::size_t idx = 0; idx < a_inputs.size(); ++idx) {
    if (!nodesAlwaysSameOutput(a_inputs[idx]->node(), b_inputs[idx]->node())) {
      return false;
    }
  }

  // Attributes must match in number and in value.
  if (a->numAttributes() != b->numAttributes()) {
    return false;
  }
  const auto names = a->attributeNames();
  return std::all_of(names.begin(), names.end(), [a, b](auto attrib_name) {
    return attributeEqual(a, b, attrib_name);
  });
}

// Convert any inputs to the specified node which are a cast of another
// constant into a single (already cast) constant.
//
// For every input of `node` produced by a popart cast whose operand is a
// poptorch tensor constant, a new constant holding the converted tensor is
// inserted before `node`, all uses of the cast output are redirected to it,
// and the cast and the original constant are marked for deletion.
void resolveCastConstants(torch::jit::Graph *graph, torch::jit::Node *node) {
  for (auto *input : node->inputs()) {
    // Move on if it is not a cast situation
    auto *cast_node = input->node();
    if (cast_node->kind() != symbols::popart::cast) {
      continue;
    }

    auto *constant_to_be_cast = cast_node->input()->node();

    if (constant_to_be_cast->kind() != symbols::poptorch::tensor_constant) {
      continue;
    }

    // Obtain the tensor and cast it to the type requested by the cast node.
    // at::Tensor::to() is not an in-place operation: it returns the converted
    // tensor, so the result has to be assigned back, otherwise the
    // replacement constant would silently keep the original dtype.
    auto tensor = getNodeTensorAttrValue(constant_to_be_cast);
    auto popart_cast_to = cast_node->s(c10::Symbol::attr("to"));
    auto scalar_type = onnxStrToScalarType(popart_cast_to.c_str());
    tensor = tensor.to(scalar_type);

    // Replace node to avoid a cast
    torch::jit::WithInsertPoint insert_point(node);
    auto *replacement_node = tensorToConstant(graph, tensor);
    cast_node->output()->replaceAllUsesWith(replacement_node->output());

    markNodeForDeletion(cast_node);
    markNodeForDeletion(constant_to_be_cast);
  }
}

// Walks backwards from start_node and end_node, one step at a time, until both
// walks meet at the same node. Each visited node is appended to the ancestry
// list of the side it was reached from.
//
// Every node on either chain may have at most one non-constant input; the walk
// follows that input. Chains that violate this are unlikely to describe a
// slice of constant size anyway (exceptions include adding an input multiplied
// by zero, etc.), so the restriction is not a problem in practice. An error is
// raised when a chain runs out of non-constant inputs before a shared ancestor
// is found.
void populateAncestory(torch::jit::Graph *graph,
                       std::vector<torch::jit::Node *> *start_ancestory,
                       std::vector<torch::jit::Node *> *end_ancestory,
                       torch::jit::Node *start_node,
                       torch::jit::Node *end_node) {
  torch::jit::Node *start_ancestor = start_node;
  torch::jit::Node *end_ancestor = end_node;

  while (start_ancestor != end_ancestor) {
    // Always advance the side whose current node appears later in the graph.
    const bool end_is_later = end_ancestor->isAfter(start_ancestor);
    torch::jit::Node *&current = end_is_later ? end_ancestor : start_ancestor;
    std::vector<torch::jit::Node *> &chain =
        end_is_later ? *end_ancestory : *start_ancestory;

    chain.push_back(current);

    // An input that is a cast of a constant would otherwise count as
    // non-constant, so fold such casts into plain constants first.
    resolveCastConstants(graph, current);

    // Step to the single non-constant input of the current node.
    current = getOnlyNonConstantInput(current);

    if (current == nullptr) {
      logging::trace("dynamicSliceHandler failed due to lack of a shared "
                     "ancestor.");
      ERROR(fail_msg);
    }
  }

  // Sanity check, then trace both chains starting from the shared ancestor.
  ERROR_ON(start_ancestor == nullptr);
  logging::trace("Shared ancestor: {}\n", *start_ancestor);

  const auto trace_from_ancestor =
      [](const std::vector<torch::jit::Node *> &chain) {
        for (auto it = chain.rbegin(); it != chain.rend(); ++it) {
          logging::trace("{}", **it);
        }
      };

  logging::trace("Start ancestory:");
  trace_from_ancestor(*start_ancestory);
  logging::trace("End ancestory:");
  trace_from_ancestor(*end_ancestory);
}

// Strips matching nodes from the back of both ancestry lists (the end nearest
// the shared ancestor) for as long as the two back nodes always produce the
// same output. Errors if both lists end up empty.
void removeCommonNodes(std::vector<torch::jit::Node *> *start_ancestory,
                       std::vector<torch::jit::Node *> *end_ancestory) {
  while (!start_ancestory->empty() && !end_ancestory->empty() &&
         nodesAlwaysSameOutput(start_ancestory->back(),
                               end_ancestory->back())) {
    start_ancestory->pop_back();
    end_ancestory->pop_back();
  }

  const bool nothing_left = start_ancestory->empty() && end_ancestory->empty();
  if (nothing_left) {
    ERROR("The start and end of a slice must be different.");
  }
}

// Computes the slice size from the two (already pruned) ancestry lists.
// aten::Int and popart cast nodes are ignored; every other node must be an
// add or subtract with a constant operand. Constants on the start chain are
// applied with the opposite sign to those on the end chain.
std::int64_t
determineSizeConstant(const std::vector<torch::jit::Node *> &start_ancestory,
                      const std::vector<torch::jit::Node *> &end_ancestory) {
  std::int64_t size = 0;

  const auto fold_chain = [&size](const std::vector<torch::jit::Node *> &chain,
                                  bool negate) {
    for (auto *node : chain) {
      const bool passthrough = node->kind() == c10::aten::Int ||
                               node->kind() == symbols::popart::cast;
      if (!passthrough) {
        extractAddSubtractConstant(node, &size, negate);
      }
    }
  };

  fold_chain(start_ancestory, true);
  fold_chain(end_ancestory, false);

  logging::trace("Size determined to be: {}", size);
  return size;
}

// Works out the constant size of a slice whose start and/or end are not
// constants, by comparing how the two are derived from a shared ancestor.
// Errors if no constant, strictly positive size can be established.
std::int64_t inferDynamicSliceSize(torch::jit::Graph *graph,
                                   torch::jit::Node *start_node,
                                   torch::jit::Node *end_node) {
  std::vector<torch::jit::Node *> from_start;
  std::vector<torch::jit::Node *> from_end;

  // Collect the chain of nodes leading from each bound back to the node they
  // have in common.
  populateAncestory(graph, &from_start, &from_end, start_node, end_node);

  // Drop the part of the two chains that is equivalent. This only matters for
  // computing the size; the start node itself is left untouched.
  removeCommonNodes(&from_start, &from_end);

  // What remains on each chain determines the size.
  const std::int64_t slice_size = determineSizeConstant(from_start, from_end);

  // A size of zero should already have been rejected, but checking it here
  // too stops lint errors about a possible division by zero.
  if (slice_size <= 0) {
    ERROR("Taking a slice of a tensor with the end less than the start is "
          "not supported.");
  }
  return slice_size;
}

// Handle a slice in which the start is an arbitrary (i.e. non constant) input
// but the slice is a fixed size.
//
// node:         the node being canonicalised; input(0) is the tensor being
//               sliced and input(1) is the (constant) dimension.
// start_node /
// start_offset: producer and output index of the slice start.
// size:         the constant slice length; callers must ensure it is > 0.
//
// Returns the newly created dynamic slice node. Errors if the dimension is out
// of range or if `size` does not evenly divide the sliced dimension.
torch::jit::Node *dynamicSliceHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node,
                                      torch::jit::Node *start_node,
                                      std::size_t start_offset,
                                      std::int64_t size) {
  const auto shape = shapeFromTensor(node->input(0));
  const auto rank = static_cast<std::int64_t>(shape.size());

  // The dim is read from the node as usual. It may be negative (counting from
  // the last dimension), so wrap it before it is used to index the shape:
  // indexing a vector with a negative value is undefined behaviour.
  std::int64_t dim = constantToLong(node->input(1)->node());
  if (dim < 0) {
    dim += rank;
  }
  ERROR_ON_MSG(dim < 0 || dim >= rank,
               "Slicing dimension " << dim << " is out of range for a tensor "
                                    << "of rank " << rank << ".");

  auto length_of_dim = shape[dim];

  ERROR_ON_MSG(length_of_dim % size != 0,
               "The size of the slice ("
                   << size << ") must be a factor of the slicing "
                   << "dimension (" << length_of_dim << ").");

  // Make sure the start_node is a tensor not an int
  if (start_node->output(start_offset)->type()->kind() ==
      c10::TypeKind::IntType) {
    start_node = start_node->input()->node();
    start_offset = 0;
  }

  // Reshape the start node from a scalar to a one-dim and cast to UINT32
  start_node = createReshape(graph, start_node->output(start_offset), {1});
  start_node = createCast(graph, {start_node->output()}, "UINT32");

  auto *new_node = createDynamicslice(
      graph, {node->input(0), start_node->output()}, {dim}, {size},
      1); // No overlap 1 assumed
  return new_node;
}

// Builds a dynamic update: writes the tensor produced by src_node into a
// fixed-size window of node->input(0), where the start of the window is an
// arbitrary (i.e. non constant) value. The dimension is read from
// node->input(2).
torch::jit::Node *
dynamicUpdateHandler(torch::jit::Graph *graph, torch::jit::Node *node,
                     torch::jit::Node *src_node, torch::jit::Node *start_node,
                     std::size_t src_offset, std::size_t start_offset,
                     std::int64_t size) {
  const std::int64_t dim = constantToLong(node->input(2)->node());

  // The start must be a tensor: if it is an int, use the value that int was
  // produced from instead.
  torch::jit::Value *start_value = start_node->output(start_offset);
  if (start_value->type()->kind() == c10::TypeKind::IntType) {
    start_value = start_node->input()->node()->output(0);
  }

  // Turn the scalar start into a one element tensor of type UINT32.
  torch::jit::Node *start_1d = createReshape(graph, start_value, {1});
  torch::jit::Node *start_index =
      createCast(graph, {start_1d->output()}, "UINT32");

  torch::jit::Value *destination = node->input(0);
  torch::jit::Value *update = src_node->output(src_offset);

  // The trailing 1 is the "no overlap" flag, which is assumed to hold.
  return createDynamicupdate(graph,
                             {destination, start_index->output(), update},
                             {dim}, {size}, 1);
}

// Implements a stepped slice by subsampling a unit-step slice: when `step` is
// not 1, a subsample node with stride `step` along `dim` (and stride 1
// everywhere else) is appended to `slice`. With a step of 1 the slice is
// returned unchanged.
torch::jit::Node *subsampleSlice(torch::jit::Graph *graph,
                                 torch::jit::Node *slice, int dims, int dim,
                                 int step) {
  if (step == 1) {
    return slice;
  }

  std::vector<int64_t> strides(dims, static_cast<int64_t>(1));
  strides[dim] = step;
  return createSubsample(graph, {slice->output()}, strides);
}

namespace {
// Shared implementation for slicing `input` along `dim` from start to end
// with the given step.
//
// If either bound is not a tensor constant, the slice size is inferred from
// the graph and a dynamic slice is emitted. Otherwise the bounds are wrapped
// and clamped in the same way as aten/src/ATen/native/TensorShape.cpp slice()
// and a static slice is emitted. In both cases a step other than 1 is applied
// by subsampling the result.
torch::jit::Node *sliceCommon(torch::jit::Graph *graph, torch::jit::Node *node,
                              torch::jit::Value *input, int64_t dim,
                              torch::jit::Node *start_node,
                              std::size_t start_offset,
                              torch::jit::Node *end_node, int64_t step) {
  const auto shape = shapeFromTensor(input);
  const auto rank = shape.size();
  if (dim < 0) {
    dim += rank;
  }

  // Any non constant bound requires a dynamic slice.
  const bool bounds_are_constant =
      isTensorConstant(start_node) && isTensorConstant(end_node);
  if (!bounds_are_constant) {
    const auto size = inferDynamicSliceSize(graph, start_node, end_node);
    auto *dynamic_slice =
        dynamicSliceHandler(graph, node, start_node, start_offset, size);
    return subsampleSlice(graph, dynamic_slice, rank, dim, step);
  }

  std::int64_t start = constantToLong(start_node);
  std::int64_t end = constantToLong(end_node);

  // Slicing a scalar is a no-op.
  if (shape.empty()) {
    return createIdentity(graph, {input});
  }

  const std::int64_t extent = shape[dim];

  // Negative bounds count from the end of the dimension.
  if (start < 0) {
    start += extent;
  }
  if (end < 0) {
    end += extent;
  }

  // Clamp start to [0, extent], then end to [start, extent].
  start = std::min(std::max(start, std::int64_t{0}), extent);
  end = (end < start) ? start : std::min(end, extent);

  auto *static_slice = createSlice(graph, {input}, {end}, {start}, {dim});
  return subsampleSlice(graph, static_slice, rank, dim, step);
}
} // namespace

// Canonicalises
// aten::slice(Tensor self, int dim, int start, int end, int step) -> Tensor
// The step must be a constant of at least 1; the start and end are forwarded
// to sliceCommon, which decides between a static and a dynamic slice.
torch::jit::Node *sliceHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *self = node->input(0);
  const auto dim = constantToLong(node->input(1)->node());

  torch::jit::Value *start_value = node->input(2);
  torch::jit::Node *end_node = node->input(3)->node();
  torch::jit::Node *step_node = node->input(4)->node();

  ERROR_ON_MSG(!isTensorConstant(step_node), "Slicing step must be a constant");
  const auto step = constantToLong(step_node);
  ERROR_ON_MSG(step < 1, "Slicing step must be at least 1");

  return sliceCommon(graph, node, self, dim, start_value->node(),
                     start_value->offset(), end_node, step);
}

// Canonicalises
// poptorch::dynamic_slice(Tensor self, int dim, Tensor start, int size,
// int step) -> Tensor
// (the step is read from the fifth input of the node).
//
// The size and step must be constants, with size > 0 and step >= 1. Returns
// the dynamic slice, subsampled along `dim` when the step is not 1.
torch::jit::Node *ptDynamicSliceHandler(torch::jit::Graph *graph,
                                        torch::jit::Node *node) {
  auto *input = node->input(0);
  auto dim = constantToLong(node->input(1)->node());
  auto *start_node = node->input(2)->node();
  auto start_offset = node->input(2)->offset();
  auto *size_node = node->input(3)->node();
  auto *step_node = node->input(4)->node();

  ERROR_ON_MSG(!isTensorConstant(size_node), "Slicing size must be a constant");

  auto size = constantToLong(size_node);
  ERROR_ON_MSG(size == 0, "The start and end of a slice must be different.");
  ERROR_ON_MSG(size < 0, "Taking a slice of a tensor with the end less than "
                         "the start is not supported.");

  ERROR_ON_MSG(!isTensorConstant(step_node), "Slicing step must be a constant");

  auto step = constantToLong(step_node);
  ERROR_ON_MSG(step < 1, "Slicing step must be at least 1");

  auto dims = shapeFromTensor(input);

  // subsampleSlice uses the dimension to index a vector of strides, so a
  // negative dimension (counting from the last one) must be wrapped first to
  // avoid an out-of-bounds write.
  const auto rank = static_cast<std::int64_t>(dims.size());
  if (dim < 0) {
    dim += rank;
  }
  ERROR_ON_MSG(dim < 0 || dim >= rank,
               "Slicing dimension " << dim << " is out of range for a tensor "
                                    << "of rank " << rank << ".");

  auto *slice =
      dynamicSliceHandler(graph, node, start_node, start_offset, size);
  return subsampleSlice(graph, slice, dims.size(), dim, step);
}

// Canonicalises
// poptorch::dynamic_update(Tensor self, Tensor src, int dim, Tensor start,
// int size, int step) -> Tensor
// The size must be a strictly positive constant. Only inputs 0 to 4 of the
// node are read here and by dynamicUpdateHandler.
torch::jit::Node *ptDynamicUpdateHandler(torch::jit::Graph *graph,
                                         torch::jit::Node *node) {
  torch::jit::Value *src_value = node->input(1);
  torch::jit::Value *start_value = node->input(3);
  torch::jit::Node *size_node = node->input(4)->node();

  ERROR_ON_MSG(!isTensorConstant(size_node), "Slicing size must be a constant");

  const auto size = constantToLong(size_node);
  ERROR_ON_MSG(size == 0, "The start and end of a slice must be different.");
  ERROR_ON_MSG(size < 0, "Taking a slice of a tensor with the end less than "
                         "the start is not supported.");

  return dynamicUpdateHandler(graph, node, src_value->node(),
                              start_value->node(), src_value->offset(),
                              start_value->offset(), size);
}

// Canonicalises aten::unbind(Tensor self, int dim) -> Tensor[]
//
// Emits one gather + squeeze per index along `dim` and returns a
// prim::ListConstruct node holding the resulting tensors. Errors if `dim` is
// out of range for the input.
torch::jit::Node *unbindHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  auto *x = node->input(0);
  auto shape = shapeFromTensor(x);
  const int rank = static_cast<int>(shape.size());

  // The dimension may be negative (counting from the last dimension). Wrap it
  // before indexing the shape: a negative vector index is undefined behaviour.
  int dim = constantToInt(node->input(1)->node());
  if (dim < 0) {
    dim += rank;
  }
  ERROR_ON_MSG(dim < 0 || dim >= rank,
               "Dimension " << dim << " is out of range for a tensor of rank "
                            << rank << " in the unbind op.");

  std::int64_t dim_size = shape[dim];

  std::vector<torch::jit::Value *> tensors;
  // Select each index in dimension 'dim' of x and add all
  // slices to a vector
  for (std::int64_t i = 0; i < dim_size; i++) {
    auto *inds = wrapInConstant1D(graph, i);
    auto *gather = createGather(graph, {x, inds}, dim);
    // Squeeze out the gathered dim
    auto *squeeze = createSqueeze(graph, {gather->output()}, {dim});
    tensors.push_back(squeeze->output());
  }

  return createAndInsertNode(graph, at::prim::ListConstruct, tensors);
}

// Canonicalises
// narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a)
//
// The result is self[start : start + length] along `dim`. The fourth argument
// is a *length*, not an end index, so it cannot be handed to sliceCommon as
// the end of the slice: doing so would give the wrong result whenever start
// is not 0.
//
// The length must be a constant. A constant start produces a static slice; a
// non constant start produces a dynamic slice of `length` elements.
torch::jit::Node *narrowHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  auto *input = node->input(0);
  std::int64_t dim = constantToLong(node->input(1)->node());
  auto *start_node = node->input(2)->node();
  auto start_offset = node->input(2)->offset();
  auto *length_node = node->input(3)->node();

  ERROR_ON_MSG(!isTensorConstant(length_node),
               "Slicing size must be a constant");
  const std::int64_t length = constantToLong(length_node);
  ERROR_ON_MSG(length < 0, "Taking a slice of a tensor with the end less than "
                           "the start is not supported.");

  const auto dims = shapeFromTensor(input);

  // As for a regular slice, narrowing a scalar does nothing.
  if (dims.empty()) {
    return createIdentity(graph, {input});
  }

  // Wrap a negative dimension before it is used to index the shape.
  const auto rank = static_cast<std::int64_t>(dims.size());
  if (dim < 0) {
    dim += rank;
  }
  ERROR_ON_MSG(dim < 0 || dim >= rank,
               "Dimension " << dim << " is out of range for a tensor of rank "
                            << rank << " in the narrow op.");

  // Non constant start: the size of the slice is simply the given length.
  if (!isTensorConstant(start_node)) {
    ERROR_ON_MSG(length == 0,
                 "The start and end of a slice must be different.");
    return dynamicSliceHandler(graph, node, start_node, start_offset, length);
  }

  // Constant start: a negative start counts from the end of the dimension.
  std::int64_t start = constantToLong(start_node);
  if (start < 0) {
    start += dims[dim];
  }
  ERROR_ON_MSG(start < 0 || start + length > dims[dim],
               "The range [" << start << ", " << start + length
                             << ") is out of bounds for dimension " << dim
                             << " of size " << dims[dim]
                             << " in the narrow op.");

  return createSlice(graph, {input}, {start + length}, {start}, {dim});
}

// Canonicalises an unfold of node->input(0): the dimension comes from
// input(1) (resolved against the input's tensor type), and the constant size
// and step come from input(2) and input(3).
torch::jit::Node *unfoldHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  torch::jit::Value *self = node->input(0);
  const auto self_type = self->type()->expect<c10::TensorType>();

  const auto dimension = handleDimensionParam(node->input(1), self_type);
  const auto window_size = constantToInt(node->input(2)->node());
  const auto window_step = constantToInt(node->input(3)->node());

  return createUnfold(graph, self, dimension, window_size, window_step);
}

} // namespace

// Registers the handlers defined in this file against the symbols they
// canonicalise. Marked as a constructor with priority HANDLER_INIT_PRIORITY so
// that it runs automatically during static initialisation.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::slice, sliceHandler);
  registerHandler(symbols::poptorch::dynamic_slice, ptDynamicSliceHandler);
  registerHandler(symbols::poptorch::dynamic_update, ptDynamicUpdateHandler);
  registerHandler(c10::aten::unbind, unbindHandler);
  registerHandler(c10::aten::narrow, narrowHandler);
  registerHandler(c10::aten::unfold, unfoldHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/SoftmaxOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace {
// Shared lowering for softmax and log-softmax. `softmax_fn` is the builder
// for the concrete op and is called as softmax_fn(graph, {value}, axis).
//
// When the input has fewer than two dimensions, or the requested dim is
// already the last one, the op is emitted directly. Otherwise the requested
// dim is swapped with the last dim, the op is applied on the last axis, and
// the swap is undone.
template <typename SoftmaxFunc>
torch::jit::Node *handleSoftmaxOp(torch::jit::Graph *graph,
                                  torch::jit::Node *node,
                                  SoftmaxFunc &&softmax_fn) {
  torch::jit::Value *self = node->input(0);
  const std::vector<int64_t> self_shape = shapeFromTensor(self);
  const auto rank = static_cast<int64_t>(self_shape.size());

  std::int64_t dim = constantToLong(node->input(1)->node());
  if (dim < 0) {
    dim += rank;
  }

  const std::int64_t last_dim = rank - 1;
  const bool needs_transpose = rank >= 2 && dim != last_dim;
  if (!needs_transpose) {
    return softmax_fn(graph, {self}, dim);
  }

  // ONNX (log)softmax up to version 13 specifies that the input is
  // coerced to 2D where the axis attribute demarcates the flattening dim.
  // To work around this we:
  //
  // 1. permute the dim arg to the final dimension
  // 2. evaluate (log)softmax using last dim as the axis
  // 3. permute result back to the original dimension order.
  //
  // Opset 13 brings the ONNX spec in line with the interpretation of the dim
  // argument as implemented by torch so this may need updating when popart
  // adds support for opset 13.
  //
  // Swapping two entries is its own inverse, so the same permutation is used
  // for steps 1 and 3.
  std::vector<std::int64_t> swap_perm(rank);
  std::iota(swap_perm.begin(), swap_perm.end(), 0);
  std::swap(swap_perm[dim], swap_perm.back());

  torch::jit::Node *to_last = createTranspose(graph, {self}, swap_perm);
  torch::jit::Node *result = softmax_fn(graph, {to_last->output()}, last_dim);
  return createTranspose(graph, {result->output()}, swap_perm);
}

// Canonicalises softmax by delegating to handleSoftmaxOp with the
// createSoftmax builder.
torch::jit::Node *softmaxHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  // "aten::softmax(Tensor self, int dim, int? dtype) -> Tensor"
  return handleSoftmaxOp(graph, node, createSoftmax);
}

// Canonicalises log-softmax by delegating to handleSoftmaxOp with the
// createLogsoftmax builder.
torch::jit::Node *logSoftmaxHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  // "aten::log_softmax(Tensor self, int dim, int? dtype) -> Tensor"
  return handleSoftmaxOp(graph, node, createLogsoftmax);
}
} // namespace

// Registers the (log)softmax handlers for both the public and the
// underscore-prefixed aten symbols. Marked as a constructor with priority
// HANDLER_INIT_PRIORITY so that it runs automatically during static
// initialisation.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::softmax, softmaxHandler);
  registerHandler(c10::aten::_softmax, softmaxHandler);
  registerHandler(c10::aten::log_softmax, logSoftmaxHandler);
  registerHandler(c10::aten::_log_softmax, logSoftmaxHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/TensorOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "ScatterReduction.hpp"

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "../PoptorchSymbols.hpp"

#include <ATen/ATen.h>

namespace poptorch {
namespace {
// Canonicalises aten::size(Tensor input, int dim) -> int
//
// The shape is known statically, so the op is replaced by an integer constant
// holding the size of the requested dimension. Errors if `dim` is out of
// range for the input.
torch::jit::Node *sizeHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  const std::vector<std::int64_t> shape = shapeFromTensor(node->input(0));
  const auto rank = static_cast<std::int64_t>(shape.size());

  // The dimension may be negative (e.g. x.size(-1)), counting from the last
  // dimension. Wrap it before indexing the shape: a negative vector index is
  // undefined behaviour.
  std::int64_t dim = constantToLong(node->input(1)->node());
  if (dim < 0) {
    dim += rank;
  }
  ERROR_ON_MSG(dim < 0 || dim >= rank,
               "Dimension " << dim << " is out of range for a tensor of rank "
                            << rank << " in the size op.");

  return createConstantInt(graph, {shape[dim]}, {1});
}

// The operand is expected to already be a poptorch tensor constant, so the
// conversion is a no-op: all uses of the node's output are redirected to its
// input and the node is marked for deletion. Returns nullptr because no new
// node is created.
torch::jit::Node *numToTensorHandler(torch::jit::Graph *graph,
                                     torch::jit::Node *node) {
  torch::jit::Value *operand = node->input(0);

  // Should be a tensor already
  ERROR_ON(operand->node()->kind() != symbols::poptorch::tensor_constant);
  UNUSED(graph);

  node->output()->replaceAllUsesWith(operand);
  markNodeForDeletion(node);
  return nullptr;
}

// Canonicalises aten::flip(Tensor self, int[] dims) -> Tensor
// as a reverse over the given dimensions, after wrapping negative dimensions
// so they count from the last one.
torch::jit::Node *flipHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *self = node->input(0);

  // The rank is taken from the output because the input shape might not
  // exist if the input is the result of another operation.
  const auto rank = shapeFromTensor(node->output()).size();

  auto axes = constantToLongVec(node->input(1)->node());
  for (std::size_t idx = 0; idx < axes.size(); ++idx) {
    if (axes[idx] < 0) {
      axes[idx] += rank;
    }
  }

  return createReverse(graph, {self}, axes);
}

// Repeats an input of shape [M, N, ...] by factors [R1, R2, ...] in three
// steps:
//   1) reshape to   [1, M, 1, N, ...]
//   2) expand to    [R1, M, R2, N, ...]
//   3) reshape to   [R1*M, R2*N, ...]
// The extra singleton dimension in step 1 is only inserted for dimensions
// where both the extent and the repeat factor are greater than 1.
torch::jit::Node *repeatHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  torch::jit::Value *self = node->input(0);
  const std::vector<std::int64_t> repeats =
      constantToLongVec(node->input(1)->node());
  const std::vector<std::int64_t> self_shape = shapeFromTensor(self);
  const std::vector<std::int64_t> result_shape =
      shapeFromTensor(node->output());

  // When there are more repeat factors than input dimensions, the input is
  // treated as if it had leading singleton dimensions so it can be expanded.
  const std::size_t num_repeats = repeats.size();
  const std::size_t leading_ones =
      num_repeats > self_shape.size() ? num_repeats - self_shape.size() : 0;

  std::vector<std::int64_t> expand_shape;
  std::vector<std::int64_t> interleaved_shape;

  for (std::size_t idx = 0; idx < num_repeats; ++idx) {
    const std::int64_t factor = repeats[idx];
    const std::int64_t extent =
        idx < leading_ones ? 1 : self_shape[idx - leading_ones];

    expand_shape.push_back(factor);
    if (extent > 1 && factor > 1) {
      // Needs its own axis to repeat along.
      interleaved_shape.push_back(1);
      expand_shape.push_back(extent);
    }
    interleaved_shape.push_back(extent);
  }

  auto *interleaved = createReshape(graph, self, interleaved_shape);
  auto *expanded =
      createExpand(graph, {interleaved->output(),
                           intVectorToIrConstant(graph, expand_shape)});

  return createReshape(graph, expanded->output(), result_shape);
}

// Canonicalises
// aten::roll(Tensor self, int[1] shifts, int[1] dims=[]) -> Tensor
//
// Each requested dimension is rolled in turn by splitting the tensor in two
// along that dimension and concatenating the two chunks in swapped order.
// When `dims` is empty the tensor is flattened, rolled by the single shift and
// reshaped back to its original shape.
//
// Errors if `shifts` and `dims` have mismatching sizes, if a dimension is out
// of range, or if a dimension being rolled has size 0.
torch::jit::Node *rollHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  auto *input = node->input(0);
  auto input_shape = shapeFromTensor(input);
  auto shifts = constantToLongVec(node->input(1)->node());
  auto dims = constantToLongVec(node->input(2)->node());

  bool reshape_output = false;
  if (dims.empty()) {
    // If dims not provided, a flattened version of the tensor is rolled and
    // then reshaped back.
    ERROR_ON_MSG(shifts.size() != 1,
                 "The 'shifts' argument of the roll op must be a scalar when "
                 "'dims' is not specified.");
    input = createFlatten(graph, {input}, 0)->output();
    // The initial value fixes the accumulator type of std::accumulate, so it
    // must be a 64-bit integer: a plain `1` would accumulate the product in
    // `int` and overflow for tensors with 2^31 or more elements.
    const int64_t flattened_size =
        std::accumulate(input_shape.begin(), input_shape.end(),
                        std::int64_t{1}, std::multiplies<int64_t>());
    input_shape.clear();
    input_shape.push_back(1);
    input_shape.push_back(flattened_size);
    dims.push_back(1);
    reshape_output = true;
  } else {
    ERROR_ON_MSG(shifts.size() != dims.size(),
                 "The 'shifts' and 'dims' arguments of the roll op must be the "
                 "same size.");
  }

  torch::jit::Value *output = input;
  auto number_of_dims = input_shape.size();
  for (size_t i = 0; i < dims.size(); ++i) {
    auto current_dim = dims.at(i);
    // Match the torch API of requiring dim in [-len(shape), len(shape)-1].
    // The upper bound is printed as a signed value so that it reads as -1
    // (rather than a huge unsigned number) for a tensor with no dimensions.
    ERROR_ON_MSG(
        ((static_cast<std::size_t>(current_dim) >= number_of_dims) &&
         (current_dim >= 0)) ||
            ((static_cast<std::size_t>(-current_dim) > number_of_dims) &&
             (current_dim < 0)),
        "Dimension out of range at index "
            << i << " (expected to be in range of ["
            << -static_cast<std::int64_t>(number_of_dims) << ", "
            << static_cast<std::int64_t>(number_of_dims) - 1
            << "], but got " << current_dim << ") in the roll op.");

    current_dim = (current_dim + number_of_dims) % number_of_dims;

    auto current_dim_size = input_shape.at(current_dim);
    // The shift is reduced modulo the dimension size below, which would be a
    // division by zero for an empty dimension.
    ERROR_ON_MSG(current_dim_size == 0,
                 "Rolling a dimension of size 0 (at index "
                     << i << ") is not supported in the roll op.");

    // Handle overreaching and negative shifts.
    auto split = (((-shifts.at(i)) % current_dim_size) + current_dim_size) %
                 current_dim_size;
    auto *chunks = createSplit(graph, {output}, 2, current_dim,
                               {split, current_dim_size - split});
    output =
        createConcat(graph, {chunks->output(1), chunks->output(0)}, current_dim)
            ->output();
  }

  if (reshape_output) {
    return createReshape(graph, output, shapeFromTensor(node->input(0)));
  }
  return output->node();
}

// Canonicalises
// aten::clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor
// Only the first input is used; the memory format argument is ignored.
torch::jit::Node *cloneHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *self = node->input(0);

  // An identity op is enough to produce a copy of the tensor.
  return createIdentity(graph, {self});
}

// Canonicalises
// aten::copy_(Tensor self, Tensor src, bool non_blocking) -> Tensor
// The source is passed through an identity when its scalar type matches the
// destination, and through a cast to the destination type otherwise. The
// result carries the requires-grad flag of the source.
torch::jit::Node *copyHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *destination = node->input(0);
  torch::jit::Value *source = node->input(1);

  const at::ScalarType destination_type = getNodeScalarType(destination);
  const at::ScalarType source_type = getNodeScalarType(source);

  torch::jit::Node *result = (source_type == destination_type)
                                 ? createIdentity(graph, {source})
                                 : createCast(graph, source, destination_type);
  ERROR_ON(result == nullptr);

  // Propagate requires-grad from the source onto the new output type.
  const bool source_requires_grad =
      source->type()->expect<c10::TensorType>()->requiresGrad();
  torch::jit::Value *result_value = result->output();
  result_value->setType(
      result_value->type()->expect<c10::TensorType>()->withRequiresGrad(
          source_requires_grad));

  return result;
}

// Replaces the handled node with a constant `false`, ignoring the node
// itself.
torch::jit::Node *justReturnFalse(torch::jit::Graph *graph,
                                  torch::jit::Node * /*unused*/) {
  const c10::IValue always_false{false};
  return insertConstant(graph, always_false)->node();
}

// Canonicalises
// aten::linear(Tensor input, Tensor weight, Tensor? bias) -> Tensor
// as matmul(input, transpose(weight)), followed by an add of the bias when
// one is provided.
torch::jit::Node *linearHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  torch::jit::Value *activations = node->input(0);
  torch::jit::Value *weight = node->input(1);
  torch::jit::Value *bias = node->input(2);

  // The weight is transposed with the permutation {1, 0} before the matmul.
  torch::jit::Node *weight_t = createTranspose(graph, {weight}, {1, 0});
  torch::jit::Node *product =
      createMatmul(graph, {activations, weight_t->output()});

  if (isNone(bias)) {
    return product;
  }
  return createAdd(graph, {product->output(), bias});
}

torch::jit::Node *gatherHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  auto *input = node->input(0);
  auto tensor_type = input->type()->expect<c10::TensorType>();
  auto axis = handleDimensionParam(node->input(1), tensor_type);
  auto *indices = node->input(2);
  auto input_shape = shapeFromTensor(input);
  auto index_shape = shapeFromTensor(indices);
  auto stride = input_shape[axis];

  for (unsigned s = 0; s < input_shape.size(); ++s) {
    if (s != axis) {
      ERROR_ON(input_shape[s] < index_shape[s]);
    }
  }

  // Move gather axis to the innermost dim
  std::vector<int64_t> permutation;
  const unsigned input_num_dims = input_shape.size();
  permutation.resize(input_num_dims);
  std::iota(permutation.begin(), permutation.end(), 0);
  permutation.push_back(permutation[axis]);
  permutation.erase(permutation.begin() + axis);

  if (axis != input_num_dims - 1) {
    input = createTranspose(graph, {input}, permutation)->output();
    input_shape.push_back(input_shape[axis]);
    input_shape.erase(input_shape.begin() + axis);
  }
  // Flatten the data
  auto *flatten_input = createFlatten(graph, {input}, 0)->output();
  int64_t num_offsets = std::accumulate(index_shape.begin(), index_shape.end(),
                                        1, std::multiplies<int64_t>());
  num_offsets /= index_shape[axis];

  // Transpose the indices to make them broadcastable with offsets
  std::vector<int64_t> idx_permutation;
  idx_permutation.resize(index_shape.size());
  std::iota(idx_permutation.begin(), idx_permutation.end(), 0);
  idx_permutation.insert(idx_permutation.begin(), idx_permutation[axis]);
  idx_permutation.erase(idx_permutation.begin() + axis + 1);

  if (axis != 0) {
    indices = createTranspose(graph, {indices}, idx_permutation)->output();
    index_shape.insert(index_shape.begin(), index_shape[axis]);
    index_shape.erase(index_shape.begin() + (axis + 1));
  }
  // Create shape for offsets that is broadcastable with indices tensor
  std::vector<int64_t> offset_shape = {index_shape.begin() + 1,
                                       index_shape.end()};
  // Make the offsets
  std::vector<int64_t> offsets_val;
  int64_t num_data = std::accumulate(input_shape.begin(), input_shape.end(), 1,
                                     std::multiplies<int64_t>());
  num_data /= input_shape[input_num_dims - 1];
  torch::jit::Value *offsets;

  // Case where one or more indices dims size < data size
  if (num_offsets != num_data) {
    // Create the offsets tensor from data_size
    // then slice it to match indices_size
    auto data_shape = shapeFromTensor(node->input(0));
    data_shape.insert(data_shape.begin(), data_shape[axis]);
    data_shape.erase(data_shape.begin() + (axis + 1));
    std::vector<int64_t> temp_offsets_shape = {data_shape.begin() + 1,
                                               data_shape.end()};
    offsets_val.resize(num_data);
    std::iota(offsets_val.begin(), offsets_val.end(), 0);

    for (auto &v : offsets_val) {
      v *= stride;
    }
    offsets =
        createConstantInt(graph, offsets_val, temp_offsets_shape)->output();

    for (unsigned k = 0; k < offset_shape.size(); ++k) {
      if (offset_shape[k] != temp_offsets_shape[k]) {
        offsets = createSlice(graph, {offsets}, {offset_shape[k]}, {0}, {k})
                      ->output();
      }
    }
  } else {
    offsets_val.resize(num_offsets);
    std::iota(offsets_val.begin(), offsets_val.end(), 0);

    for (auto &v : offsets_val) {
      v *= stride;
    }
    offsets = createConstantInt(graph, offsets_val, {offset_shape})->output();
  }

  auto *new_indices = createAdd(graph, {indices, offsets})->output();
  // Gather the elements
  auto *output = createGather(graph, {flatten_input, new_indices}, 1)->output();
  // remove the dim-0 added by gather
  output = createSqueeze(graph, {output}, {0})->output();
  // transpose back to the original indices shape if needed
  if (axis != 0) {
    std::iota(idx_permutation.begin(), idx_permutation.end(), 0);
    idx_permutation.erase(idx_permutation.begin());
    idx_permutation.insert(idx_permutation.begin() + axis, 0);
    output = createTranspose(graph, {output}, idx_permutation)->output();
  }
  return output->node();
}

// Lowers aten::take_along_dim onto the existing aten::gather handler.
//
// aten::take_along_dim(Tensor self, Tensor indices, int? dim=None) -> Tensor
//
// With an explicit `dim`, `self` and `indices` are broadcast against each
// other on every dimension except `dim` before the gather. Without a `dim`,
// both tensors are flattened to 1D and the gather runs along dimension 0.
torch::jit::Node *takeAlongDimHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  torch::jit::Value *input = node->input(0);
  torch::jit::Value *indices = node->input(1);
  torch::jit::Value *dim = node->input(2);

  const std::vector<std::int64_t> input_shape = shapeFromTensor(input);
  std::vector<std::int64_t> indices_shape = shapeFromTensor(indices);

  if (!isNone(dim)) {
    // PyTorch accepts negative dims (counted from the back). Wrap the value
    // before it is used to index the shape vectors, otherwise `.at()` would
    // throw std::out_of_range for e.g. dim=-1. The original `dim` value is
    // still forwarded untouched to the gather handler below.
    std::int64_t dim_value = constantToLong(dim->node());
    if (dim_value < 0) {
      dim_value += static_cast<std::int64_t>(input_shape.size());
    }

    // Emits an aten::broadcast_to of `value` to `shape` and records the
    // resulting static sizes on the output type.
    const auto broadcast_to = [&](torch::jit::Value *value,
                                  const std::vector<std::int64_t> &shape) {
      std::vector<torch::jit::Value *> shape_values(shape.size(), nullptr);
      std::transform(shape.cbegin(), shape.cend(), shape_values.begin(),
                     [&](const auto elem) -> torch::jit::Value * {
                       return wrapInConstant1D(graph, elem);
                     });

      torch::jit::Value *shape_list =
          createAndInsertNode(graph, c10::prim::ListConstruct, shape_values)
              ->output();

      auto *broadcasted_value =
          createHandlerOperation(graph, getHandler(c10::aten::broadcast_to),
                                 {value, shape_list})
              ->output();

      broadcasted_value->setType(
          value->type()->expect<c10::TensorType>()->withSizes(shape));
      return broadcasted_value;
    };

    auto self_sizes = input_shape;
    // update number of elements at dim as per indices
    self_sizes.at(dim_value) = indices_shape.at(dim_value);
    if (auto bcast_shape = at::infer_size(self_sizes, indices_shape);
        bcast_shape != indices_shape) {
      indices = broadcast_to(indices, bcast_shape);
    }

    // update number of elements at dim as per self
    indices_shape.at(dim_value) = input_shape.at(dim_value);
    if (auto bcast_shape = at::infer_size(indices_shape, input_shape);
        bcast_shape != input_shape) {
      input = broadcast_to(input, bcast_shape);
    }
  } else {
    // Collapses `value` to a 1D tensor; tensors that are already 1D are
    // returned untouched.
    const auto flatten =
        [&](torch::jit::Value *value,
            const std::vector<std::int64_t> &shape) -> torch::jit::Value * {
      if (shape.size() == 1) {
        return value;
      }

      // The init value fixes the accumulator type: a plain `1` would make
      // std::accumulate fold in `int`. An empty shape (rank 0) yields 1.
      const std::int64_t num_elems =
          std::accumulate(shape.cbegin(), shape.cend(), std::int64_t{1},
                          std::multiplies<std::int64_t>());

      return createReshape(graph, value, {num_elems})->output();
    };

    input = flatten(input, input_shape);
    indices = flatten(indices, indices_shape);
    dim = wrapInConstant1D(graph, 0);
  }

  return createHandlerOperation(graph, getHandler(c10::aten::gather),
                                {input, dim, indices});
}

// Handler registered for aten::scatter.
//
// Reads the node inputs as (input, dim, index, src[, reduce]). A constant
// scalar `src` is cast to the input's scalar type if needed and expanded to
// the shape of `index`. The result is emitted as a scatterreduce op whose
// reduction is ScatterReduction::None when no fifth input is present.
torch::jit::Node *scatterHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  auto *input = node->input(0);
  const auto input_type = input->type()->expect<c10::TensorType>();
  const auto dim = handleDimensionParam(node->input(1), input_type);
  auto *index = node->input(2);
  auto *src = node->input(3);

  // `scatter` can be passed a single value for `src` as a tensor constant, so
  // broadcast it up.
  if (isConstantScalar(src)) {
    auto *shape = intVectorToIrConstant(graph, shapeFromTensor(index));
    const auto input_scalar_type = *input_type->scalarType();
    if (input_scalar_type !=
        *src->type()->expect<c10::TensorType>()->scalarType()) {
      // poplibs scatter requires that `src` have the same data type as input so
      // cast it if needed
      src = castToPromoteType(graph, src, input_scalar_type);
    }
    src = createExpand(graph, {src, shape})->output();
  }

  // NOTE(review): node->input(3) is already read unconditionally above, so it
  // is unclear when a node with fewer than 4 inputs could reach this point.
  // Confirm whether this branch is still reachable / intended.
  if (node->inputs().size() < 4) {
    return createScatterElements(graph, {input, index, src}, dim);
  }

  // reduction type is optional argument
  const auto reduce = node->inputs().size() < 5
                          ? static_cast<std::int32_t>(ScatterReduction::None)
                          : getReductionMethod(node->input(4)->node());
  const auto input_shape = shapeFromTensor(input);
  // Size of the input along the scatter dimension; `.at()` bounds-checks dim.
  const auto axis_size = input_shape.at(dim);
  static constexpr bool enable_index_broadcast = false;

  // Note the operand order: (src, index, input).
  return createScatterreduce(graph, {src, index, input}, axis_size, dim,
                             enable_index_broadcast, reduce);
}

// Shared implementation of the aten::full family: produces a tensor of
// `shape` filled with the value `v`, using `type` after it has been coerced
// to a supported type.
//
// When `v` comes from a tensor constant the result is folded into a single
// constant; otherwise `v` is cast and expanded to `shape` in the graph.
torch::jit::Node *fullCommon(torch::jit::Graph *graph, torch::jit::Value *v,
                             at::ScalarType type,
                             const std::vector<int64_t> &shape) {
  auto *producer = v->node();
  const auto fill_type = coerceToSupportedType(type);

  const bool is_constant_tensor =
      isTensorConstant(producer) &&
      producer->output()->type()->cast<c10::TensorType>();

  if (!is_constant_tensor) {
    // Dynamic fill value: cast it, then broadcast it up to the target shape.
    auto *fill_value = createCast(graph, v, fill_type)->output();
    auto *shape_constant = intVectorToIrConstant(graph, shape);
    return createExpand(graph, {fill_value, shape_constant});
  }

  // Constant fill value: build the filled tensor on the host.
  const auto fill_scalar =
      getNodeTensorAttrValue(producer).to(fill_type).item();
  return tensorToConstant(graph, at::full(shape, fill_scalar, fill_type));
}

// Handler shared by aten::full and aten::new_full.
//
// aten::full(int[] size, Scalar fill_value,
//            ScalarType? dtype=None, Layout? layout=None,
//            Device? device=None, bool? pin_memory=None) -> Tensor
// aten::new_full(Tensor self, int[] size, Scalar fill_value,
//                ScalarType? dtype=None, Layout? layout=None,
//                Device? device=None, bool? pin_memory=None) -> Tensor
torch::jit::Node *fullHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  // new_full carries an extra leading `self` input, which shifts the
  // remaining arguments by one.
  const bool is_new_full = node->kind() == c10::aten::new_full;
  const size_t first_arg = is_new_full ? 1 : 0;

  auto *size_arg = node->input(first_arg + 0);
  auto *fill_arg = node->input(first_arg + 1);
  auto *dtype_arg = node->input(first_arg + 2);
  const auto dims = constantToLongVec(size_arg->node());

  // Default to float, or to the type of `self` for new_full...
  auto result_type = is_new_full ? getNodeScalarType(node->input(0))
                                 : c10::ScalarType::Float;
  // ...but an explicitly specified dtype takes precedence.
  if (!isNone(dtype_arg)) {
    result_type = constantToScalarType(dtype_arg->node());
  }

  return fullCommon(graph, fill_arg, result_type, dims);
}

// Handler for aten::full_like.
//
// aten::full_like(Tensor self, Scalar fill_value) -> Tensor
//
// Shape and scalar type are both taken from the node's own output value.
torch::jit::Node *fullLikeHandler(torch::jit::Graph *graph,
                                  torch::jit::Node *node) {
  auto *result = node->output(0);
  const auto result_shape = shapeFromTensor(result);
  const auto result_type = getNodeScalarType(result);
  auto *fill_value = node->input(1);
  return fullCommon(graph, fill_value, result_type, result_shape);
}

// Handler for aten::triu(Tensor self, int diagonal=0) -> Tensor.
//
// Always reports an error: no graph nodes are created for triu here.
torch::jit::Node *triuHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  UNUSED(graph);
  UNUSED(node);
  ERROR("torch.triu is only supported within constant expressions, "
        "for example torch.ones(3, 3).triu_().");
  return nullptr;
}

// Handler for poptorch::ipu_print_tensor.
//
// Input 0 is the tensor to print; inputs 1-10 are constants holding the
// title and the formatting options, decoded in the order they are read below.
// Only the first character of the separator / bracket strings is forwarded.
torch::jit::Node *ipuPrintTensorHandler(torch::jit::Graph *graph,
                                        torch::jit::Node *node) {
  const auto int_arg = [node](size_t index) {
    return constantToInt(node->input(index)->node());
  };
  const auto string_arg = [node](size_t index) {
    return constantToString(node->input(index)->node());
  };

  auto *tensor = node->input(0);
  const auto title = string_arg(1);
  const auto print_gradient = int_arg(2);
  const auto summarise_threshold = int_arg(3);
  const auto edge_items = int_arg(4);
  const auto max_line_width = int_arg(5);
  const auto digits = int_arg(6);
  const auto float_format = int_arg(7);
  const auto separator = string_arg(8);
  const auto open_bracket = string_arg(9);
  const auto close_bracket = string_arg(10);

  const char separator_char = *separator.c_str();
  const char open_char = *open_bracket.c_str();
  const char close_char = *close_bracket.c_str();

  return createPrinttensor(graph, {tensor}, print_gradient, title,
                           summarise_threshold, edge_items, max_line_width,
                           digits, float_format, separator_char, open_char,
                           close_char);
}
} // namespace

// Runs as a static constructor (with priority HANDLER_INIT_PRIORITY) and
// associates each symbol below with its canonicalisation handler.
// Note that aten::full and aten::new_full share fullHandler.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::size, sizeHandler);
  registerHandler(c10::prim::NumToTensor, numToTensorHandler);
  registerHandler(c10::aten::flip, flipHandler);
  registerHandler(c10::aten::repeat, repeatHandler);
  registerHandler(c10::aten::is_complex, justReturnFalse);
  registerHandler(c10::aten::roll, rollHandler);
  registerHandler(c10::aten::clone, cloneHandler);
  registerHandler(c10::aten::copy_, copyHandler);
  registerHandler(c10::aten::linear, linearHandler);
  registerHandler(c10::aten::gather, gatherHandler);
  registerHandler(c10::aten::scatter, scatterHandler);
  registerHandler(c10::aten::full, fullHandler);
  registerHandler(c10::aten::new_full, fullHandler);
  registerHandler(c10::aten::full_like, fullLikeHandler);
  registerHandler(c10::aten::triu, triuHandler);
  registerHandler(symbols::poptorch::ipu_print_tensor, ipuPrintTensorHandler);
  registerHandler(c10::aten::take_along_dim, takeAlongDimHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/pyg_torch_cluster/FpsOp.cpp
================================================
// Copyright (c) 2023 Graphcore Ltd. All rights reserved.

#include <random>

#include <ATen/ExpandUtils.h>

#include "../PopartCanonicalizationUtils.hpp"
#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace {

// Emits a min op over `lhs` and `rhs` and returns its node.
torch::jit::Node *prepareMinimum(torch::jit::Graph *graph,
                                 torch::jit::Value *const lhs,
                                 torch::jit::Value *const rhs) {
  auto *const minimum = createMin(graph, {lhs, rhs});
  return minimum;
}

// Emits a dynamic-update op that writes `new_row` (of length `row_len`) into
// `value` at the dynamic offset `row_num` along axis 0.
torch::jit::Node *prepareRowUpdate(torch::jit::Graph *graph,
                                   torch::jit::Value *const value,
                                   torch::jit::Value *const new_row,
                                   torch::jit::Value *const row_num,
                                   std::int64_t row_len) {
  auto *const updated =
      createDynamicupdate(graph, {value, row_num, new_row}, {0}, {row_len}, 0);
  return updated;
}

// Emits a dynamic-update op that stores the single element `new_val` into
// `idxs` at the compile-time position `offset` (materialised as a constant).
torch::jit::Node *updateIdxs(torch::jit::Graph *graph,
                             torch::jit::Value *const idxs, std::int64_t offset,
                             torch::jit::Value *const new_val) {
  torch::jit::Value *const position =
      createConstantLong(graph, {offset}, {1})->output();
  return createDynamicupdate(graph, {idxs, position, new_val}, {0}, {1}, 0);
}

// Emits a dynamic-slice op that extracts one row of `value` at the dynamic
// offset `row_num`; a float constant of `row_len` elements is passed as the
// third operand of the slice.
torch::jit::Node *prepareRowSlice(torch::jit::Graph *graph,
                                  torch::jit::Value *const value,
                                  torch::jit::Value *const row_num,
                                  std::int64_t row_len) {
  torch::jit::Value *const row_buffer =
      createConstantFloat32(graph, {0.0}, {row_len})->output();
  return createDynamicslice(graph, {value, row_num, row_buffer}, {0}, {1}, 1);
}

// Slices row `row_num` out of `dists` and emits an argmax over it.
torch::jit::Node *prepareArgmax(torch::jit::Graph *graph,
                                torch::jit::Value *const dists,
                                torch::jit::Value *const row_num,
                                std::int64_t row_len) {
  torch::jit::Value *const row =
      prepareRowSlice(graph, dists, row_num, row_len)->output();
  return createArgmax(graph, {row}, 0, 0l);
}

// Creates the integer constant of `src_len` zeros that the sampled indices
// are subsequently written into.
torch::jit::Node *prepareOutput(torch::jit::Graph *graph,
                                std::int64_t src_len) {
  const std::vector<std::int64_t> initial_values(
      static_cast<size_t>(src_len), std::int64_t{0});
  return createConstantInt(graph, initial_values, {src_len});
}

// Produces the first index of a batch: the constant `range_begin` for a
// deterministic start, or a random-uniform integer op built from
// `range_end` / `range_begin` when `random_start` is set.
torch::jit::Node *prepareStartIdx(torch::jit::Graph *graph, float range_begin,
                                  float range_end, bool random_start) {
  if (!random_start) {
    const auto first = static_cast<std::int64_t>(range_begin);
    return createConstantLong(graph, {first}, {1});
  }
  return createRandomUniform(graph, nullptr, {1}, range_end, range_begin,
                             c10::ScalarType::Int);
}

// Emits a dynamic-zero op on `dists` at the dynamic offset `col_idx` along
// axis 1 with a size of 1 (i.e. a single column).
torch::jit::Node *prepareMaskedColDists(torch::jit::Graph *graph,
                                        torch::jit::Value *const dists,
                                        torch::jit::Value *const col_idx) {
  auto *const masked = createDynamiczero(graph, {dists, col_idx}, {1}, {1});
  return masked;
}

// Emits the pairwise distance matrix of `src` with itself by delegating to
// the aten::cdist handler with p = 2.
torch::jit::Node *prepareDists(torch::jit::Graph *graph,
                               torch::jit::Value *const src) {
  torch::jit::Value *const norm_order =
      createConstantFloat32(graph, {2.0}, {1})->output();
  auto cdist_handler = getHandler(c10::aten::cdist);
  return createHandlerOperation(graph, cdist_handler, {src, src, norm_order});
}

// Emits a dynamic-zero op over axes {0, 1} of `dists`: the region starts at
// the two-element `offset` (materialised as a constant) and spans `sizes`.
torch::jit::Node *maskDists(torch::jit::Graph *graph,
                            torch::jit::Value *const dists,
                            const std::vector<std::int64_t> &offset,
                            const std::vector<std::int64_t> &sizes) {
  torch::jit::Value *const region_start =
      createConstantInt(graph, offset, {2})->output();
  return createDynamiczero(graph, {dists, region_start}, {0, 1}, sizes);
}

// Builds the distance matrix of `src` and, when `ptr` describes more than one
// batch (more than two boundaries), zeroes every block that pairs rows of one
// batch with columns of another, leaving only the per-batch diagonal blocks.
torch::jit::Node *prepareMaskedDists(torch::jit::Graph *graph,
                                     torch::jit::Value *const src,
                                     const std::vector<std::int64_t> &ptr) {
  auto *dists = prepareDists(graph, src);
  if (ptr.size() <= 2) {
    // Zero or one batch: nothing to mask.
    return dists;
  }

  const std::int64_t total = ptr.back();

  // First batch: only columns to the right of the block need zeroing.
  dists = maskDists(graph, dists->output(), {0, ptr[1]},
                    {ptr[1], total - ptr[1]});

  // Middle batches: zero the columns on both sides of the block.
  for (size_t i = 2; i + 1 < ptr.size(); ++i) {
    const std::int64_t batch_begin = ptr[i - 1];
    const std::int64_t batch_end = ptr[i];
    const std::int64_t batch_rows = batch_end - batch_begin;
    dists = maskDists(graph, dists->output(), {batch_begin, 0},
                      {batch_rows, batch_begin});
    dists = maskDists(graph, dists->output(), {batch_begin, batch_end},
                      {batch_rows, total - batch_end});
  }

  // Last batch: only columns to the left of the block need zeroing.
  const std::int64_t last_begin = ptr[ptr.size() - 2];
  dists = maskDists(graph, dists->output(), {last_begin, 0},
                    {total - last_begin, last_begin});
  return dists;
}

// Returns the cumulative number of samples per batch.
//
// `ptr` holds batch boundaries; for each consecutive pair the batch size is
// scaled by `ratio` and rounded up. Element 0 of the result is 0 and element
// i is the running total over the first i batches. The result has the same
// length as `ptr` (empty for an empty `ptr`).
std::vector<std::int64_t> calcDeg(const std::vector<std::int64_t> &ptr,
                                  float ratio) {
  std::vector<std::int64_t> cumulative;
  cumulative.reserve(ptr.size());

  std::int64_t running_total = 0;
  for (size_t i = 0; i < ptr.size(); ++i) {
    if (i > 0) {
      const float batch_size = static_cast<float>(ptr[i] - ptr[i - 1]);
      running_total += static_cast<std::int64_t>(std::ceil(batch_size * ratio));
    }
    cumulative.push_back(running_total);
  }
  return cumulative;
}

// Handler for poptorch::fps (presumably farthest point sampling, mirroring
// torch_cluster.fps - confirm).
//
// Node inputs:
//   0: src          - tensor; src_shape[0] is used as the row length of the
//                     distance matrix
//   1: ptr          - constant long vector of batch boundaries
//   2: ratio        - constant float, fraction of points kept per batch
//   3: random_start - constant bool, whether each batch starts at a random
//                     index
//
// The whole sampling loop is unrolled at graph-construction time: one chain
// of dynamic slice / update / argmax ops is emitted per selected index.
// Returns the node producing the index tensor of length deg.back().
torch::jit::Node *fpsHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  torch::jit::Value *const src = node->input(0);

  const std::vector<std::int64_t> ptr =
      constantToLongVec(node->input(1)->node());
  const float ratio = constantToFloat(node->input(2)->node());
  const bool random_start = constantToBool(node->input(3)->node());

  const std::vector<std::int64_t> src_shape = shapeFromTensor(src);

  // 0. Prepare output tensor
  // NOTE(review): deg.back() assumes `ptr` is non-empty - confirm the caller
  // guarantees this.
  const auto deg = calcDeg(ptr, ratio);
  const auto out_len = deg.back();
  auto *idxs = prepareOutput(graph, out_len);

  // 1. Create masked dists (leave only the slices representing batches)
  auto *dists = prepareMaskedDists(graph, src, ptr);

  // 2. Iterate over batches defined in deg
  // pos_in_idxs is the next free slot in the output; deg[b] is the slot at
  // which batch b ends.
  std::int64_t pos_in_idxs = 0;
  for (size_t b = 1; b < deg.size(); b++) {
    // 3. Generate start idx...
    // (the int64 boundaries are implicitly converted to the float parameters
    // of prepareStartIdx)
    auto *prev_idx =
        prepareStartIdx(graph, ptr[b - 1], ptr[b] - 1, random_start);

    // 4. ...and insert it into the outputs
    idxs = updateIdxs(graph, idxs->output(), pos_in_idxs++, prev_idx->output());
    // Batch (or whole output) already full after the start index.
    if (pos_in_idxs == deg[b] || pos_in_idxs == out_len) {
      continue;
    }

    // 5. Zero out the dists column with prev_idx number
    dists = prepareMaskedColDists(graph, dists->output(), prev_idx->output());

    // 6. Get the index of the max value in the currently processed dists row
    auto *idx =
        prepareArgmax(graph, dists->output(), prev_idx->output(), src_shape[0]);
    idxs = updateIdxs(graph, idxs->output(), pos_in_idxs++, idx->output());

    while (pos_in_idxs < deg[b] && pos_in_idxs < out_len) {
      // 7. Zero out the dists column with idx number
      dists = prepareMaskedColDists(graph, dists->output(), idx->output());
      auto *const prev_row = prepareRowSlice(graph, dists->output(),
                                             prev_idx->output(), src_shape[0]);
      auto *const curr_row =
          prepareRowSlice(graph, dists->output(), idx->output(), src_shape[0]);

      // 8. Update the currently processed row with the min of the current and
      // previous row
      auto *const curr_dists_row =
          prepareMinimum(graph, prev_row->output(), curr_row->output());
      dists = prepareRowUpdate(graph, dists->output(), curr_dists_row->output(),
                               idx->output(), src_shape[0]);

      prev_idx = idx;
      // 9. Get the index of the max value in the currently processed dists row
      idx = prepareArgmax(graph, dists->output(), idx->output(), src_shape[0]);
      idxs = updateIdxs(graph, idxs->output(), pos_in_idxs++, idx->output());
    }
  }
  return idxs;
}

} // namespace

// Runs as a static constructor (with priority HANDLER_INIT_PRIORITY) and
// registers fpsHandler for the poptorch::fps symbol.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(poptorch::symbols::poptorch::fps, fpsHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/pyg_torch_cluster/GridOp.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include "../PopartCanonicalizationUtils.hpp"
#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"

#include "../ScatterReduction.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {

namespace {

// Handler for torch_cluster::grid.
//
// Node inputs:
//   0: pos   - point positions; anything beyond 2D is flattened to
//              [pos_shape[0], product of the remaining dims]
//   1: size  - voxel size (its first extent is used as the number of strides)
//   2: start - optional; defaults to a min-reduction of `pos` over axis 0
//   3: end   - optional; defaults to a max-reduction of `pos` over axis 0
//
// The emitted graph computes, per point, the sum over the feature axis of
// int((pos - start) / size) multiplied by a per-dimension stride, where the
// strides are the cumulative product of int((end - start) / size) + 1 with a
// leading 1 prepended.
torch::jit::Node *gridHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  auto *pos = node->input(0);
  auto *size = node->input(1);

  auto *start = node->input(2);
  auto *end = node->input(3);

  std::vector<std::int64_t> pos_shape = shapeFromTensor(pos);
  const std::vector<std::int64_t> size_shape = shapeFromTensor(size);

  // Number of features per point after flattening; stays 1 for 1D `pos`.
  int num_voxels_size = 1;

  if (pos_shape.size() > 1) {
    // Collapse all trailing dimensions into one so that `pos` is 2D.
    num_voxels_size = std::accumulate(pos_shape.cbegin() + 1, pos_shape.cend(),
                                      1, std::multiplies<int>());
    pos_shape = {pos_shape[0], num_voxels_size};
    pos = createReshape(graph, pos, pos_shape)->output();
  }

  if (isNone(start)) {
    start = createReducemin(graph, {pos}, {0}, 0)->output();
  }

  if (isNone(end)) {
    end = createReducemax(graph, {pos}, {0}, 0)->output();
  }
  // Shift positions so that `start` becomes the origin.
  pos = createSub(graph, {pos, createUnsqueeze(graph, {start}, {0})->output()})
            ->output();

  start = createCast(graph, start, c10::kFloat)->output();
  end = createCast(graph, end, c10::kFloat)->output();
  size = createCast(graph, size, c10::kFloat)->output();

  auto *ones = wrapInConstantVec(graph, {1});
  auto *zeros = wrapInConstantVec(graph, {0});

  // Voxel count per dimension: int((end - start) / size) + 1.
  auto *num_voxels =
      createDiv(graph, {createSub(graph, {end, start})->output(), size})
          ->output();
  num_voxels = createCast(graph, num_voxels, c10::kInt)->output();
  num_voxels =
      createAdd(graph, {num_voxels, wrapInConstantVec(graph, {1})})->output();
  // The sizes are set explicitly before handing the value to the cumprod
  // handler (presumably because that handler reads the static shape).
  num_voxels->setType(num_voxels->type()->expect<c10::TensorType>()->withSizes(
      {num_voxels_size}));
  // Turn the counts into strides: cumulative product (`zeros` is passed as
  // the dim argument), prefixed with 1 and cut back to size_shape[0] entries.
  num_voxels = createHandlerOperation(graph, getHandler(c10::aten::cumprod),
                                      {num_voxels, zeros})
                   ->output();
  num_voxels = createConcat(graph, {ones, num_voxels}, 0)->output();
  num_voxels =
      createSlice(graph, {num_voxels}, {size_shape.at(0)}, {0}, {0})->output();

  num_voxels->setType(num_voxels->type()->expect<c10::TensorType>()->withSizes(
      {size_shape.at(0)}));

  pos = createCast(graph, pos, c10::kFloat)->output();
  // Reshape `size` to a single row so it lines up with the rows of `pos`.
  size =
      createReshape(graph, size,
                    {1, std::accumulate(size_shape.cbegin(), size_shape.cend(),
                                        1, std::multiplies<int>())})
          ->output();
  // Integer voxel coordinate per dimension, weighted by its stride.
  auto *out = createDiv(graph, {pos, size})->output();
  out = createCast(graph, out, c10::kInt)->output();
  out = createMul(graph, {out, num_voxels})->output();

  // Sum the weighted coordinates over axis 1 to get one id per point.
  return createReducesum(graph, {out}, {1}, 0);
}

} // namespace

// Runs as a static constructor (with priority HANDLER_INIT_PRIORITY) and
// registers gridHandler for the torch_cluster::grid symbol.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(torch_cluster::grid, gridHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/popart_canonicalization/pyg_torch_cluster/NearestOp.cpp
================================================
// Copyright (c) 2023 Graphcore Ltd. All rights reserved.

#include <random>

#include <ATen/ExpandUtils.h>

#include "../PopartCanonicalizationUtils.hpp"
#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace {

// Validates the batch index lists and fills in a missing one.
//
// Both lists must be sorted, otherwise std::invalid_argument is thrown. When
// exactly one list is empty it is replaced in place by a list of zeros with
// one entry per row of the matching tensor (`x` for `batch_x`, `y` for
// `batch_y`). Returns the resulting sizes as (batch_x size, batch_y size);
// both are 0 when neither list was provided.
std::tuple<std::int64_t, std::int64_t>
batchSizes(const torch::jit::Value *x, const torch::jit::Value *y,
           std::vector<std::int64_t> &batch_x,
           std::vector<std::int64_t> &batch_y) {
  const bool x_sorted = std::is_sorted(batch_x.cbegin(), batch_x.cend());
  if (!x_sorted) {
    throw std::invalid_argument("'batch_x' is not sorted");
  }
  const bool y_sorted = std::is_sorted(batch_y.cbegin(), batch_y.cend());
  if (!y_sorted) {
    throw std::invalid_argument("'batch_y' is not sorted");
  }

  std::int64_t num_x = batch_x.size();
  std::int64_t num_y = batch_y.size();

  const bool only_x_missing = (num_x == 0) && (num_y != 0);
  if (only_x_missing) {
    num_x = shapeFromTensor(x)[0];
    batch_x.assign(num_x, 0);
  }

  // Evaluated after the block above, so it sees the updated num_x.
  const bool only_y_missing = (num_y == 0) && (num_x != 0);
  if (only_y_missing) {
    num_y = shapeFromTensor(y)[0];
    batch_y.assign(num_y, 0);
  }

  return std::make_tuple(num_x, num_y);
}

// Tensor counterpart of batchSizes: reads the static shapes of the batch
// tensors and, when exactly one of them has a leading extent of 0, replaces
// it (through the reference parameter) with a constant of zeros holding one
// entry per row of the matching input. Returns (batch_x shape, batch_y shape).
std::tuple<std::vector<std::int64_t>, std::vector<std::int64_t>>
batchShapes(torch::jit::Graph *graph, const std::vector<std::int64_t> &x_shape,
            const std::vector<std::int64_t> &y_shape,
            torch::jit::Value *&batch_x, torch::jit::Value *&batch_y) {
  std::vector<std::int64_t> shape_of_batch_x = shapeFromTensor(batch_x);
  std::vector<std::int64_t> shape_of_batch_y = shapeFromTensor(batch_y);

  // Overwrites `shape` with {rows} and returns an all-zero constant of it.
  const auto zero_batch = [graph](std::int64_t rows,
                                  std::vector<std::int64_t> &shape) {
    shape = {rows};
    const std::vector<std::int64_t> zeros(shape[0], 0);
    return createConstantLong(graph, zeros, shape)->output();
  };

  if (shape_of_batch_x[0] == 0 && (shape_of_batch_y[0] != 0)) {
    batch_x = zero_batch(x_shape[0], shape_of_batch_x);
  }
  // Evaluated after the block above, so it sees the updated batch_x shape.
  if (shape_of_batch_y[0] == 0 && (shape_of_batch_x[0] != 0)) {
    batch_y = zero_batch(y_shape[0], shape_of_batch_y);
  }

  return {shape_of_batch_x, shape_of_batch_y};
}

// Promotes a 1D input to a 2D column tensor.
//
// A tensor of shape [N] is reshaped in the graph to [N, 1] and `input` is
// updated to refer to the reshaped value. Inputs of any other rank are left
// untouched. Returns the (possibly updated) shape.
std::vector<std::int64_t> prepareInputTensor(torch::jit::Graph *graph,
                                             torch::jit::Value *&input) {
  auto input_shape = shapeFromTensor(input);
  if (input_shape.size() == 1) {
    // For a rank-1 shape the element count is simply its only extent. Reading
    // it directly also avoids folding the product through an `int`
    // accumulator, which would truncate extents larger than INT_MAX.
    const std::int64_t num_rows = input_shape.front();
    input_shape = {num_rows, 1};
    input = createReshape(graph, input, input_shape)->output();
  }

  return input_shape;
}

// Checks that `x_shape` and `y_shape` describe 2D tensors with the same
// number of features (second extent).
//
// Throws std::invalid_argument when either shape has more than 2 dims, fewer
// than 2 dims, or when the feature counts differ.
void validateInputShapes(const std::vector<std::int64_t> &x_shape,
                         const std::vector<std::int64_t> &y_shape) {
  if (x_shape.size() > 2 || y_shape.size() > 2) {
    throw std::invalid_argument(
        "Inputs `x` and `y` should be max 2D tensors, while `x` has " +
        std::to_string(x_shape.size()) + " dims and `y` has " +
        std::to_string(y_shape.size()) + " dims.");
  }
  // Index 1 is read below: reject anything that is not 2D at this point
  // (e.g. a 0D tensor) instead of reading past the end of the shape.
  if (x_shape.size() < 2 || y_shape.size() < 2) {
    throw std::invalid_argument(
        "Inputs `x` and `y` should be 2D tensors at this point, while `x` "
        "has " +
        std::to_string(x_shape.size()) + " dims and `y` has " +
        std::to_string(y_shape.size()) + " dims.");
  }
  if (x_shape[1] != y_shape[1]) {
    throw std::invalid_argument(
        "Inputs shapes inconsistent x.shape[1]=" + std::to_string(x_shape[1]) +
        " vs. y.shape[1]=" + std::to_string(y_shape[1]));
  }
}

// Returns `batch` with every run of equal adjacent values collapsed to a
// single element. Values that reappear after a different value are kept.
std::vector<std::int64_t> uniqueConsecutive(std::vector<std::int64_t> batch) {
  std::vector<std::int64_t> collapsed;
  collapsed.reserve(batch.size());
  for (const std::int64_t value : batch) {
    if (collapsed.empty() || collapsed.back() != value) {
      collapsed.push_back(value);
    }
  }
  return collapsed;
}

// Checks that both batch lists contain the same sequence of batch indices
// once consecutive duplicates are collapsed; throws std::invalid_argument
// otherwise.
void validateBatchIndices(const std::vector<std::int64_t> &batch_x,
                          const std::vector<std::int64_t> &batch_y) {
  const auto ids_in_x = uniqueConsecutive(batch_x);
  const auto ids_in_y = uniqueConsecutive(batch_y);

  if (ids_in_x == ids_in_y) {
    return;
  }
  throw std::invalid_argument("Some batch indices occur in 'batch_x' "
                              "that do not occur in 'batch_y'");
}

// Checks that each input has as many rows as its batch list has entries.
// Throws std::invalid_argument naming the violated condition; `x` is checked
// before `y`.
void validateSizes(std::int64_t x_size, std::int64_t y_size,
                   std::int64_t batch_x_size, std::int64_t batch_y_size) {
  const bool x_matches = (x_size == batch_x_size);
  if (!x_matches) {
    throw std::invalid_argument("x.size(0) == batch_x.size(0)");
  }

  const bool y_matches = (y_size == batch_y_size);
  if (!y_matches) {
    throw std::invalid_argument("y.size(0) == batch_y.size(0)");
  }
}

// Checks that each batch tensor is 1D and has as many entries as its input
// has rows. Throws std::invalid_argument naming the violated condition; `x`
// is checked before `y`.
void validateShapes(const std::vector<std::int64_t> &x_shape,
                    const std::vector<std::int64_t> &y_shape,
                    const std::vector<std::int64_t> &batch_x_shape,
                    const std::vector<std::int64_t> &batch_y_shape) {
  const bool x_ok =
      batch_x_shape.size() == 1 && x_shape.front() == batch_x_shape.front();
  if (!x_ok) {
    throw std::invalid_argument("x.size(0) == batch_x.size(0)");
  }

  const bool y_ok =
      batch_y_shape.size() == 1 && y_shape.front() == batch_y_shape.front();
  if (!y_ok) {
    throw std::invalid_argument("y.size(0) == batch_y.size(0)");
  }
}

// Rescales `x` and `y` jointly: subtracts the minimum taken over both
// tensors, then divides by the maximum taken over both shifted tensors.
// `x` and `y` are updated to the rescaled values, whose types are annotated
// with `x_shape` / `y_shape` respectively.
void rescaleInputs(torch::jit::Graph *graph, torch::jit::Value *&x,
                   torch::jit::Value *&y,
                   const std::vector<std::int64_t> &x_shape,
                   const std::vector<std::int64_t> &y_shape) {
  // Reductions run over axes {0, 1} without keeping the reduced dims.
  const std::int64_t keepdims = 0;
  const auto global_min = [&](torch::jit::Value *tensor) {
    return createReducemin(graph, {tensor}, {0, 1}, keepdims)->output();
  };
  const auto global_max = [&](torch::jit::Value *tensor) {
    return createReducemax(graph, {tensor}, {0, 1}, keepdims)->output();
  };
  const auto with_sizes = [](torch::jit::Value *tensor,
                             const std::vector<std::int64_t> &sizes) {
    tensor->setType(
        tensor->type()->expect<c10::TensorType>()->withSizes(sizes));
  };

  // Shift both inputs by their common minimum.
  torch::jit::Value *const lowest_x = global_min(x);
  torch::jit::Value *const lowest_y = global_min(y);
  torch::jit::Value *const lowest =
      createMin(graph, {lowest_x, lowest_y})->output();
  x = createSub(graph, {x, lowest})->output();
  y = createSub(graph, {y, lowest})->output();

  // Scale both shifted inputs by their common maximum.
  torch::jit::Value *const highest_x = global_max(x);
  torch::jit::Value *const highest_y = global_max(y);
  torch::jit::Value *const highest =
      createMax(graph, {highest_x, highest_y})->output();

  x = createDiv(graph, {x, highest})->output();
  with_sizes(x, x_shape);
  y = createDiv(graph, {y, highest})->output();
  with_sizes(y, y_shape);
}

// Appends the batch index as an extra feature column (list version).
//
// Every entry of `batch` is scaled in place by 2 * D, emitted as a
// [batch.size(), 1] constant and concatenated to `input` along axis 1. The
// type of the new `input` is annotated with the widened shape.
void concatFeatures(torch::jit::Graph *graph, torch::jit::Value *&input,
                    const std::vector<std::int64_t> &input_shape,
                    std::vector<std::int64_t> &batch, std::int64_t D) {
  for (std::int64_t &batch_index : batch) {
    batch_index = 2 * D * batch_index;
  }

  const auto num_rows = static_cast<std::int64_t>(batch.size());
  torch::jit::Value *batch_column =
      createConstantLong(graph, batch, {num_rows, 1})->output();
  input = createConcat(graph, {input, batch_column}, 1)->output();

  const std::vector<std::int64_t> widened_shape{input_shape[0],
                                                input_shape[1] + 1};
  auto tensor_type = input->type()->expect<c10::TensorType>();
  input->setType(tensor_type->withSizes(widened_shape));
}

// Appends the batch index as an extra feature column (tensor version).
//
// `batch` is reshaped to [batch_size, 1], multiplied in the graph by a
// constant filled with 2 * D and concatenated to `input` along axis 1. Both
// `batch` and `input` are updated; the type of the new `input` is annotated
// with the widened shape.
void concatFeatures(torch::jit::Graph *graph, torch::jit::Value *&input,
                    const std::vector<std::int64_t> &input_shape,
                    torch::jit::Value *&batch, std::int64_t batch_size,
                    std::int64_t D) {
  const std::vector<std::int64_t> column_shape{batch_size, 1};
  const std::vector<std::int64_t> scale_values(batch_size, 2 * D);

  torch::jit::Value *scale =
      createConstantInt(graph, scale_values, column_shape)->output();
  batch = createReshape(graph, batch, column_shape)->output();
  batch = createMul(graph, {scale, batch})->output();
  input = createConcat(graph, {input, batch}, 1)->output();

  const std::vector<std::int64_t> widened_shape{input_shape[0],
                                                input_shape[1] + 1};
  auto tensor_type = input->type()->expect<c10::TensorType>();
  input->setType(tensor_type->withSizes(widened_shape));
}

// Emits the nearest-neighbour lookup: pairwise distances between `x` and `y`
// through the aten::cdist handler (p = 2), followed by an argmin over axis 1.
torch::jit::Node *vq(torch::jit::Graph *graph, torch::jit::Value *const x,
                     torch::jit::Value *const y) {
  torch::jit::Value *const norm_order =
      createConstantFloat32(graph, {2.0}, {1})->output();
  torch::jit::Value *const pairwise =
      createHandlerOperation(graph, getHandler(c10::aten::cdist),
                             {x, y, norm_order})
          ->output();
  return createArgmin(graph, {pairwise}, 1 /*axis*/, 0 /*keepdims*/);
}

// Handler for poptorch::nearest_batch_list: inputs are (x, y, batch_x,
// batch_y) where the batch arguments are constant integer lists.
//
// When batch information is available the inputs are validated, rescaled and
// extended with a scaled batch-index column before the nearest-neighbour
// search; otherwise the search runs on the inputs directly.
torch::jit::Node *nearestBatchListHandler(torch::jit::Graph *graph,
                                          torch::jit::Node *node) {
  torch::jit::Value *x = node->input(0);
  torch::jit::Value *y = node->input(1);

  std::vector<std::int64_t> batch_x = constantToLongVec(node->input(2)->node());
  std::vector<std::int64_t> batch_y = constantToLongVec(node->input(3)->node());

  const auto x_shape = prepareInputTensor(graph, x);
  const auto y_shape = prepareInputTensor(graph, y);

  validateInputShapes(x_shape, y_shape);

  const auto [batch_x_size, batch_y_size] = batchSizes(x, y, batch_x, batch_y);

  const bool no_batching = (batch_x_size == 0) || (batch_y_size == 0);
  if (no_batching) {
    return vq(graph, x, y);
  }

  validateBatchIndices(batch_x, batch_y);
  validateSizes(x_shape[0], y_shape[0], batch_x_size, batch_y_size);

  rescaleInputs(graph, x, y, x_shape, y_shape);

  const std::int64_t num_features = x_shape.back();
  concatFeatures(graph, x, x_shape, batch_x, num_features);
  concatFeatures(graph, y, y_shape, batch_y, num_features);

  return vq(graph, x, y);
}

// Handler for poptorch::nearest: inputs are (x, y, batch_x, batch_y) where
// the batch arguments are tensors.
//
// When batch information is available the inputs are validated, rescaled and
// extended with a scaled batch-index column before the nearest-neighbour
// search; otherwise the search runs on the inputs directly.
torch::jit::Node *nearestHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  torch::jit::Value *x = node->input(0);
  torch::jit::Value *y = node->input(1);
  torch::jit::Value *batch_x = node->input(2);
  torch::jit::Value *batch_y = node->input(3);

  const auto x_shape = prepareInputTensor(graph, x);
  const auto y_shape = prepareInputTensor(graph, y);

  validateInputShapes(x_shape, y_shape);

  const auto [batch_x_shape, batch_y_shape] =
      batchShapes(graph, x_shape, y_shape, batch_x, batch_y);

  // A batch tensor with a leading extent of 0 means "no batch given" (this is
  // how batchShapes interprets it, and it mirrors the size != 0 test of the
  // list-based handler). Testing only for an empty shape vector would send
  // two empty batch tensors into validateShapes, which then throws.
  const bool has_batch_x =
      !batch_x_shape.empty() && batch_x_shape.front() != 0;
  const bool has_batch_y =
      !batch_y_shape.empty() && batch_y_shape.front() != 0;

  if (has_batch_x && has_batch_y) {
    // No validation of batch indices as we can't assert from Poplar
    validateShapes(x_shape, y_shape, batch_x_shape, batch_y_shape);

    rescaleInputs(graph, x, y, x_shape, y_shape);

    const std::int64_t d = x_shape.back();
    concatFeatures(graph, x, x_shape, batch_x, batch_x_shape[0], d);
    concatFeatures(graph, y, y_shape, batch_y, batch_y_shape[0], d);
  }

  return vq(graph, x, y);
}

} // namespace

// Runs as a static constructor (with priority HANDLER_INIT_PRIORITY) and
// registers the tensor-based and list-based nearest handlers for their
// respective poptorch symbols.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(poptorch::symbols::poptorch::nearest, nearestHandler);
  registerHandler(poptorch::symbols::poptorch::nearest_batch_list,
                  nearestBatchListHandler);
}

} // namespace poptorch


================================================
FILE: poptorch/source/type_and_constant_canonicalization/AddListNumElements.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/ir.h>

#include <memory>
#include <stack>

#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace type_and_constant_canonicalization {

namespace {
// Propagate a changed output type of `node` into every TupleConstruct which
// consumes it: the matching tuple element type is replaced by `new_type`, the
// tuple output is retyped, and the process repeats for tuples of tuples.
void recursivelySwitchType(torch::jit::Node *node,
                           const torch::jit::TypePtr &new_type) {
  for (auto use : node->output()->uses()) {
    // A list feeding another list is not supported.
    ERROR_ON(use.user->kind() == c10::prim::ListConstruct);

    // No known JIT model causes this, but one may emerge in which case
    // this algorithm will need to handle it.
    ERROR_ON(use.user->kind() == c10::prim::TupleUnpack);

    // Only tuples embed the type of their elements: other users are left
    // untouched.
    if (use.user->kind() != c10::prim::TupleConstruct) {
      continue;
    }

    torch::jit::Value *tuple_out = use.user->output();
    const auto &old_elements =
        tuple_out->type()->expect<c10::TupleType>()->elements();

    // Copy the existing element types and patch the one fed by `node`.
    // This will be the list or nested tuple containing list
    std::vector<c10::TypePtr> element_types(old_elements.begin(),
                                            old_elements.end());
    element_types[use.offset] = new_type;

    auto retyped_tuple = c10::TupleType::create(element_types);
    tuple_out->setType(retyped_tuple);

    // The tuple itself may be an element of an outer tuple.
    recursivelySwitchType(use.user, retyped_tuple);
  }
}

} // namespace

// For every prim::ListConstruct in the graph, either replace its list type
// with a ListTypeWithNumElements recording the number of inputs
// (revert == false) or restore the original list type (revert == true).
// Tuples containing the list are retyped accordingly.
void addListNumElements(torch::jit::Graph *graph, bool revert) {
  logging::LogContext ctx_func("addListNumElements");
  for (torch::jit::Node *node : graph->nodes()) {
    logging::LogContext ctx("processing " + nodeToString(node));

    if (node->kind() != c10::prim::ListConstruct) {
      continue;
    }

    auto list_inputs = node->inputs();

    // Lists should never be nested as the JIT tracer does not support it,
    // but it is always good to check just in case.
    for (auto *input : list_inputs) {
      ERROR_ON(input->type()->kind() == c10::TypeKind::ListType);
    }

    torch::jit::Value *list_out = node->output();
    c10::TypePtr new_type;
    if (!revert) {
      // Switch to a ListTypeWithNumElements
      auto orig_type = list_out->type()->expect<c10::ListType>();
      new_type = std::make_shared<ListTypeWithNumElements>(
          orig_type->getElementType(), list_inputs.size());
    } else {
      // Revert back to the original type
      auto lot_type = list_out->type()->expect<ListTypeWithNumElements>();
      new_type = lot_type->getOriginalListType();
    }

    list_out->setType(new_type);

    // Any tuples which have the list need fixing.
    recursivelySwitchType(node, new_type);
  }
}

} // namespace type_and_constant_canonicalization
} // namespace poptorch


================================================
FILE: poptorch/source/type_and_constant_canonicalization/CanonicaliseConstants.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

#include <ATen/ATen.h>
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/passes/dead_code_elimination.h>

#include <any>
#include <functional>
#include <iterator>
#include <limits>
#include <stack>
#include <utility>

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/TypeAndConstantCanonicalization.hpp"
#include "poptorch/Utils.hpp"

#include "../PoptorchSymbols.hpp"

namespace poptorch {
namespace type_and_constant_canonicalization {
namespace {

// Returns true for node kinds which change compiler state. These need to be
// removed for any host side tensors but otherwise do not make a connected
// node a PopART only node.
bool compilerStateChangingKind(const torch::jit::NodeKind &kind) {
  if (kind == symbols::poptorch::begin_ipu_block) {
    return true;
  }
  if (kind == symbols::poptorch::end_ipu_block) {
    return true;
  }
  if (kind == symbols::poptorch::set_available_memory) {
    return true;
  }
  if (kind == symbols::poptorch::push_name_scope) {
    return true;
  }
  return kind == symbols::poptorch::set_matmul_serialization;
}

// Returns true if a node of this kind can only be run by PopART, i.e. it is
// neither a compiler state changing node nor one of the structural prim
// nodes (constant, tuple/list construct/unpack, return).
bool popartOnlyNode(const torch::jit::NodeKind &kind) {
  if (compilerStateChangingKind(kind)) {
    return false;
  }

  const bool structural_kind =
      kind == c10::prim::Constant || kind == c10::prim::TupleConstruct ||
      kind == c10::prim::ListConstruct || kind == c10::prim::TupleUnpack ||
      kind == c10::prim::ListUnpack || kind == c10::prim::Return;
  return !structural_kind;
}

// Check whether the node is (eventually) used host side, IPU or both.
//
// The uses of the node's outputs are followed transitively: reaching a
// prim::Return counts as a host use, reaching a PopART only node (or
// set_available_memory / set_matmul_serialization) counts as a PopART use,
// and any other user is searched further.
UseOfNode getUseOfNode(torch::jit::Node *n,
                       bool check_node_kind_itself = true) {
  // Check the kind of the node itself (for when not called on a prim constant).
  // This could be disabled explicitly by the caller.
  if (check_node_kind_itself) {
    if (popartOnlyNode(n->kind())) {
      return UseOfNode::PopARTOnly;
    }
    if (n->kind() == c10::prim::Return) {
      return UseOfNode::HostSideOnly;
    }
  }

  bool used_by_host = false;
  bool used_by_popart = false;

  // Worklist of nodes whose outputs still have to be inspected.
  std::vector<torch::jit::Node *> pending{n};
  while (!pending.empty()) {
    torch::jit::Node *current = pending.back();
    pending.pop_back();

    for (auto *output : current->outputs()) {
      for (auto use : output->uses()) {
        const auto user_kind = use.user->kind();

        if (user_kind == c10::prim::Return) {
          // This must be host use as we have not reached an op which would be
          // run on popart yet.
          used_by_host = true;
          continue;
        }

        const bool popart_user =
            popartOnlyNode(user_kind) ||
            user_kind == symbols::poptorch::set_available_memory ||
            user_kind == symbols::poptorch::set_matmul_serialization;
        if (popart_user) {
          used_by_popart = true;
        } else {
          // We only need to check the node further if it is neither returned
          // nor used by a Popart op
          pending.push_back(use.user);
        }
      }
    }
  }

  if (used_by_host) {
    return used_by_popart ? UseOfNode::HostSideAndPopART
                          : UseOfNode::HostSideOnly;
  }

  // Either a PopART use was found, or no use at all: some nodes such as
  // begin_ipu_block will simply remove the tensor so make it a default
  // tensor_constant for simplicity.
  return UseOfNode::PopARTOnly;
}

// Create a tensor constant node holding `t` just before the prim::Constant
// `n` and redirect every use of `n`'s output to the new node. `n` itself is
// left in the graph for the caller to delete.
void replaceWithConstantTensor(torch::jit::Graph *graph, torch::jit::Node *n,
                               const at::Tensor &t) {
  ERROR_ON(n->kind() != c10::prim::Constant);
  torch::jit::WithInsertPoint const insert_point(n);
  const WithNodeMetadata meta(n);

  // The kind of constant created depends on where the value ends up.
  poptorch::UseOfNode const use_of_node = getUseOfNode(n);
  auto *new_node = tensorToConstant(graph, t, use_of_node);

  // replaceInput() removes the use from the list, so keep rewiring the
  // first remaining use until none are left.
  torch::jit::Value *old_output = n->output();
  while (!old_output->uses().empty()) {
    const auto first_use = old_output->uses().front();
    first_use.user->replaceInput(first_use.offset, new_node->output());
  }
}

// Emit a (rate limited) warning if `val` lies outside the finite range of a
// 32-bit float. `n` is only used to identify the node in the message.
void warnDoubleOutOfRange(double val, torch::jit::Node *n) {
  const bool fits_in_float = val <= std::numeric_limits<float>::max() &&
                             val >= std::numeric_limits<float>::lowest();
  if (fits_in_float) {
    return;
  }

  // Counter shared by all calls, handed to the logger with the message.
  static std::uint64_t log_repeat = 0;
  logging::warn(log_repeat,
                "{}: torch.float64 constant cannot be "
                "represented as a torch.float32",
                nodeToString(n));
}

// Emit a (rate limited) warning if `val` cannot be represented as a 32-bit
// signed integer. `n` is only used to identify the node in the message.
void warnLongOutOfRange(int64_t val, torch::jit::Node *n) {
  const bool fits_in_int32 = val <= std::numeric_limits<int32_t>::max() &&
                             val >= std::numeric_limits<int32_t>::lowest();
  if (fits_in_int32) {
    return;
  }

  // Counter shared by all calls, handed to the logger with the message.
  static std::uint64_t log_repeat = 0;
  logging::warn(log_repeat,
                "{}: torch.int64 constant cannot be represented "
                "as a torch.int32",
                nodeToString(n));
}

// Replace a bool / numeric prim::Constant by a scalar tensor constant.
// Bools become int32 tensors, floating point values float32 tensors and
// integers int32 tensors (flagged as wrapped numbers).
void handleNumberConstant(torch::jit::Graph *graph, torch::jit::Node *n) {
  if (n->output()->type()->isSubtypeOf(c10::BoolType::get())) {
    const bool flag = *torch::jit::constant_as<bool>(n->output());
    replaceWithConstantTensor(
        graph, n,
        at::native::scalar_tensor(flag, at::kInt, c10::nullopt, at::kCPU));
    return;
  }

  auto so = torch::jit::constant_as<at::Scalar>(n->output());
  ERROR_ON(!so.has_value());
  at::Scalar s = *so;

  c10::ScalarType dtype;
  if (s.isFloatingPoint()) {
    warnDoubleOutOfRange(s.toDouble(), n);
    dtype = at::kFloat;
  } else if (s.isIntegral(false)) {
    dtype = at::kInt;

    if (s.toLong() != std::numeric_limits<int64_t>::max()) {
      warnLongOutOfRange(s.toLong(), n);
    } else {
      // Handle magic number 9223372036854775807: it is mapped onto the
      // largest int32 value rather than reported as out of range.
      s = std::numeric_limits<int32_t>::max();
      logging::info("{}: Using max value for torch.int32 in place of max "
                    "value for torch.int64",
                    nodeToString(n));
    }
  } else {
    ERROR("Unsupported constant type");
  }

  auto wrapped_number =
      at::native::scalar_tensor(s, dtype, c10::nullopt, at::kCPU);
  // Mark the tensor as originating from a scalar.
  wrapped_number.unsafeGetTensorImpl()->set_wrapped_number(true);
  replaceWithConstantTensor(graph, n, wrapped_number);
}

// Replace a tensor prim::Constant by a tensor constant node, narrowing
// Double to Float and Long to Int on the way.
//
// The tensor is read either from the 'ts' attribute (through
// getNodeTensorAttrValue) or from the 't' attribute; any other attribute
// kind is an internal error.
void handleTensorConstant(torch::jit::Graph *graph, torch::jit::Node *n) {
  auto tensor_type = n->output()->type()->expect<c10::TensorType>();
  if (!tensor_type->scalarType().has_value()) {
    ERROR("Tensor constant without type");
  }

  at::Tensor tensor;
  if (n->kindOf(c10::attr::value) == torch::jit::AttributeKind::ts) {
    tensor = getNodeTensorAttrValue(n);
  } else {
    ERROR_ON_MSG(n->kindOf(c10::attr::value) != torch::jit::AttributeKind::t,
                 "[Internal] expected type 't' or 'ts' but got "
                     << torch::jit::toString(n->kindOf(c10::attr::value)));
    tensor = n->t(c10::attr::value);
  }
  ERROR_ON(!tensor.defined());
  const bool was_wrapped = tensor.unsafeGetTensorImpl()->is_wrapped_number();

  // The range check below reads the first element straight from the tensor
  // storage: a tensor without elements has nothing to read (and its data
  // pointer may be null), so the check must be skipped in that case.
  // NOTE(review): only the first element is inspected, so out of range
  // values further into the tensor are not reported.
  const bool has_elements = tensor.numel() > 0;

  if (tensor.scalar_type() == at::ScalarType::Double) {
    if (has_elements) {
      warnDoubleOutOfRange(
          *reinterpret_cast<double *>(tensor.unsafeGetTensorImpl()->data()),
          n);
    }
    tensor = tensor.to(at::ScalarType::Float);
  }
  if (tensor.scalar_type() == at::ScalarType::Long) {
    if (has_elements) {
      warnLongOutOfRange(
          *reinterpret_cast<int64_t *>(tensor.unsafeGetTensorImpl()->data()),
          n);
    }
    tensor = tensor.to(at::ScalarType::Int);
  }

  // This gets lost in conversion and changes implicit casting if not set
  // (Must use an if as set_wrapped_number(false) can cause a PyTorch internal
  // error)
  if (was_wrapped) {
    tensor.unsafeGetTensorImpl()->set_wrapped_number(true);
  }

  replaceWithConstantTensor(graph, n, tensor);
}

// Replace a string prim::Constant by a 1D Char tensor constant holding the
// characters of the string (without a terminating null).
void handleStringConstant(torch::jit::Graph *graph, torch::jit::Node *n) {
  const std::string text = n->s(c10::attr::value);
  const auto num_chars = static_cast<int64_t>(text.length());

  const auto options = at::dtype(at::ScalarType::Char)
                           .memory_format(c10::MemoryFormat::Contiguous);
  auto chars = at::empty({num_chars}, options);

  // One byte per character, copied straight into the tensor storage.
  std::memcpy(chars.data_ptr(), text.c_str(), text.length() * sizeof(char));
  replaceWithConstantTensor(graph, n, chars);
}

// Visit an ivalue which is a tuple or list constant and emit single type
// constant nodes and list/tuple constructs to replace it.
//
// The visitor is meant to be passed to c10::IValue::visit(): containers are
// pushed on a stack when first visited and the matching
// prim::TupleConstruct / prim::ListConstruct is created as soon as all their
// elements have been emitted. Empty containers (at any nesting level) are
// constructed immediately since no element visit will ever complete them.
class ListTupleVisitor {
  enum class State { IN_TUPLE, IN_LIST };

  // Maintain the information about the list or tuple at each level
  struct ListOrTupleInfo {
    ListOrTupleInfo(State state_, size_t elements_left_,
                    c10::TypePtr container_type_)
        : state(state_), elements_left(elements_left_),
          container_type(std::move(container_type_)) {}

    // Whether or not the visitor is currently in a list or a tuple
    State state;

    // The number of elements left to be visited (before a List/TupleConstruct)
    size_t elements_left;

    // The type of the list/tuple, preserved from first visit ahead of
    // constructing the list or tuple
    c10::TypePtr container_type;

    // All the nodes to be input to the List/TupleConstruct
    std::vector<torch::jit::Node *> container_nodes;
  };

public:
  explicit ListTupleVisitor(torch::jit::Graph *graph)
      : _graph(graph), _last_node(nullptr) {}

  // Visit one ivalue. We never return true as we visit every element.
  bool operator()(const c10::IValue &i_value) {
    if (i_value.isGenericDict()) {
      ERROR("Dicts are not supported in constant canonicalisation.");
    }

    if (i_value.isTuple() || i_value.isList()) {
      // Handle the visiting of a list or tuple: actual creation will happen
      // once all its elements have been visited
      handleListOrTuple(i_value);
    } else {
      // Handle an element which is not a tuple or list
      handleConstant(i_value);
    }

    // There will not be a further visit marking the completion of a tuple
    // or list, so this must be handled here: either after the final constant
    // of a container or straight after visiting an empty container.
    constructCompletedContainers();

    return false;
  }

  // All the prim::Constant nodes created for the leaf values, in visiting
  // order.
  const std::vector<torch::jit::Node *> &getAllConstNodes() {
    return _all_const_nodes;
  }

  // The List/TupleConstruct node of the outermost container.
  torch::jit::Node *getLastNode() {
    if (_last_node == nullptr) {
      // There is no last node: it means the list or tuple construction hasn't
      // been triggered. Construct whatever container is still pending; if
      // nothing was visited at all there is nothing to construct.
      ERROR_ON_MSG(_info_stack.empty(),
                   "[Internal] no list or tuple constant has been visited");
      handleTupleOrListConstruction();
      ERROR_ON(_last_node == nullptr);
    }
    return _last_node;
  }

private:
  // Handle a list or tuple: this involves merely recording the state, type
  // and number of elements as the inputs to a List/TupleConstruct will not have
  // been constructed at this point.
  void handleListOrTuple(const c10::IValue &i_value) {
    if (i_value.isTuple()) {
      _info_stack.emplace(State::IN_TUPLE, i_value.toTuple()->elements().size(),
                          i_value.type());
    } else {
      _info_stack.emplace(State::IN_LIST, i_value.toListRef().size(),
                          i_value.type());
    }
  }

  // Handle a tensor or numeric constant. This adds a constant of the same type
  // to the graph, which will later be canonicalised to a tensor constant.
  // Though this means that there will be an extra canonicalisation step, it
  // minimises code duplication. All constants are added to "_all_const_nodes"
  // for the later canonicalisation.
  void handleConstant(const c10::IValue &i_value) {
    ERROR_ON(_info_stack.empty());

    auto *new_const = _graph->create(c10::prim::Constant);

    if (i_value.isTensor()) {
      new_const->output()->inferTypeFrom(i_value.toTensor());
      setNodeTensorAttrValue(new_const, i_value.toTensor());
    } else if (i_value.isInt()) {
      new_const->output()->setType(c10::IntType::get());
      new_const->i_(c10::attr::value, i_value.toInt());
    } else if (i_value.isDouble()) {
      new_const->output()->setType(c10::FloatType::get());
      new_const->f_(c10::attr::value, i_value.toDouble());
    } else if (i_value.isBool()) {
      // Bools are stored as an integer attribute on a BoolType output.
      new_const->output()->setType(c10::BoolType::get());
      new_const->i_(
          c10::attr::value,
          static_cast<torch::jit::IntAttr::ConstructorType>(i_value.toBool()));
    } else if (i_value.isNone()) {
      // Assign NoneType so that the node can be skipped over
      // during constant canonicalization
      new_const->output()->setType(c10::NoneType::get());
    } else {
      ERROR("Unsupported type for constant: " << i_value);
    }

    insertNodeInGraph(_graph, new_const);
    _info_stack.top().container_nodes.push_back(new_const);
    _all_const_nodes.push_back(new_const);
    _info_stack.top().elements_left--;
  }

  // Construct every container at the top of the stack which has no element
  // left to visit. In a nested scenario this might construct more than one
  // container, e.g. (1, (2, (3, 4))) constructs three tuples after visiting 4.
  void constructCompletedContainers() {
    while (!_info_stack.empty() && _info_stack.top().elements_left == 0) {
      handleTupleOrListConstruction();
    }
  }

  // Handle the actual construction of the list or tuple at the top of the
  // stack once its last element has been visited.
  void handleTupleOrListConstruction() {
    torch::jit::Node *construct_node;

    switch (_info_stack.top().state) {
    case State::IN_TUPLE:
      construct_node = _graph->create(c10::prim::TupleConstruct);
      break;
    case State::IN_LIST:
      construct_node = _graph->create(c10::prim::ListConstruct);
      break;
    default:
      ERROR("Unreachable");
    }

    for (auto *element : _info_stack.top().container_nodes) {
      construct_node->addInput(element->output());
    }
    construct_node->output()->setType(_info_stack.top().container_type);
    insertNodeInGraph(_graph, construct_node);

    _info_stack.pop();

    if (!_info_stack.empty()) {
      ERROR_ON(_info_stack.top().elements_left < 1);
      _info_stack.top().elements_left--;

      // The container is itself an element of the previous container
      _info_stack.top().container_nodes.push_back(construct_node);
    } else {
      // Store the final node for access outside the visit
      _last_node = construct_node;
    }
  }

  torch::jit::Graph *_graph;
  // Containers currently being visited, innermost on top.
  std::stack<ListOrTupleInfo> _info_stack;
  std::vector<torch::jit::Node *> _all_const_nodes;
  // Construct node of the outermost container, set once it is complete.
  torch::jit::Node *_last_node;
};

// Expand a list/tuple prim::Constant `n` into individual constants plus
// List/TupleConstruct nodes, redirect the users of `n` to the outermost
// construct and canonicalise the newly created constants. The intermediate
// prim::Constant nodes are added to `to_delete`.
void handleListOrTuple(torch::jit::Graph *graph, torch::jit::Node *n,
                       std::unordered_set<torch::jit::Node *> *to_delete) {
  torch::jit::WithInsertPoint const insert_point(n);
  const WithNodeMetadata meta(n);

  // Use the visitor to turn the single list/tuple constant into many
  // constants and List/TupleConstructs.
  ListTupleVisitor visitor(graph);
  const auto &tuple_ivalue = n->ival(c10::attr::value);
  // The visitor is wrapped by reference so that its state survives the visit.
  const std::function<bool(const c10::IValue &)> visit_fn(
      std::reference_wrapper(visitor));
  tuple_ivalue.visit(visit_fn);

  // Find the very last node added and use it to replace the original node
  torch::jit::Value *replacement_out = visitor.getLastNode()->output();
  replacement_out->setType(n->output()->type());
  n->output()->replaceAllUsesWith(replacement_out);

  // The nodes added in the visitor match those of constants not in lists/tuples
  // *before* canonicalisation (to permit code reuse). Hence, we canonicalise
  // in the same way.
  for (auto *prim_const : visitor.getAllConstNodes()) {
    torch::jit::WithInsertPoint const insert_point_prim_const(prim_const);
    const WithNodeMetadata prim_meta(prim_const);

    // If there are NoneTypes we can skip those
    if (prim_const->output()->type() == c10::NoneType::get()) {
      continue;
    }

    const bool is_tensor =
        prim_const->output()->type()->isSubtypeOf(c10::TensorType::get());
    if (is_tensor) {
      handleTensorConstant(graph, prim_const);
    } else {
      handleNumberConstant(graph, prim_const);
    }
    to_delete->insert(prim_const);
  }
}

// Redirect every use of the outputs of `node_to_process` to either
// `host_side_replacement` or `ipu_side_replacement`, depending on what
// getUseOfNode() reports for the user. Users which are needed on both sides
// are cloned once per side and processed recursively; the original user is
// then scheduled for deletion through `to_delete`.
void recursivelySelectHostAndIPUSideConstants(
    torch::jit::Node *node_to_process, torch::jit::Node *host_side_replacement,
    torch::jit::Node *ipu_side_replacement,
    std::unordered_set<torch::jit::Node *> *to_delete) {
  for (size_t output_idx = 0; output_idx < node_to_process->outputs().size();
       output_idx++) {
    auto *output = node_to_process->output(output_idx);

    // Each branch below removes the first use from the list (by replacing or
    // removing the user's input), so always look at index 0 until none is
    // left.
    while (!output->uses().empty()) {
      auto use = output->uses()[0];
      switch (getUseOfNode(use.user)) {
      case UseOfNode::HostSideOnly:
        use.user->replaceInput(use.offset,
                               host_side_replacement->output(output_idx));
        break;
      case UseOfNode::PopARTOnly:
        use.user->replaceInput(use.offset,
                               ipu_side_replacement->output(output_idx));
        break;
      case UseOfNode::HostSideAndPopART:
        auto *graph = use.user->owningGraph();
        torch::jit::WithInsertPoint const insert_point(use.user);
        const WithNodeMetadata meta(use.user);

        // Clones keep the same inputs as the original user...
        auto same_value = [](torch::jit::Value *value) { return value; };

        // ...except for the input being processed, which is swapped for the
        // host side replacement in one clone and the IPU side replacement in
        // the other.
        auto *host_side_node = graph->createClone(use.user, same_value);
        host_side_node->replaceInput(use.offset,
                                     host_side_replacement->output(output_idx));

        insertNodeInGraph(graph, host_side_node);

        auto *ipu_side_node = graph->createClone(use.user, same_value);
        ipu_side_node->replaceInput(use.offset,
                                    ipu_side_replacement->output(output_idx));
        insertNodeInGraph(graph, ipu_side_node);

        // The users of the original user must in turn pick one of the clones.
        recursivelySelectHostAndIPUSideConstants(use.user, host_side_node,
                                                 ipu_side_node, to_delete);

        to_delete->insert(use.user);

        // Prevent further cloning: dropping all the inputs also removes the
        // use currently being processed, which lets the while loop progress.
        while (!use.user->inputs().empty()) {
          use.user->removeInput(0);
        }
        break;
      }
    }
  }
}

// Find any host_and_ipu_side_tensor_constant constants and perform the
// necessary splitting: each one is replaced by a host_side_tensor_constant
// and a tensor_constant holding the same tensor, and its users are
// redirected to the appropriate one.
void rectifyHostAndIPUSideConstants(
    torch::jit::Graph *graph,
    std::unordered_set<torch::jit::Node *> *to_delete) {
  logging::LogContext const ctx_func("rectifyHostAndIPUSideConstants");
  for (auto *node : graph->nodes()) {
    logging::LogContext const ctx("processing " + nodeToString(node));

    const bool needs_split =
        node->kind() == symbols::poptorch::host_and_ipu_side_tensor_constant;
    if (!needs_split) {
      continue;
    }

    auto t = getNodeTensorAttrValue(node);
    torch::jit::WithInsertPoint const insert_point(node);
    const WithNodeMetadata meta(node);

    // Create a constant node of the given kind holding a copy of the tensor.
    const auto make_constant = [&](const auto &kind) {
      torch::jit::Node *constant = createAndInsertNode(graph, kind);
      constant->output()->inferTypeFrom(t);
      setNodeTensorAttrValue(constant, t);
      return constant;
    };

    // Create two new nodes: the host side one first, then the IPU one.
    torch::jit::Node *host_side_node =
        make_constant(symbols::poptorch::host_side_tensor_constant);
    torch::jit::Node *ipu_node =
        make_constant(symbols::poptorch::tensor_constant);

    recursivelySelectHostAndIPUSideConstants(node, host_side_node, ipu_node,
                                             to_delete);

    to_delete->insert(node);
  }
}

// Starting from every host_side_tensor_constant, follow the uses of its
// outputs transitively and bypass any compiler state changing node found on
// the way: the users of such a node are rewired to the node's matching input
// and the node is added to `to_delete`.
void removeStateChangingNodesFromHostSideBranch(
    torch::jit::Graph *graph,
    std::unordered_set<torch::jit::Node *> *to_delete) {
  logging::LogContext const ctx_func(
      "removeStateChangingNodesFromHostSideBranch");
  for (auto *node : graph->nodes()) {
    logging::LogContext const ctx("processing " + nodeToString(node));
    if (node->kind() != symbols::poptorch::host_side_tensor_constant) {
      continue;
    }

    // Nodes already queued for this constant: a node reachable through
    // several paths only needs to be processed once, which avoids revisiting
    // whole sub-graphs for every path leading to them.
    std::unordered_set<torch::jit::Node *> visited{node};
    std::vector<torch::jit::Node *> to_process{node};
    while (!to_process.empty()) {
      auto *cur_node = to_process.back();
      to_process.pop_back();

      // Queue the users before rewiring anything below, so that the users of
      // a bypassed node are still visited.
      for (auto *output : cur_node->outputs()) {
        for (auto use : output->uses()) {
          if (visited.insert(use.user).second) {
            to_process.push_back(use.user);
          }
        }
      }

      if (!compilerStateChangingKind(cur_node->kind())) {
        continue;
      }

      // The number of outputs may be less e.g. begin_ipu_block, but otherwise
      // any output to be replaced must match the input for this to work.
      for (size_t output_idx = 0; output_idx < cur_node->outputs().size();
           output_idx++) {
        cur_node->output(output_idx)
            ->replaceAllUsesWith(cur_node->input(output_idx));
      }

      to_delete->insert(cur_node);
    }
  }
}

// Canonicalise `node` if it is a prim::Constant holding a value: numbers,
// tensors, strings, lists and tuples are each dispatched to their handler and
// the original node is added to `to_delete`. aten::size nodes only get their
// output retyped; any other node is left untouched.
void canonicaliseIfConstant(torch::jit::Graph *graph, torch::jit::Node *node,
                            std::unordered_set<torch::jit::Node *> *to_delete) {
  logging::LogContext const ctx("processing " + nodeToString(node));

  if (node->kind() == c10::aten::size) {
    // This will be made a constant in the size handler
    node->output()->setType(
        c10::TensorType::create(c10::ScalarType::Int, c10::nullopt, 1, false));
  }

  // Nothing to do if it's not a constant or if it doesn't have a value
  // (i.e is None)
  if (node->kind() != c10::prim::Constant) {
    return;
  }
  if (!node->hasAttribute(c10::attr::value)) {
    return;
  }

  const c10::TypePtr type = node->output()->type();
  const auto is_a = [&type](const auto &candidate) {
    return type->isSubtypeOf(candidate);
  };

  // Devices are not canonicalised either
  if (is_a(c10::DeviceObjType::get())) {
    return;
  }

  if (is_a(c10::NumberType::get()) || is_a(c10::BoolType::get())) {
    logging::LogContext const ctx2("handling as number constant");
    handleNumberConstant(graph, node);
  } else if (is_a(c10::TensorType::get())) {
    logging::LogContext const ctx2("handling as tensor constant");
    handleTensorConstant(graph, node);
  } else if (is_a(c10::StringType::get())) {
    logging::LogContext const ctx2("handling as string constant");
    handleStringConstant(graph, node);
  } else if (is_a(c10::ListType::ofBools())) {
    // Only known case is the result of an evaluated constexpr
    logging::LogContext const ctx2("handling as bool list constant");
    handleListOrTuple(graph, node, to_delete);
  } else if (is_a(c10::ListType::ofFloats())) {
    // Only known case is the result of an evaluated constexpr
    logging::LogContext const ctx2("handling as float list constant");
    handleListOrTuple(graph, node, to_delete);
  } else if (is_a(c10::ListType::ofInts())) {
    // Only known case is the result of an evaluated constexpr
    logging::LogContext const ctx2("handling as int list constant");
    handleListOrTuple(graph, node, to_delete);
  } else if (is_a(c10::ListType::ofTensors())) {
    // Only known case is the result of an evaluated constexpr
    logging::LogContext const ctx2("handling a tensor list constant");
    handleListOrTuple(graph, node, to_delete);
  } else if (is_a(c10::ListType::create(
                 c10::OptionalType::create(c10::TensorType::get())))) {
    logging::LogContext const ctx2("handling an optional tensor list constant");
    handleListOrTuple(graph, node, to_delete);
  } else if (type->cast<c10::TupleType>()) {
    handleListOrTuple(graph, node, to_delete);
  } else {
    ERROR("Unsupported type " << type->str());
  }

  // The original constant has been replaced and can be removed
  to_delete->insert(node);
}

} // namespace

// Entry point of the pass. Runs three phases over the graph, destroying the
// nodes each phase marked for deletion before starting the next one:
//  1. canonicalise every constant node,
//  2. split the constants needed both host side and on the IPU,
//  3. remove compiler state changing nodes from host side branches.
void canonicaliseConstants(torch::jit::Graph *graph) {
  logging::LogContext const ctx_func("CanonicaliseConstants");
  std::unordered_set<torch::jit::Node *> to_delete;

  // Phase 1
  for (auto *node : graph->nodes()) {
    canonicaliseIfConstant(graph, node, &to_delete);
  }
  searchAndPossiblyDestroy(to_delete);

  // Phase 2
  to_delete.clear();
  rectifyHostAndIPUSideConstants(graph, &to_delete);
  searchAndPossiblyDestroy(to_delete);

  // Phase 3
  to_delete.clear();
  removeStateChangingNodesFromHostSideBranch(graph, &to_delete);
  searchAndPossiblyDestroy(to_delete);
}

} // namespace type_and_constant_canonicalization
} // namespace poptorch


================================================
FILE: poptorch/source/type_and_constant_canonicalization/CastUnsupportedInputs.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

#include "torch/csrc/jit/ir/ir.h"

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/TypeAndConstantCanonicalization.hpp"
#include "poptorch/Utils.hpp"

#include "../PoptorchSymbols.hpp"

namespace poptorch {
namespace type_and_constant_canonicalization {
namespace {
// Insert a host_side_cast after a graph input whose scalar type has to be
// changed: BFloat16 inputs are cast to Half, other inputs to whatever
// coerceToSupportedType() returns. Inputs whose type is unchanged, or which
// have no user, are left alone.
void processInputTensor(torch::jit::Graph *graph, torch::jit::Value *input) {
  auto tensor_type = input->type()->expect<c10::TensorType>();
  const at::ScalarType current_type = tensor_type->scalarType().value();

  const at::ScalarType coerced_type = coerceToSupportedType(current_type);
  const bool is_bfloat16 = current_type == at::ScalarType::BFloat16;

  if (!is_bfloat16 && coerced_type == current_type) {
    // No need for a host side cast
    return;
  }
  const at::ScalarType new_type =
      is_bfloat16 ? at::ScalarType::Half : coerced_type;

  auto *earliest_user = findEarliestUser(input);
  if (earliest_user == nullptr) {
    logging::warn("Graph contains an unused input %{} : {}", input->debugName(),
                  *tensor_type);
    return;
  }

  // This is an identity op but used just to make sure the implicit cast
  // does not end up promoting to a Double/Long
  auto *cast_node = graph->create(symbols::poptorch::host_side_cast);
  insertNodeBeforeNode(cast_node, earliest_user);

  // Redirect the existing users first: the cast only becomes a user of the
  // input afterwards, so it is not redirected to its own output.
  input->replaceAllUsesWith(cast_node->output());
  cast_node->addInput(input);

  cast_node->output()->setType(tensor_type->withScalarType(new_type));
}
} // namespace

// Run processInputTensor() on every non-null entry of the collapsed graph
// input hierarchy.
void castUnsupportedInputs(torch::jit::Graph *graph) {
  for (auto *input : collapsedGraphInputHierachy(graph)) {
    if (input == nullptr) {
      // Null entries are skipped
      continue;
    }
    processInputTensor(graph, input);
  }
}

} // namespace type_and_constant_canonicalization
} // namespace poptorch


================================================
FILE: poptorch/source/type_and_constant_canonicalization/CheckAndChangeOutputTypes.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/ir.h>

#include <sstream>

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/TypeAndConstantCanonicalization.hpp"
#include "poptorch/Utils.hpp"

#include "../PoptorchSymbols.hpp"

namespace poptorch {
namespace type_and_constant_canonicalization {

namespace {
// Returns true for the scalar types accepted by this pass.
constexpr bool supportedType(const at::ScalarType type) {
  switch (type) {
  case at::ScalarType::Int:
  case at::ScalarType::Long:
  case at::ScalarType::Half:
  case at::ScalarType::Float:
  case at::ScalarType::Double:
  case at::ScalarType::Bool:
  case at::ScalarType::BFloat16:
  case at::ScalarType::Char:
  case at::ScalarType::Byte:
  case at::ScalarType::Short:
    return true;
  default:
    return false;
  }
}

// Returns true if `n` is a host_side_cast, or if it is a tuple/list unpack
// with an output whose single user (recursively) is.
bool isBeforeHostSideCast(const torch::jit::Node *n) {
  const auto kind = n->kind();

  const bool is_unpack =
      kind == c10::prim::TupleUnpack || kind == c10::prim::ListUnpack;
  if (!is_unpack) {
    // The presence or lack of a host_side_cast indicates whether to return
    // true or false
    return kind == symbols::poptorch::host_side_cast;
  }

  // Recurse through unpacks until we find a host_side_cast. Only outputs
  // with exactly one use are followed.
  for (const torch::jit::Value *output : n->outputs()) {
    const auto &uses = output->uses();
    if (uses.size() == 1 && isBeforeHostSideCast(uses[0].user)) {
      return true;
    }
  }

  // An unpack is not itself a host_side_cast
  return false;
}

// Warn that `unsupported_type` is not natively supported on IPU. The warning
// is emitted at most once per type name, and never for nodes whose kind is
// listed below as type agnostic.
void warnNonNativeSupport(torch::jit::Node *node,
                          const char *unsupported_type) {
  // Node kinds for which the type is inconsequential
  static const torch::jit::NodeKind type_agnostic_kinds[] = {
      c10::aten::argmax,
      c10::aten::argmin,
      c10::aten::contiguous,
      c10::aten::chunk,
      c10::aten::detach,
      c10::aten::expand,
      c10::aten::expand_as,
      c10::aten::flatten,
      c10::aten::ones,
      c10::aten::permute,
      c10::aten::reshape,
      c10::aten::roll,
      c10::aten::select,
      c10::aten::slice,
      c10::aten::split,
      c10::aten::stack,
      c10::aten::squeeze,
      c10::aten::transpose,
      c10::aten::unsqueeze,
      c10::aten::upsample_nearest1d,
      c10::aten::upsample_nearest2d,
      c10::aten::upsample_nearest3d,
      c10::aten::upsample_linear1d,
      c10::aten::upsample_bilinear2d,
      c10::aten::upsample_trilinear3d,
      c10::aten::upsample_bicubic2d,
      c10::aten::view,
      c10::aten::zeros,
      c10::prim::NumToTensor};

  const auto node_kind = node->kind();
  for (const auto &ignored_kind : type_agnostic_kinds) {
    if (node_kind == ignored_kind) {
      return;
    }
  }

  // Names of the types already warned about, shared by all calls.
  static std::unordered_set<std::string> warned_types;
  if (warned_types.find(unsupported_type) == warned_types.end()) {
    logging::warn(
        "{}: {} is not supported natively on IPU, loss of "
        "range/precision may occur. We will only warn on the first instance.",
        nodeToString(node), unsupported_type);
    warned_types.insert(unsupported_type);
  }
}

// If `current_type` has scalar type `unsupported_dtype`, retypes `output` to
// `replacement_dtype` and, when `node` is an aten::to with a recognised arity,
// rewrites the dtype constant it casts to.
//
// `torch_type_str` is the printable name of the unsupported type, passed on to
// warnNonNativeSupport(). prim::Constant nodes are not retyped here.
void maybeReplaceOutputType(torch::jit::Node *node, torch::jit::Value *output,
                            c10::TensorType *current_type,
                            const at::ScalarType unsupported_dtype,
                            const at::ScalarType replacement_dtype,
                            const char *torch_type_str) {
  if (current_type->scalarType() != unsupported_dtype) {
    return;
  }

  // Constants will be retyped later
  const bool is_constant = node->kind() == c10::prim::Constant;
  if (!is_constant) {
    warnNonNativeSupport(node, torch_type_str);
    output->setType(current_type->withScalarType(replacement_dtype));
  }

  // Only casts need their dtype argument patched
  if (node->kind() != c10::aten::to) {
    return;
  }

  // Possible locations of dtype int depending on the aten::to arity
  size_t dtype_index = 0;
  switch (node->inputs().size()) {
  case 5:
  case 8:
    dtype_index = 1;
    break;
  case 6:
    dtype_index = 2;
    break;
  default:
    // Must be another aten::to signature
    return;
  }

  torch::jit::Value *dtype_value = node->input(dtype_index);
  auto int_type = dtype_value->type()->cast<c10::IntType>();
  ERROR_ON_MSG(!int_type, "Expected integer type as dtype input at index "
                              << dtype_index << " for "
                              << nodeToString(node));

  const auto replacement = static_cast<int>(replacement_dtype);
  auto *dtype_node = dtype_value->node();

  if (dtype_value->uses().size() == 1) {
    // This cast is the constant's only user: overwrite its value in place
    dtype_node->i_(c10::attr::value, replacement);
  } else {
    // The constant is shared with other users: give this cast its own copy
    auto no_inputs = [](torch::jit::Value *value) {
      ERROR("A constant should have no inputs");
      return value; // ensures correct output type
    };
    auto *dtype_clone = node->owningGraph()->createClone(dtype_node, no_inputs);
    dtype_clone->i_(c10::attr::value, replacement);
    node->replaceInput(dtype_index, dtype_clone->output());

    insertNodeBeforeNode(dtype_clone, node);
  }

  logging::info("Replacing cast to {} with cast to {} for {}",
                c10::toString(unsupported_dtype),
                c10::toString(replacement_dtype), nodeToString(node));
}

// Validates the dtype of one tensor output of `node` and retypes it when it is
// Double, Long or BFloat16 (to Float, Int and Half respectively). Non-tensor
// outputs are left alone; unknown or unsupported dtypes raise an error.
void checkAndChangeOutputTypesForOutput(torch::jit::Node *node,
                                        torch::jit::Value *output) {
  auto tensor_type = output->type()->cast<c10::TensorType>();

  // Ignore other return types e.g.  NumberTypes for constants
  if (!tensor_type) {
    return;
  }

  ERROR_ON_MSG(!tensor_type->scalarType().has_value(),
               "Returning an unknown tensor dtype is not supported.\n");

  ERROR_ON_MSG(!supportedType(*tensor_type->scalarType()),
               "Returning a torch." << c10::toString(*tensor_type->scalarType())
                                    << " is not supported.\n");

  // (unsupported dtype, replacement dtype, printable name), tried in order.
  struct Retype {
    at::ScalarType from;
    at::ScalarType to;
    const char *name;
  };
  const Retype retypes[] = {
      {at::ScalarType::Double, at::ScalarType::Float, "torch.float64"},
      {at::ScalarType::Long, at::ScalarType::Int, "torch.int64"},
      {at::ScalarType::BFloat16, at::ScalarType::Half, "torch.bfloat16"},
  };

  // Every attempt is checked against the type the output had on entry.
  c10::TensorType *const type_on_entry = tensor_type.get();
  for (const Retype &retype : retypes) {
    maybeReplaceOutputType(node, output, type_on_entry, retype.from, retype.to,
                           retype.name);
  }
}
} // namespace

// Runs checkAndChangeOutputTypesForOutput() on every output of every node of
// `graph`, skipping the nodes reported by isBeforeHostSideCast().
void checkAndChangeOutputTypes(torch::jit::Graph *graph) {
  logging::LogContext const ctx_func("CheckAndChangeOutputTypes");
  for (auto *node : graph->nodes()) {
    // Some unpacks will happen before the host side cast, so ignore them here
    if (!isBeforeHostSideCast(node)) {
      logging::LogContext const ctx("processing " + nodeToString(node));

      for (auto *node_output : node->outputs()) {
        logging::LogContext const ctx_2(node_output->debugName());

        checkAndChangeOutputTypesForOutput(node, node_output);
      }
    }
  }
}

} // namespace type_and_constant_canonicalization
} // namespace poptorch


================================================
FILE: poptorch/source/type_and_constant_canonicalization/EvaluateConstexprs.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/runtime/interpreter.h>

#include <algorithm>
#include <iterator>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "../PoptorchSymbols.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/TypeAndConstantCanonicalization.hpp"
#include "poptorch/Utils.hpp"

namespace poptorch {
namespace type_and_constant_canonicalization {
namespace {

// Counts the nodes returned by g->nodes().
size_t numNodesInGraph(const torch::jit::Graph *g) {
  size_t num_nodes = 0;
  for (const auto *node : g->nodes()) {
    (void)node; // only the count matters
    num_nodes++;
  }
  return num_nodes;
}

// Sums the number of outputs over the nodes returned by g->nodes().
size_t numValuesInGraph(const torch::jit::Graph *g) {
  auto nodes = g->nodes();
  size_t total_outputs = 0;
  for (auto it = nodes.begin(); it != nodes.end(); it++) {
    total_outputs += (*it)->outputs().size();
  }
  return total_outputs;
}

// Integer node attribute used as a temporary flag: a value > 0 means the node
// must not be copied into the constexpr graph (see markForExclusion /
// isMarkedForExclusion / unmarkForExclusion below).
const c10::Symbol exclude_node_attr = c10::Symbol::attr("exclude_node");

// Flags `node` as excluded by setting its exclude_node attribute to 1.
void markForExclusion(torch::jit::Node *node) {
  node->i_(exclude_node_attr, 1);
}

// Marks `node`, and transitively every node producing one of its inputs, for
// exclusion. The walk stops at prim::Param nodes and at nodes that already
// carry the exclusion flag.
void recursivelyMarkInputsForExclusion(torch::jit::Node *node) {
  const bool is_param = node->kind() == c10::prim::Param;
  if (is_param || (node->hasAttribute(exclude_node_attr) &&
                   node->i(exclude_node_attr) > 0)) {
    return;
  }

  markForExclusion(node);
  for (auto *operand : node->inputs()) {
    recursivelyMarkInputsForExclusion(operand->node());
  }
}

// True when `node` carries the exclude_node attribute with a value above zero.
bool isMarkedForExclusion(torch::jit::Node *node) {
  if (!node->hasAttribute(exclude_node_attr)) {
    return false;
  }
  return node->i(exclude_node_attr) > 0;
}

// Removes the exclude_node attribute from `node`. The only caller in this file
// checks isMarkedForExclusion() first.
void unmarkForExclusion(torch::jit::Node *node) {
  node->removeAttribute(exclude_node_attr);
}

// Evaluates the constant expressions of a graph ahead of time: the nodes which
// can be computed without graph inputs are cloned into a separate graph, that
// graph is run with the JIT interpreter, and the results are written back into
// the original graph as constants. evaluate() may only be called once.
class ConstExprEvaluator {
public:
  // Both maps are created with a bucket count taken from the size of the
  // graph.
  // Guarantees no re-hashing: does not matter if the hash map is sparse
  explicit ConstExprEvaluator(torch::jit::Graph *g)
      : _graph(g), _nodes_map(numNodesInGraph(g)),
        _values_map(numValuesInGraph(g)) {}

  // Runs the whole pass; raises an error if it has already been run.
  void evaluate();

private:
  // Flags nodes inside for-loop / if / else sub-graphs, and the producers of
  // the sub-graph start nodes' inputs, so that they are not evaluated.
  void markSubgraphNodesForExclusion();

  // Clones every eligible node into _constexpr_graph and registers as outputs
  // the cloned values consumed by nodes that were not cloned.
  void copyAllConstNodesToConstexprGraph();

  // Clears the temporary exclusion flags from the original graph.
  void removeExclusionAttributes();

  // Inserts a cast to Long on the index operand of scatter-like and
  // take_along_dim nodes in the constexpr graph.
  void addNodeInputArgUpcast(torch::jit::Node *new_node);

  // Drops input-less nodes whose only use in the constexpr graph is as a graph
  // output.
  void removeLoneConstants();

  // Runs the constexpr graph; the results are left on `stack`.
  void evaluateConstExprGraph(torch::jit::Stack *stack);

  // Replaces the recorded values of the original graph with constants built
  // from `stack`.
  void replaceWithConstants(const torch::jit::Stack &stack);

  // Destroys the original-graph nodes which were cloned.
  void removeUnusedNodes();

  // Whether `node` can be cloned into the constexpr graph.
  bool nodeIsConstExpr(const torch::jit::Node &node) const;

  // Clones `node` into the constexpr graph and records the node/value mapping.
  void copyNodeToConstexprGraph(torch::jit::Node *node);

  // NOTE(review): no definition or caller of this function appears in this
  // file - confirm whether the declaration is still needed.
  static void setAmbiguousValuesToFloatOrHalf(torch::jit::Value *value);

  // Original graph
  torch::jit::Graph *_graph;

  // Graph containing constant expressions which can be evaluated
  std::shared_ptr<torch::jit::Graph> _constexpr_graph;

  // Map the nodes and inputs between the two graphs
  // original -> constexpr
  std::unordered_map<const torch::jit::Node *, torch::jit::Node *> _nodes_map;
  std::unordered_map<const torch::jit::Value *, torch::jit::Value *>
      _values_map;

  // Keep a list of the values in the original graph to be replaced by constants
  std::unordered_set<torch::jit::Value *> _ins_to_make_consts;
};

// Entry point of the pass. The steps below depend on each other and must run
// in this order. Raises an error when called a second time on the same
// evaluator (the constexpr graph already exists).
void ConstExprEvaluator::evaluate() {
  ERROR_ON_MSG(_constexpr_graph,
               "ConstExprEvaluator::evaluate should only be run once");
  _constexpr_graph = std::make_shared<torch::jit::Graph>();

  // Copy all nodes which can be evaluated as a constant expression into a new
  // graph with exception of subgraph nodes. In addition, set outputs of the
  // new graph where required
  markSubgraphNodesForExclusion();
  copyAllConstNodesToConstexprGraph();
  removeExclusionAttributes();

  // We do not want to evaluate lone constants only to replace them with
  // identical constants
  removeLoneConstants();

  // Evaluate the constexpr graph saving the outputs to stack
  torch::jit::Stack stack;
  evaluateConstExprGraph(&stack);

  // Replace outputs in the original graph, with the constants calculated from
  // the constexpr graph
  replaceWithConstants(stack);

  // Remove nodes which are now unused, in the original graph
  removeUnusedNodes();
}

// Flags every node located between a sub-graph start marker (for loop, if
// block, else block) and its end marker, together with the producers of each
// start marker's input, so that none of them is treated as a constant
// expression. Raises an error if the markers are not balanced.
void ConstExprEvaluator::markSubgraphNodesForExclusion() {
  // Number of sub-graphs entered but not yet closed at the current node.
  int num_unclosed_subgraphs = 0;
  for (auto *node : _graph->nodes()) {
    const auto kind = node->kind();
    const bool opens_subgraph = kind == symbols::poptorch::start_for_loop ||
                                kind == symbols::poptorch::start_if_block ||
                                kind == symbols::poptorch::start_else_block;

    if (opens_subgraph) {
      num_unclosed_subgraphs++;
      // All nodes that eventually end up as subgraph inputs also need
      // to be excluded.
      recursivelyMarkInputsForExclusion(node->input()->node());
    } else if (kind == symbols::poptorch::end_for_loop) {
      ERROR_ON(num_unclosed_subgraphs <= 0);
      num_unclosed_subgraphs--;
    } else if (kind == symbols::poptorch::end_if_block) {
      ERROR_ON(num_unclosed_subgraphs <= 0);
      // if..else block stores 2 subgraphs, one for each branch.
      num_unclosed_subgraphs -= 2;
    } else if (num_unclosed_subgraphs > 0) {
      // Ordinary node located inside at least one open sub-graph
      markForExclusion(node);
    }
  }
  ERROR_ON(num_unclosed_subgraphs != 0);
}

void ConstExprEvaluator::copyAllConstNodesToConstexprGraph() {
  logging::LogContext const ctx_func("ConstExprEvaluator");
  std::vector<torch::jit::Node *> nodes_plus_return;
  for (auto *node : _graph->nodes()) {
    nodes_plus_return.push_back(node);
  }
  nodes_plus_return.push_back(_graph->return_node());

  for (auto *node : nodes_plus_return) {
    logging::LogContext const ctx("processing " + nodeToString(node));

    if (!isMarkedForExclusion(node) && nodeIsConstExpr(*node)) {
      copyNodeToConstexprGraph(node);
    } else {
      for (auto *input : node->inputs()) {
        // Add any outputs to the const expression graph
        if (_values_map.count(input) == 1 &&
            _ins_to_make_consts.count(input) == 0) {
          _ins_to_make_consts.emplace(input);
          _constexpr_graph->registerOutput(_values_map[input]);
        }
      }
    }
  }
  logging::trace("Constexpr graph: {}", *_constexpr_graph);
}

// Strips the temporary exclusion flag from every flagged node of the original
// graph.
void ConstExprEvaluator::removeExclusionAttributes() {
  for (auto *node : _graph->nodes()) {
    if (!isMarkedForExclusion(node)) {
      continue;
    }
    unmarkForExclusion(node);
  }
}

namespace {

// Returns the position of the operand which addInputUpcast() should cast to
// Long for `new_node`: 2 for the scatter family, 1 for take_along_dim, and
// nothing for any other node kind.
std::optional<size_t> getUpcastIndexArg(torch::jit::Node *new_node) {
  const auto kind = new_node->kind();

  const bool is_aten_scatter =
      kind == c10::aten::scatter || kind == c10::aten::scatter_ ||
      kind == c10::aten::scatter_add || kind == c10::aten::scatter_add_ ||
      kind == c10::aten::scatter_reduce || kind == c10::aten::scatter_reduce_;
  const bool is_torch_scatter = kind == torch_scatter::scatter_max ||
                                kind == torch_scatter::scatter_min ||
                                kind == torch_scatter::scatter_mul;

  std::optional<size_t> index_arg;
  if (is_aten_scatter || is_torch_scatter) {
    index_arg = 2;
  } else if (kind == c10::aten::take_along_dim) {
    index_arg = 1;
  }
  return index_arg;
}

// Inserts a cast to Long (created by createAndInsertCastOp in `graph`) in
// front of operand `arg_index` of `new_node`.
//
// Only the operand at `arg_index` is redirected to the cast. Replacing by
// value instead would also rewrite any other operand of `new_node` that
// happens to be the same value.
void addInputUpcast(torch::jit::Graph *graph, torch::jit::Node *new_node,
                    std::size_t arg_index) {
  torch::jit::Value *input = new_node->input(arg_index);
  torch::jit::Node *cast =
      createAndInsertCastOp(graph, input, at::ScalarType::Long);
  new_node->replaceInput(arg_index, cast->output());
}

} // namespace

// Upcasts the index operand of `new_node` inside the constexpr graph when
// getUpcastIndexArg() names one; otherwise does nothing.
void ConstExprEvaluator::addNodeInputArgUpcast(torch::jit::Node *new_node) {
  const std::optional<size_t> index = getUpcastIndexArg(new_node);
  if (!index.has_value()) {
    return;
  }
  addInputUpcast(_constexpr_graph.get(), new_node, *index);
}

// Removes from the constexpr graph every cloned node that has no inputs, a
// single output, and whose only use is as an output of the constexpr graph.
// The matching entries are dropped from the bookkeeping maps, so the original
// node is neither replaced nor destroyed later on.
void ConstExprEvaluator::removeLoneConstants() {
  for (auto *node : _graph->nodes()) {
    // Only input-less, single-output nodes are candidates
    if (!node->inputs().empty() || node->outputs().size() != 1) {
      continue;
    }

    const auto mapped = _nodes_map.find(node);
    if (mapped == _nodes_map.end()) {
      continue;
    }

    auto *clone = mapped->second;
    // Copy of the use list: erasing the graph output below modifies the
    // original list.
    auto clone_uses = clone->output()->uses();
    if (clone_uses.size() != 1 ||
        clone_uses[0].user != _constexpr_graph->return_node()) {
      continue;
    }

    // The node is on its own in the constexpr graph and there is no point
    // replacing it with another single node
    _constexpr_graph->eraseOutput(clone_uses[0].offset);
    clone->destroy();

    _nodes_map.erase(node);
    _values_map.erase(node->output());
    _ins_to_make_consts.erase(node->output());
  }
}

// Runs the constexpr graph through the JIT interpreter using `stack`, then
// checks that the stack holds exactly one entry per value recorded in
// _ins_to_make_consts.
void ConstExprEvaluator::evaluateConstExprGraph(torch::jit::Stack *stack) {
  const torch::jit::Code compiled(_constexpr_graph, "");
  torch::jit::InterpreterState interpreter(compiled);
  interpreter.run(*stack);

  ERROR_ON(_ins_to_make_consts.size() != stack->size());
}

// For every value recorded in _ins_to_make_consts, fetches the result computed
// for its clone from `stack`, inserts it as a constant at the position of the
// value's producer in the original graph, and redirects all uses of the value
// to that constant. Tensor results are made contiguous first.
//
// Raises an error if a recorded value has no clone, or if its clone is not an
// output of the constexpr graph, rather than silently using a default entry.
void ConstExprEvaluator::replaceWithConstants(const torch::jit::Stack &stack) {
  // Cache the mapping of output value to stack output index
  const auto constexpr_outputs = _constexpr_graph->outputs();
  std::unordered_map<const torch::jit::Value *, size_t>
      constexpr_value_to_out_idx;
  constexpr_value_to_out_idx.reserve(constexpr_outputs.size());
  for (size_t idx = 0; idx < constexpr_outputs.size(); idx++) {
    constexpr_value_to_out_idx[constexpr_outputs[idx]] = idx;
  }

  for (auto *value : _ins_to_make_consts) {
    // Find the matching stack output for the input from the constexpr
    const auto constexpr_value = _values_map.find(value);
    ERROR_ON_MSG(constexpr_value == _values_map.end(),
                 "Value to replace has no counterpart in the constexpr graph");

    const auto out_idx =
        constexpr_value_to_out_idx.find(constexpr_value->second);
    ERROR_ON_MSG(out_idx == constexpr_value_to_out_idx.end(),
                 "Value to replace is not an output of the constexpr graph");

    // Obtain the resolved value from the stack
    auto resolved_value = stack.at(out_idx->second);

    if (resolved_value.isTensor()) {
      resolved_value = resolved_value.toTensor().contiguous();
    }

    // Insert a constant to replace the original node and replace all uses
    torch::jit::WithInsertPoint const insert_point(value->node());
    const WithNodeMetadata meta(value->node());
    torch::jit::Value *new_const = insertConstant(_graph, resolved_value);
    value->replaceAllUsesWith(new_const);
  }
}

// A node qualifies as a constant expression when it has at least one output,
// is not update_param_inplace, is neither non-deterministic nor
// side-effecting, and every one of its inputs has already been cloned into the
// constexpr graph (trivially true for a node without inputs).
bool ConstExprEvaluator::nodeIsConstExpr(const torch::jit::Node &node) const {
  // If a node has no outputs, it may be a sentinel
  const bool has_outputs = !node.outputs().empty();

  // update_param_inplace has an output but will fail on node.hasSideEffects(),
  // so it has to be rejected before that query is made
  if (!has_outputs ||
      node.kind() == symbols::poptorch::update_param_inplace) {
    return false;
  }

  // Random nodes or nodes with side effects cannot be constants
  if (isNondeterministic(node) || node.hasSideEffects()) {
    return false;
  }

  // Either the node has no inputs, or all inputs are outputs of nodes already
  // copied to the constexpr graph
  const auto operands = node.inputs();
  return std::all_of(operands.begin(), operands.end(),
                     [this](const torch::jit::Value *operand) {
                       return _values_map.count(operand) != 0;
                     });
}

// Destroys every node of the original graph that was cloned into the constexpr
// graph (i.e. that still has an entry in _nodes_map).
void ConstExprEvaluator::removeUnusedNodes() {
  // Iterate in reverse so that each node has no users by the time it is
  // destroyed. The walk starts at rbegin(), so it has to stop at the matching
  // reverse sentinel rend() rather than the forward end().
  for (auto node_it = _graph->nodes().rbegin();
       node_it != _graph->nodes().rend(); node_it++) {
    if (_nodes_map.count(*node_it) != 0u) {
      node_it.destroyCurrent();
    }
  }
}

// Clones `node` into the constexpr graph, wiring the clone's inputs to the
// already cloned values, and records the node and output mappings
// (original -> constexpr). The statement order below matters: the clone is
// adjusted (device constants, index upcast) before it is inserted, and the
// view -> reshape swap happens before the mappings are stored.
void ConstExprEvaluator::copyNodeToConstexprGraph(torch::jit::Node *node) {
  // Inputs are translated through _values_map; the caller only gets here when
  // nodeIsConstExpr() found every input in that map.
  auto *new_node = _constexpr_graph->createClone(
      node, [this](torch::jit::Value *v) { return this->_values_map[v]; },
      false);

  for (auto *input : new_node->inputs()) {
    auto maybe_device = input->type()->cast<c10::DeviceObjType>();
    if (maybe_device) {
      // All code should be running on CPU here
      input->node()->s_(c10::attr::value, "cpu");
    }
  }

  const WithNodeMetadata meta(new_node);

  // May insert a cast to Long in front of the index operand
  addNodeInputArgUpcast(new_node);

  insertNodeInGraph(_constexpr_graph.get(), new_node);

  // The CPU backend in some case (e.g aten::expand) will alter a tensor's
  // strides, whereas the IPU will always keep all the tensors contiguous.
  // This means all reshapes can be lowered to view ops on the IPU, but
  // not necessarily on the CPU, so just to be safe we replace all view
  // ops by reshape ops in the const expr graph.
  if (new_node->kind() == c10::aten::view) {
    auto *view = new_node;
    new_node = view->replaceWithNewSymbol(c10::aten::reshape);
    view->destroy();
  }

  _nodes_map[node] = new_node;
  // Map the old outputs to the new
  const auto *old_it = node->outputs().begin();
  const auto *new_it = new_node->outputs().begin();
  for (; old_it != node->outputs().end(); old_it++, new_it++) {
    // The clone must expose at least as many outputs as the original
    ERROR_ON(new_it == new_node->outputs().end());
    _values_map[*old_it] = *new_it;
  }
}

} // namespace

// Public entry point: runs a one-shot ConstExprEvaluator over `graph`.
void evaluateConstexprs(torch::jit::Graph *graph) {
  ConstExprEvaluator(graph).evaluate();
}

} // namespace type_and_constant_canonicalization
} // namespace poptorch


================================================
FILE: poptorch/source/type_and_constant_canonicalization/MakeConstantIntParams.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include <torch/csrc/jit/ir/ir.h>

#include <algorithm>

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch/TypeAndConstantCanonicalization.hpp"
#include "poptorch/Utils.hpp"

namespace poptorch {
namespace type_and_constant_canonicalization {

// Turns every used, non-floating-point tensor parameter of `graph` into a
// constant.
//
// The parameters are the trailing graph inputs: there is one per entry of
// `traced_parameter_tensors`, and `parameter_names` holds their names in the
// same order. For each parameter whose scalar type is not floating point and
// which has at least one user, a constant holding the tensor (converted to Int
// when it is Long) is inserted before the earliest user, all uses are
// redirected to it, and the parameter is removed from the graph inputs,
// `parameter_names` and `traced_parameter_tensors`.
//
// Unused parameters and floating point parameters are left untouched. A
// warning is logged for parameters that are not tensors.
void makeConstantIntParams(torch::jit::Graph *graph,
                           std::vector<std::string> &parameter_names,
                           std::vector<at::Tensor> &traced_parameter_tensors) {
  logging::LogContext ctx_func("makeConstantIntParams");

  // The subtraction below would wrap around otherwise.
  ERROR_ON_MSG(traced_parameter_tensors.size() > graph->inputs().size(),
               "More parameter tensors than graph inputs");

  // _parameters in Lower to popart is traced_parameter_tensors here.
  const std::size_t num_inputs =
      graph->inputs().size() - traced_parameter_tensors.size();

  std::vector<std::size_t> to_delete;

  // The input index is the loop variable, so every early `continue` below
  // still advances to the next parameter and keeps the value / name / tensor
  // triple aligned.
  for (std::size_t index = num_inputs; index < graph->inputs().size();
       ++index) {
    torch::jit::Value *value = graph->inputs()[index];
    const std::size_t param_index = index - num_inputs;

    logging::LogContext ctx("processing " + parameter_names[param_index]);

    if (value->type()->kind() != c10::TypeKind::TensorType) {
      // There is no known case of a parameter or buffer being a type other than
      // TensorType after tracing. Log a warning to assist debugging if a case
      // is found.
      logging::warn("Non tensor parameter/buffer identified: {}",
                    parameter_names[param_index]);
      continue;
    }

    auto tensor_type = value->type()->expect<c10::TensorType>();
    auto current_type = tensor_type->scalarType().value();
    if (c10::isFloatingType(current_type)) {
      continue;
    }

    // Some nodes might not be used, we skip them if so.
    torch::jit::Node *earliest_user = findEarliestUser(value);
    if (earliest_user == nullptr) {
      continue;
    }

    torch::jit::WithInsertPoint insert_point(earliest_user);

    // _parameters in Lower to popart is traced_parameter_tensors here.
    auto tensor = traced_parameter_tensors[param_index];
    if (current_type == at::ScalarType::Long) {
      tensor = tensor.to(at::ScalarType::Int);
    }

    auto *new_node = tensorToConstant(graph, tensor);
    logging::trace("makeConstantIntParams removing graph input %{} and "
                   "adding constant value %{}",
                   value->debugName(), new_node->output()->debugName());

    // Each replaceInput() removes one entry from the use list of `value`, so
    // keep taking the first use (by copy) until none is left.
    while (!value->uses().empty()) {
      const auto use = value->uses().front();
      use.user->replaceInput(use.offset, new_node->output());
    }

    to_delete.push_back(index);
  }

  // Delete highest index first not to invalidate the later indices.
  ERROR_ON(!std::is_sorted(to_delete.begin(), to_delete.end()));

  for (auto it = to_delete.rbegin(); it != to_delete.rend(); ++it) {
    const std::size_t del_i = *it;
    const std::size_t del_i_params = del_i - num_inputs;

    parameter_names.erase(parameter_names.begin() + del_i_params);
    traced_parameter_tensors.erase(traced_parameter_tensors.begin() +
                                   del_i_params);
    graph->eraseInput(del_i);
  }
}

} // namespace type_and_constant_canonicalization
} // namespace poptorch


================================================
FILE: poptorch_compiler/pytorch_bridge/CMakeLists.txt
================================================
# Builds the poptorch_compiler shared library from IpuSession.cpp and installs
# it together with its public headers.

# Every file under include/ matching *.hpp* is treated as a public header.
file(GLOB_RECURSE poptorch_compiler_public_headers "${CMAKE_CURRENT_SOURCE_DIR}/include/*.hpp*")

add_library(poptorch_compiler SHARED
  IpuSession.cpp
)

# NOTE(review): the public header pytorch_bridge/IpuSession.hpp includes
# <poptorch_logging/Error.hpp>, yet poptorch_logging is linked PRIVATE here -
# confirm consumers get its include path some other way.
target_link_libraries(poptorch_compiler
  PRIVATE
    poptorch_logging
)

set_property(TARGET poptorch_compiler PROPERTY CXX_STANDARD 17)

set_target_properties(poptorch_compiler PROPERTIES
  PUBLIC_HEADER "${poptorch_compiler_public_headers}")

# Headers come from the source tree when building and from <prefix>/include
# once installed.
target_include_directories(poptorch_compiler
                            PUBLIC
                            $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
                            $<INSTALL_INTERFACE:include>
                            )
# The public headers are installed into a pytorch_bridge/ sub-directory.
install(TARGETS poptorch_compiler
  LIBRARY
    DESTINATION ${CMAKE_INSTALL_LIBDIR}
  PUBLIC_HEADER
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/pytorch_bridge
)


================================================
FILE: poptorch_compiler/pytorch_bridge/IpuSession.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include "pytorch_bridge/IpuSession.hpp"

#include <chrono>
#include <memory>
#include <thread>
#include <utility>

#include "pytorch_bridge/DebugInfo.hpp"

#include <poptorch_logging/Logging.hpp>

namespace poptorch_ir {

namespace {

// Returns the size in bytes of one element of `element_type`. Raises an error
// for Type::NONE and Type::UNDEFINED, which have no size.
std::size_t dataSize(Type element_type) {
  switch (element_type) {
  // 4-byte types
  case Type::FLOAT:
  case Type::INT:
  case Type::UNSIGNED_INT:
    return 4;
  // 2-byte types
  case Type::HALF:
  case Type::BFLOAT16:
  case Type::SHORT:
  case Type::UNSIGNED_SHORT:
    return 2;
  // 1-byte types
  case Type::BOOL:
  case Type::CHAR:
  case Type::UNSIGNED_CHAR:
    return 1;
  // No element size: fall through to the error below
  case Type::NONE:
  case Type::UNDEFINED:
    break;
  }
  ERROR("No type");
}

// IIpuSession implementation in which every buffer is a host-side
// std::vector<char>; all the "device" copies below are plain memory copies.
class StaticIpuSession : public IIpuSession {
public:
  // Allocates a buffer of element size * number of elements bytes for `type`.
  Buffer allocate(const TensorType &type) override {
    auto data_size = dataSize(type.element_type) * type.getNumElements();
    return Buffer(std::make_shared<std::vector<char>>(data_size));
  }

  // Copies as many bytes as `ipu_dest` holds from `cpu_data` into `ipu_dest`.
  // `cpu_data` must therefore point to at least that many readable bytes.
  void copyDataFromCpuSource(Buffer &ipu_dest, const char *cpu_data) override {
    const auto &ipu_data = ipu_dest.getCpuData();
    ERROR_ON(!ipu_data);
    std::copy(cpu_data, cpu_data + ipu_data->size(), ipu_data->data());
  }

  // Copies the whole content of `ipu_src` to `cpu_dest`, which must have room
  // for it.
  void copyDataToCpu(char *cpu_dest, Buffer &ipu_src) override {
    const auto &ipu_data = ipu_src.getCpuData();
    ERROR_ON(!ipu_data);
    std::copy(ipu_data->data(), ipu_data->data() + ipu_data->size(), cpu_dest);
  }

  // Copies the content of `src` into `dest`. Both buffers must hold storage of
  // the same size, otherwise an error is raised.
  void copyDataOnDevice(Buffer &dest, const Buffer &src) override {
    const auto &dest_data = dest.getCpuData();
    const auto &src_data = src.getCpuData();
    // Same null checks as the other copy functions: the shared pointers are
    // dereferenced below.
    ERROR_ON(!dest_data);
    ERROR_ON(!src_data);
    ERROR_ON(dest_data->size() != src_data->size());
    if (dest_data == src_data) {
      // Both buffers share the same storage: nothing to copy, and std::copy
      // must not be given a destination inside its source range.
      return;
    }
    std::copy(src_data->data(), src_data->data() + src_data->size(),
              dest_data->data());
  }
};

} // namespace
// Takes ownership of `buf` as the buffer's storage.
Buffer::Buffer(CpuBuffer buf) noexcept : _store(std::move(buf)) {}
// Replaces whatever the buffer currently holds with `buf`.
Buffer &Buffer::operator=(CpuBuffer buf) noexcept {
  _store = std::move(buf);
  return *this;
}
// Returns the host storage. Raises an error if the buffer does not currently
// hold a CpuBuffer; the returned shared pointer itself may still be null.
const CpuBuffer &Buffer::getCpuData() {
  ERROR_ON(!std::holds_alternative<CpuBuffer>(_store));
  return std::get<CpuBuffer>(_store);
}
// Const overload: same checks and result as the non-const version.
const CpuBuffer &Buffer::getCpuData() const {
  ERROR_ON(!std::holds_alternative<CpuBuffer>(_store));
  return std::get<CpuBuffer>(_store);
}
// True once the buffer holds something other than std::monostate. This does
// not check whether a held CpuBuffer pointer is non-null.
bool Buffer::hasData() const {
  return !std::holds_alternative<std::monostate>(_store);
}

// Creates a new StaticIpuSession, returned through the IIpuSession interface.
std::shared_ptr<IIpuSession> createStaticSession() {
  return std::make_shared<StaticIpuSession>();
}
} // namespace poptorch_ir


================================================
FILE: poptorch_compiler/pytorch_bridge/include/pytorch_bridge/CompilerOptions.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#ifndef POPTORCH_COMPILER_PYTORCH_BRIDGE_COMPILER_OPTIONS_HPP_
#define POPTORCH_COMPILER_PYTORCH_BRIDGE_COMPILER_OPTIONS_HPP_

#include <vector>

namespace poptorch {

// Options passed to the compiler, grouped by the component they configure.
struct CompilerOptions {
  // Options for the dispatcher.
  struct Dispatcher {
    // NOTE: std::string-s are avoided here due to ABI issues
    // Each entry is one string stored as a vector of chars.
    // NOTE(review): presumably source locations matching these entries are
    // excluded from reporting - confirm against the code reading this field.
    std::vector<std::vector<char>> source_location_excludes;
    // Enabled by default.
    bool check_added_ops = true;
  };
  Dispatcher dispatcher;
};

} // namespace poptorch

#endif


================================================
FILE: poptorch_compiler/pytorch_bridge/include/pytorch_bridge/CompilerTypes.hpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#ifndef POPTORCH_COMPILER_PYTORCH_BRIDGE_COMPILER_TYPES_HPP_
#define POPTORCH_COMPILER_PYTORCH_BRIDGE_COMPILER_TYPES_HPP_

#include <cstdint>
#include <functional>
#include <limits>
#include <memory>
#include <numeric>
#include <string_view>
#include <vector>

namespace poptorch_ir {

// Host blob of memory containing data to transfer to the IPU.
using CpuBuffer = std::shared_ptr<std::vector<char>>;

// A token representing an SSA value on our side. PyTorch records its
// tensors->TensorId and we record TensorId->mlir::Value. This stops either side
// from depending directly on each other's internal representation.
using TensorId = std::uint32_t;

// This is identical except that it is known to be valid for it to be none_id
using OptionalTensorId = std::uint32_t;

// So we can signal that a tensor was invalid (just so that unimplemented
// functions can return something right now).
constexpr TensorId tensor_error_id = std::numeric_limits<TensorId>::max();

// The tensor is none (e.g. optional parameter/return) and this is not an error
constexpr TensorId none_id = std::numeric_limits<TensorId>::max() - 1;

// How to calculate which floating-point outputs require gradients (other
// types will always have this set to false).
enum class RequiresGradType {
  OR_INPUTS, // OR together all the input tensor requires_grad values
  FALSE      // always false
};

// The tensors produced for a single ODS result of an op.
struct ODSTensorResult {
  // Ids of the produced tensors.
  std::vector<TensorId> tensor_ids;
  // How requires_grad is derived.
  // NOTE(review): presumably one entry per element of tensor_ids - confirm.
  std::vector<RequiresGradType> requires_grad_types;
};

// When returning an MLIR op, each return could be compulsory, optional or
// variadic tensor under the MLIR Operation Definition Specification (ODS).
// Using a vector for each return allows each return to be optional or variadic.
using ODSTensorResults = std::vector<ODSTensorResult>;

// Element type of a tensor, stored in a single byte. NONE and UNDEFINED are
// placeholders without an element size (dataSize() in IpuSession.cpp rejects
// them).
enum class Type : std::uint8_t {
  BOOL,
  CHAR,
  UNSIGNED_CHAR,
  SHORT,
  UNSIGNED_SHORT,
  INT,
  UNSIGNED_INT,
  HALF,
  FLOAT,
  BFLOAT16,
  NONE,
  UNDEFINED,
};

// Shape and element type of a tensor.
struct TensorType {
  std::vector<int64_t> shape;
  Type element_type;

  // Product of all the dimensions; 1 when `shape` is empty.
  std::int64_t getNumElements() const {
    std::int64_t num_elements = 1;
    for (const std::int64_t dim : shape) {
      num_elements *= dim;
    }
    return num_elements;
  }
};

// Description of a stream: its name, the host buffer attached to it and the
// type of the tensor it carries.
struct StreamInfo {
  // Stream name stored as raw characters rather than a std::string.
  std::vector<char> name;
  CpuBuffer buff;

  TensorType type;

  // Non-owning view over `name`; only valid while `name` is neither modified
  // nor destroyed.
  std::string_view nameStringView() const {
    return std::string_view(name.data(), name.size());
  }
};

} // namespace poptorch_ir

#endif


================================================
FILE: poptorch_compiler/pytorch_bridge/include/pytorch_bridge/DebugInfo.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#ifndef POPTORCH_COMPILER_PYTORCH_BRIDGE_DEBUG_INFO_HPP_
#define POPTORCH_COMPILER_PYTORCH_BRIDGE_DEBUG_INFO_HPP_

#include <memory>
#include <vector>

namespace poptorch_ir {

// Textual dumps of a graph kept for debugging, stored as char vectors.
struct GraphDebugInfo {
  // Note these are shared with the tensor details
  std::shared_ptr<std::vector<char>> initial_graph;
  std::shared_ptr<std::vector<char>> cached_graph;
};

// Debug information attached to a tensor.
struct TensorDebugInfo {
  GraphDebugInfo debug_info;
  // NOTE(review): presumably the index of this tensor among the outputs of
  // the graph described by debug_info - confirm.
  std::size_t output_idx = 0;
};

} // namespace poptorch_ir

#endif // POPTORCH_COMPILER_PYTORCH_BRIDGE_DEBUG_INFO_HPP_


================================================
FILE: poptorch_compiler/pytorch_bridge/include/pytorch_bridge/IpuSession.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#ifndef POPTORCH_COMPILER_PYTORCH_BRIDGE_IPU_SESSION_HPP_
#define POPTORCH_COMPILER_PYTORCH_BRIDGE_IPU_SESSION_HPP_

#include <iterator>
#include <memory>
#include <variant>
#include <vector>

#include "pytorch_bridge/CompilerTypes.hpp"
#include "pytorch_bridge/DebugInfo.hpp"
#include <poptorch_logging/Error.hpp>

namespace poptorch_ir {

// Ids of the tensors going into and coming out of a function.
struct FunctionIO {
  std::vector<TensorId> inputs;
  std::vector<TensorId> outputs;
};

// Storage handle which is either empty (std::monostate) or holds a CpuBuffer.
class Buffer {
  // TODO(T70841): since Buffer is stored as a shared pointer it should be
  // possible to at least stop CpuBuffer being a shared pointer.
  std::variant<std::monostate, CpuBuffer> _store = std::monostate{};

public:
  // Creates an empty buffer: hasData() returns false.
  Buffer() = default;
  explicit Buffer(CpuBuffer buf) noexcept;

  Buffer &operator=(CpuBuffer buf) noexcept;

  // Both overloads raise an error when the buffer does not hold a CpuBuffer.
  const CpuBuffer &getCpuData();
  const CpuBuffer &getCpuData() const;

  // Whether the buffer holds anything other than std::monostate.
  bool hasData() const;
};

// Interface for allocating buffers and copying data to, from and between
// them. createStaticSession() returns the implementation defined in
// IpuSession.cpp.
class IIpuSession {
public:
  virtual ~IIpuSession() = default;

  // Allocates a buffer able to hold a tensor of the given type.
  virtual Buffer allocate(const TensorType &type) = 0;
  // Copies host data at `cpu_src` into `ipu_dest`.
  virtual void copyDataFromCpuSource(Buffer &ipu_dest, const char *cpu_src) = 0;
  // Copies the content of `ipu_src` to host memory at `cpu_dest`.
  virtual void copyDataToCpu(char *cpu_dest, Buffer &ipu_src) = 0;
  // Copies the content of `src` into `dest`.
  virtual void copyDataOnDevice(Buffer &dest, const Buffer &src) = 0;
};

// Creates the session implemented in IpuSession.cpp.
std::shared_ptr<IIpuSession> createStaticSession();

} // namespace poptorch_ir

#endif // POPTORCH_COMPILER_PYTORCH_BRIDGE_IPU_SESSION_HPP_


================================================
FILE: poptorch_err/CMakeLists.txt
================================================
# Builds poptorch_err, a static library built from
# source/ExceptionHandling.cpp, and poptorch_exception_info, a header-only
# target exposing the exception_info/ directory.
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(poptorch_err)

set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Header-only target: it only carries the exception_info include directory.
add_library(poptorch_exception_info INTERFACE)

target_include_directories(poptorch_exception_info
                           INTERFACE
                           exception_info)

add_library(poptorch_err STATIC
  "source/ExceptionHandling.cpp")

target_include_directories(poptorch_err SYSTEM PUBLIC
                           $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
                           $<INSTALL_INTERFACE:include>
                                        )
# Public headers: everything matching *.hpp* under include/ and exception_info/.
file(GLOB_RECURSE poptorch_err_public_headers "${CMAKE_CURRENT_SOURCE_DIR}/include/*.hpp*" "exception_info/*.hpp*")

set_target_properties(poptorch_err PROPERTIES
  PUBLIC_HEADER "${poptorch_err_public_headers}")
target_link_libraries(poptorch_err
                      PUBLIC
                      torch
                      poptorch_exception_info
                      PRIVATE
                      popart_compiler
                      poptorch_logging)

# NOTE(review): poptorch_err is a STATIC library but only a LIBRARY
# destination is given below; static libraries are installed as ARCHIVE
# artifacts, so this presumably relies on CMake's default archive
# destination - confirm.
install(TARGETS poptorch_err
  LIBRARY
    DESTINATION ${CMAKE_INSTALL_LIBDIR}
  PUBLIC_HEADER
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/poptorch_err
  )


================================================
FILE: poptorch_err/exception_info/poptorch_err/ExceptionInfo.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#pragma once

#include <cstdint>

namespace poptorch {

// Classification of an error: a runtime error that can be recovered from, a
// runtime error that cannot, or anything else.
enum class ErrorCategory { RuntimeRecoverable, RuntimeUnrecoverable, Other };

/*
 * A subclass of this class is used to pass exception information across the ABI
 * boundary between popart_compiler and the pybind11 interface. It has to use
 * POD data types to cross the boundary successfully. We then unpack it into a
 * PoptorchError on the pybind11 side and rethrow it.
 */
// Abstract, POD-only view of an exception. Every accessor returns either a C
// string or a plain integer / enum so that the object can be read on the other
// side of the ABI boundary.
class ExceptionInfo {
public:
  virtual ~ExceptionInfo();
  // Error message.
  virtual const char *what() const = 0;
  // Name of the error type.
  virtual const char *type() const = 0;
  // Number of stack entries available through stack().
  virtual int64_t stackDepth() const = 0;
  // Stack entry at `level`.
  virtual const char *stack(int64_t level) const = 0;
  // File name and line associated with the error.
  virtual const char *filename() const = 0;
  virtual uint64_t line() const = 0;
  // Suggested recovery action.
  virtual const char *recoveryAction() const = 0;
  virtual ErrorCategory category() const = 0;
};

} // namespace poptorch


================================================
FILE: poptorch_err/include/poptorch_err/ExceptionHandling.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#pragma once
#include <exception>
#include <functional>
#include <string>
#include <vector>

#include "poptorch_err/ExceptionInfo.hpp"

namespace poptorch {
/*
 * The function convertToPoptorchExceptionOrRethrow() processes all the
 * exception types we're interested in, extracts detail, and marshals them as
 * instances of PoptorchErrorInfo. The exceptions we're not interested in are
 * re-thrown as-is.
 */
struct PoptorchErrorInfo {
public:
  // Classification of the error; ErrorCategory::Other unless the original
  // exception provided one.
  ErrorCategory category;
  // Source file the error is attributed to: taken from the exception when it
  // carries one, otherwise the catch site passed to the conversion function.
  std::string filename;
  // Line number matching `filename`.
  uint64_t line;
  // Name of the kind of error (for example "std::exception").
  std::string type;
  // Recovery action reported by the original exception, if any.
  std::string recovery_action;
  // Error message; may be a pointer to the error log file when the original
  // message spans too many lines.
  std::string message;
  // Fully formatted message: location, type and message, followed by the
  // recovery action and the logging context when available.
  std::string long_message;
  // Stack entries reported by the original exception, if any.
  std::vector<std::string> stack;
  // Logging context that was active when the error was converted.
  std::string location;
};

/*
 * Convert the exception referenced by `e` into a PoptorchErrorInfo.
 *
 * e          - the exception to convert.
 * catch_all  - when true, any std::exception is converted; when false,
 *              std::exception types that are not handled specifically are
 *              re-thrown unchanged.
 * catch_file - file name of the catch site, used as the error location when
 *              the exception does not carry its own.
 * catch_line - line number of the catch site, used like catch_file.
 */
PoptorchErrorInfo
convertToPoptorchExceptionOrRethrow(const std::exception_ptr &e, bool catch_all,
                                    const std::string &catch_file,
                                    uint64_t catch_line);

} // namespace poptorch

/*
 * This template wraps a function in our try..catch block. It's done this way
 * so it's less likely that someone will add an entry point without wrapping
 * it in a try..catch block - the path of least resistance is to copy-paste
 * the pybind11 def() line which will include the PTC() macro.
 * This doesn't work for class member functions wrapped by pybind11, which have
 * to be manually wrapped in a try-catch block.
 */
// Primary template, intentionally left undefined: only the specialisation for
// plain function pointers below can be instantiated.
//   g         - handler invoked with the converted error information.
//   catch_all - forwarded to convertToPoptorchExceptionOrRethrow().
//   F, f      - type and value of the function being wrapped.
template <void (*g)(const poptorch::PoptorchErrorInfo &), bool catch_all,
          class F, F f>
struct PoptorchCatchWrapperImpl;
// Specialisation for a free function `R f(Args...)`.
template <void (*g)(const poptorch::PoptorchErrorInfo &), bool catch_all,
          class R, class... Args, R (*f)(Args...)>
struct PoptorchCatchWrapperImpl<g, catch_all, R (*)(Args...), f> {
  // Call f with the given arguments; any exception it throws is converted to
  // a PoptorchErrorInfo and handed to g (or re-thrown by the conversion).
  static R wrap(Args... args) {
    try {
      return f(args...);
    } catch (...) {
      // TODO(T71675): find a way to pass catch_file / catch_line
      // NOTE(review): nothing is returned after g() - presumably g always
      // throws; if it ever returned normally with a non-void R, control would
      // reach the end of this function without a return value. Confirm.
      g(poptorch::convertToPoptorchExceptionOrRethrow(std::current_exception(),
                                                      catch_all, "unknown", 0));
    }
  }
};


================================================
FILE: poptorch_err/source/ExceptionHandling.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include "poptorch_err/ExceptionHandling.hpp"

#include <fstream>
#include <memory>

#include "popart_compiler/Compiler.hpp"

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#define ERROR_LOG "poptorch_error.log"

namespace poptorch {

// Out-of-line definition of the virtual destructor declared in
// poptorch_err/ExceptionInfo.hpp.
ExceptionInfo::~ExceptionInfo() {}

// Threshold above which an error message is redirected to ERROR_LOG.
// NOTE: despite its name, this value is compared against the number of
// newline characters in the message, not against the length of a line.
static const int max_log_line_length = 80;

/*
 * Convert the exception referenced by eptr into a PoptorchErrorInfo.
 *
 * The exception is re-thrown locally and inspected:
 *  - ExceptionInfo: every detail (location, category, type, message, stack,
 *    recovery action) is copied from the exception.
 *  - poptorch::logging::Error: location and message come from the exception.
 *  - std::out_of_range: always converted, whatever the value of catch_all.
 *  - any other std::exception: converted when catch_all is true, re-thrown
 *    unchanged otherwise.
 * Exceptions matching none of the handlers above propagate to the caller.
 *
 * catch_file / catch_line identify the catch site and are used as the error
 * location when the exception does not provide its own.
 *
 * Messages containing more than max_log_line_length newlines are written to
 * ERROR_LOG and replaced by a pointer to that file. If the file cannot be
 * written the original message is kept so that no information is lost.
 *
 * The logging context is appended to the long message and then reset.
 */
PoptorchErrorInfo convertToPoptorchExceptionOrRethrow(
    const std::exception_ptr &eptr, bool catch_all,
    const std::string &catch_file, uint64_t catch_line) {
  ErrorCategory category = ErrorCategory::Other;
  // Default to the catch site; the handlers below overwrite the location when
  // the exception carries its own.
  std::string filename = catch_file;
  uint64_t line = catch_line;
  std::string type;
  std::string recovery_action;
  std::string message;
  std::vector<std::string> stack;
  std::string location;

  try {
    // NOTE(review): presumably re-throws PopART / Poplar exceptions as
    // ExceptionInfo objects and returns normally for anything else - see
    // popart_compiler/Compiler.hpp.
    popart_compiler::rethrowPopartOrPoplarException(eptr, catch_file.c_str(),
                                                    catch_line);
    std::rethrow_exception(eptr);
  } catch (const ExceptionInfo &ei) {
    filename = ei.filename();
    line = ei.line();
    category = ei.category();
    type = ei.type();
    message = ei.what();
    // stackDepth() returns an int64_t: use a matching index type.
    for (int64_t i = 0; i < ei.stackDepth(); i++) {
      stack.emplace_back(ei.stack(i));
    }
    recovery_action = ei.recoveryAction();
  } catch (const poptorch::logging::Error &ex) {
    // what() is only traced; message() is what gets reported.
    logging::trace("Full error: {}", ex.what());
    type = "poptorch_cpp_error";
    filename = ex.file();
    line = ex.line();
    message = ex.message();
  } catch (const std::out_of_range &ex) {
    message = ex.what();
    type = "std::out_of_range";
  } catch (const std::exception &ex) {
    if (!catch_all) {
      throw;
    }
    message = ex.what();
    type = "std::exception";
  }

  if (std::count(std::begin(message), std::end(message), '\n') >
      max_log_line_length) {
    bool written = false;
    std::ofstream log;
    log.open(ERROR_LOG);
    if (log.is_open()) {
      log << message;
      log.close();
      written = !log.fail();
    }
    // Only point the user at the log file if it really contains the message.
    if (written) {
      message = "See " ERROR_LOG " for details";
    }
  }

  std::stringstream swhat;
  swhat << "In " << filename << ":" << line << ": '" << type
        << "': " << message;
  if (category == ErrorCategory::RuntimeRecoverable) {
    swhat << "\nRecovery action required: " << recovery_action;
  }
  auto ctx = poptorch::logging::LogContext::context();
  if (ctx) {
    location = ctx.get();
    if (!location.empty()) {
      swhat << "\nError raised in:\n" << location;
    }
  }
  PoptorchErrorInfo pe;
  pe.long_message = swhat.str();
  pe.category = category;
  pe.filename = filename;
  pe.line = line;
  pe.type = type;
  pe.recovery_action = recovery_action;
  pe.message = message;
  pe.stack = stack;
  pe.location = location;

  poptorch::logging::LogContext::resetContext();

  return pe;
}

} // namespace poptorch


================================================
FILE: poptorch_geometric/CMakeLists.txt
================================================
# Top-level build script of the poptorch-geometric project: installs the
# Python sources (see the python/ subdirectory) and defines the targets used
# to generate the distributable packages.
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(poptorch-geometric)

# Directory, under the install prefix, which receives the Python package.
set(INSTALL_POPPYG_PYDIR ${CMAKE_INSTALL_PREFIX}/poptorch_geometric)

add_subdirectory(python)

# Runs generate_poppyg_package.py in "bdist_wheel" mode from the install
# prefix, pointing it at INSTALL_POPPYG_PYDIR and writing to <prefix>/dist.
add_custom_target(poptorch_geometric_wheel
  WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}
  COMMAND python3 ${PROJECT_SOURCE_DIR}/../scripts/generate_poppyg_package.py bdist_wheel --output-dir ${CMAKE_INSTALL_PREFIX}/dist --python-dir ${INSTALL_POPPYG_PYDIR}
)

# Same as above, in "sdist" mode.
add_custom_target(poptorch_geometric_sdist
  WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}
  COMMAND python3 ${PROJECT_SOURCE_DIR}/../scripts/generate_poppyg_package.py sdist --output-dir ${CMAKE_INSTALL_PREFIX}/dist --python-dir ${INSTALL_POPPYG_PYDIR}
)

# The README is installed at the root of the install prefix.
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/README.md
        DESTINATION .)


================================================
FILE: poptorch_geometric/License.txt
================================================
The MIT License (MIT)

Copyright (c) 2023 Graphcore Limited

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


================================================
FILE: poptorch_geometric/MANIFEST.in
================================================
include *.py
include *.toml
include License.txt


================================================
FILE: poptorch_geometric/README.md
================================================
# poptorch-geometric
A set of extensions for PyTorch Geometric, enabling GNN models to be trained, evaluated and used on the Graphcore IPU.

:warning: This project is under active development. All APIs should be considered volatile and any feedback is welcome.


================================================
FILE: poptorch_geometric/config.buildenv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.

# Register the pip requirements listed in requirements.txt.
# NOTE(review): `installers` and `PipRequirements` are neither defined nor
# imported in this file; presumably they are injected by the build environment
# tool that executes it - confirm.
installers.add(PipRequirements("requirements.txt"))


================================================
FILE: poptorch_geometric/poptorch_geometric_third_party_licenses.txt
================================================
The PopTorch Geometric package includes code from the following third party projects:

PyTorch Geometric
-----------------
Copyright (c) 2023 PyG Team <team@pyg.org>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

PyTorch Cluster
---------------
Copyright (c) 2020 Matthias Fey <matthias.fey@tu-dortmund.de>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.


================================================
FILE: poptorch_geometric/pyproject.toml
================================================
[build-system]
requires = [
  "setuptools>=42",
  "wheel"
]
build-backend = "setuptools.build_meta"


================================================
FILE: poptorch_geometric/python/CMakeLists.txt
================================================
# Install rules for the poptorch_geometric Python package.
include(GNUInstallDirs)
file(GLOB poppyg_python_files "${CMAKE_CURRENT_SOURCE_DIR}/*.py")

# __init__.py needs to be edited by set_version.py so don't copy it over.
list(REMOVE_ITEM poppyg_python_files "${CMAKE_CURRENT_SOURCE_DIR}/__init__.py")

# At install time, run set_version.py with the source __init__.py as input and
# a path in the build directory as output; the install is aborted if the
# script fails. `\${OUTPUT}` is escaped so that it is expanded when the
# install script runs rather than at configure time.
install(CODE
   " execute_process(
        COMMAND python3 ${PROJECT_SOURCE_DIR}/../scripts/set_version.py --input-file ${CMAKE_CURRENT_SOURCE_DIR}/__init__.py ${CMAKE_CURRENT_BINARY_DIR}/__init__.py
        WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
        RESULT_VARIABLE RETVAL OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE OUTPUT)
    if(RETVAL AND NOT RETVAL EQUAL 0)
      message(FATAL_ERROR \"set_version.py FAILED: \${OUTPUT}\")
    endif()")

# Install the generated __init__.py, the remaining top-level modules, the
# py.typed marker and the ops sub-package.
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/__init__.py DESTINATION "${INSTALL_POPPYG_PYDIR}")
install(FILES ${poppyg_python_files} py.typed DESTINATION "${INSTALL_POPPYG_PYDIR}")
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/ops DESTINATION "${INSTALL_POPPYG_PYDIR}")

# At install time, run generate_poppyg_package.py in "install" mode; the
# install is aborted if the script fails.
install(CODE
   " execute_process(
        COMMAND python3 ${PROJECT_SOURCE_DIR}/../scripts/generate_poppyg_package.py install --output-dir ${CMAKE_INSTALL_PREFIX} --python-dir ${INSTALL_POPPYG_PYDIR}
        WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
        RESULT_VARIABLE RETVAL OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE OUTPUT)
    if(RETVAL AND NOT RETVAL EQUAL 0)
      message(FATAL_ERROR \"generate_poppyg_package.py FAILED: \${OUTPUT}\")
    endif()")


================================================
FILE: poptorch_geometric/python/__init__.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import importlib

from .collate import make_exclude_keys
from .common import call_once
from .dataloader import (FixedSizeDataLoader, FixedSizeStrategy,
                         OverSizeStrategy)
from .fixed_size_options import FixedSizeOptions
from .types import PyGArgsParser, registerCustomArgParsers
from .utils import TrainingStepper, set_aggregation_dim_size
from .override import _TorchGeometricOpsSubstitutionManager

__version__ = "@VERSION@-@SNAPSHOT@"

__all__ = [
    '__version__', 'FixedSizeDataLoader', 'FixedSizeOptions',
    'FixedSizeStrategy', 'set_aggregation_dim_size', 'TrainingStepper',
    'make_exclude_keys', 'OverSizeStrategy', 'PyGArgsParser'
]


@call_once
def registerOverrideManager():
    """Register the PyG ops substitution manager with PopTorch.

    Looks up the ``poptorch._poplar_executor`` module and, when it can be
    found and loaded, registers
    :py:class:`_TorchGeometricOpsSubstitutionManager` with its
    ``_OverwriteContextManager``. Nothing happens if the module spec or its
    loader is unavailable.
    """
    # ``import importlib`` on its own does not guarantee that the
    # ``importlib.util`` submodule is loaded, so import it explicitly.
    import importlib.util  # pylint: disable=import-outside-toplevel

    module_name = "poptorch._poplar_executor"
    spec = importlib.util.find_spec(module_name)
    if spec is None or spec.loader is None:
        return

    # ``Loader.load_module()`` is deprecated and executes the module again
    # when it has already been imported. ``import_module()`` returns the
    # instance cached in ``sys.modules`` instead, so the manager is
    # registered on the class objects the rest of PopTorch already uses.
    poplar_executor = importlib.import_module(module_name)
    poplar_executor._OverwriteContextManager.registerSubsitutionManager(  # pylint: disable=protected-access
        _TorchGeometricOpsSubstitutionManager)


registerOverrideManager()
registerCustomArgParsers()


================================================
FILE: poptorch_geometric/python/cluster_loader.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from __future__ import annotations  # noqa: F407

from typing import Optional

from torch_geometric.loader import ClusterData

from poptorch_geometric.collate import CombinedBatchingCollater
from poptorch_geometric.fixed_size_options import FixedSizeOptions
from poptorch_geometric.pyg_cluster_loader import \
    FixedSizeClusterLoader as PyGFixedSizeClusterLoader
from poptorch_geometric.pyg_dataloader import OverSizeStrategy
import poptorch


class FixedSizeClusterLoader(PyGFixedSizeClusterLoader, poptorch.DataLoader):
    r"""Data loader producing fixed-size mini-batches of clusters.

    Data objects coming from a
    :py:class:`torch_geometric.loader.ClusterData` are merged into
    mini-batches of clusters; node and edge features are padded so that the
    tensors of every batch have constant shapes.

    Args:
        cluster_data (ClusterData): The cluster from which to load the data.
        fixed_size_options (FixedSizeOptions): A
            :py:class:`poptorch_geometric.fixed_size_options.FixedSizeOptions`
            object holding the maximum number of nodes and edges, together
            with the other options needed to pad the mini-batches produced by
            the data loader to a fixed size.
        batch_size (int, optional): The number of nodes per mini-batch to
            load.
            (default: :obj:`1`)
        over_size_strategy (OverSizeStrategy, optional): What to do when a
            sample cannot fit in the fixed-size mini-batch. By default, nodes
            and edges are removed from the mini-batch until it reaches the
            specified fixed size.
            (default: `poptorch_geometric.OverSizeStrategy.TrimNodesAndEdges`)
        add_pad_masks (bool, optional): If :obj:`True`, mask objects are
            attached to the mini-batch result. They represent three levels of
            padding:

            - :obj:`graphs_mask` - graph level mask
            - :obj:`nodes_mask`  - node level mask
            - :obj:`edges_mask`  - edge level mask

            A mask marks the real elements of the mini-batch with
            :obj:`True` and the elements added as padding with :obj:`False`.
            (default: :obj:`True`)
        options (poptorch.Options, optional): The additional PopTorch options
            to be passed to :py:class:`poptorch.DataLoader`. Default options
            are created when :obj:`None` is given.
            (default: :obj:`None`)
        **kwargs (optional): The additional arguments of
            :py:class:`poptorch.DataLoader`.
    """

    def __init__(
            self,
            cluster_data: ClusterData,
            fixed_size_options: FixedSizeOptions,
            batch_size: int = 1,
            over_size_strategy: OverSizeStrategy = OverSizeStrategy.
            TrimNodesAndEdges,
            add_pad_masks: Optional[bool] = True,
            options: Optional[poptorch.Options] = None,
            **kwargs,
    ):
        # Stored before the base initialisers run: _create_collater() reads
        # it. NOTE(review): presumably the base class creates the collater
        # from within its __init__ - confirm.
        self.batch_size = batch_size

        # Fall back to the IPU default options when none are provided.
        ipu_options = poptorch.Options() if options is None else options

        super().__init__(cluster_data=cluster_data,
                         fixed_size_options=fixed_size_options,
                         batch_size=batch_size,
                         over_size_strategy=over_size_strategy,
                         add_pad_masks=add_pad_masks,
                         options=ipu_options,
                         **kwargs)

    def _create_collater(self, **collater_args):
        """Wrap the collater of the parent class in a
        :py:class:`CombinedBatchingCollater` using ``self.batch_size`` as the
        mini-batch size."""
        base_collater = super()._create_collater(**collater_args)
        return CombinedBatchingCollater(collater=base_collater,
                                        mini_batch_size=self.batch_size)


================================================
FILE: poptorch_geometric/python/collate.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.

from enum import Enum
from typing import Any, Dict, Generator, List, Optional, Tuple, Union
from functools import singledispatch
try:
    from functools import singledispatchmethod
except ImportError:
    from singledispatchmethod import singledispatchmethod
from itertools import chain

import torch
from torch_geometric.data import Batch, Data, HeteroData
from torch_geometric.data.data import BaseData
from torch_geometric.typing import EdgeType, NodeType
from torch_geometric.transforms import Pad

from poptorch_geometric.fixed_size_options import FixedSizeOptions
from poptorch_geometric.pyg_collate import Collater
from poptorch_geometric.common import DataBatch, HeteroDataBatch

from poptorch._utils import combine_batch_tensors_gen

from . import types

__all__ = ['FixedSizeCollater', 'CombinedBatchingCollater']


def make_exclude_keys(include_keys: Union[List[str], Tuple[str, ...]],
                      data: BaseData) -> Tuple[str, ...]:
    """Return the keys of ``data`` that are not listed in ``include_keys``.

    The result is built from a set, so the order of the returned keys is
    unspecified.
    """
    available = set(data.keys)
    wanted = set(include_keys)
    return tuple(available - wanted)


def _divide_evenly_formula(amount: int, pieces: int) -> List[int]:
    """Split ``amount`` into ``pieces`` integer shares differing by at most 1.

    The remainder of the division is handed out one unit at a time to the
    first shares, so larger shares always come first.
    """
    share, leftover = divmod(amount, pieces)
    return [share + 1] * leftover + [share] * (pieces - leftover)


@singledispatch
def _divide_evenly(data, num_pad_graphs, num_pad_nodes, num_pad_edges):  # pylint: disable=unused-argument
    """Spread the padding nodes and edges over the padding graphs.

    Fallback used for unsupported data types: always raises ``ValueError``.
    """
    message = f'Unsupported data type: {type(data)}'
    raise ValueError(message)


@_divide_evenly.register(Data)
def _(_, num_pad_graphs: int, num_pad_nodes: int,
      num_pad_edges: int) -> Tuple[List[int], List[int]]:
    """Homogeneous graphs: return the per-graph number of padding nodes and
    the per-graph number of padding edges."""
    nodes_per_graph = _divide_evenly_formula(num_pad_nodes, num_pad_graphs)
    edges_per_graph = _divide_evenly_formula(num_pad_edges, num_pad_graphs)
    return nodes_per_graph, edges_per_graph


@_divide_evenly.register(HeteroData)
def _(_, num_pad_graphs: int, num_pad_nodes: Dict[NodeType, int],
      num_pad_edges: Dict[EdgeType, int]
      ) -> Tuple[List[Dict[NodeType, int]], List[Dict[EdgeType, int]]]:
    """Heterogeneous graphs: return, for nodes and for edges, one dictionary
    per padding graph mapping each type to its share of the padding."""

    def spread(totals):
        # One dictionary per padding graph, filled type by type.
        per_graph = [{} for _ in range(num_pad_graphs)]
        for elem_type, total in totals.items():
            shares = _divide_evenly_formula(total, num_pad_graphs)
            for graph_pads, share in zip(per_graph, shares):
                graph_pads[elem_type] = share
        return per_graph

    nodes_per_graph = spread(num_pad_nodes)
    edges_per_graph = spread(num_pad_edges)
    return nodes_per_graph, edges_per_graph


@singledispatch
def _generate_data_to_pad(data_to_pad_dict):
    """Build a data object from ``data_to_pad_dict``.

    Fallback used for unsupported argument types: always raises
    ``ValueError``.
    """
    message = f'Unsupported data type: {type(data_to_pad_dict)}'
    raise ValueError(message)


# NOTE(review): the overloads below are selected on the runtime type of the
# argument (Data / HeteroData) although it is annotated as a dict - confirm
# against the callers.
@_generate_data_to_pad.register(Data)
def _(data_to_pad_dict: dict) -> Data:
    """Create a ``Data`` object through ``Data.from_dict``."""
    padded = Data.from_dict(data_to_pad_dict)
    return padded


@_generate_data_to_pad.register(HeteroData)
def _(data_to_pad_dict: dict) -> HeteroData:
    """Create a ``HeteroData`` object through its constructor."""
    padded = HeteroData(data_to_pad_dict)
    return padded


def _reset_dim(shape: torch.Size, key: str = None) -> List[int]:
    """Return ``shape`` as a list with one of its dimensions set to 0.

    Shapes with at most one dimension become ``[0]``. Otherwise dimension 1
    is zeroed when ``key`` is ``'edge_index'`` and dimension 0 in every other
    case.
    """
    dims = list(shape)
    if len(dims) <= 1:
        return [0]
    zeroed_axis = 1 if key == 'edge_index' else 0
    dims[zeroed_axis] = 0
    return dims


def _reset_attr(value: Any, key: str = None) -> Any:
    """Return the "empty" counterpart of ``value``.

    Tensors are replaced by a tensor of zeros with the same dtype and with
    one dimension set to 0 (the dimension is chosen from ``key``). Any other
    value is replaced by a default-constructed instance of its type.
    """
    if not isinstance(value, torch.Tensor):
        return type(value)()

    # NOTE: torch.zeros is required here - building the tensor directly
    # (through torch.tensor) with a 0 in the shape produces a tensor with
    # the wrong dimensions.
    empty_shape = _reset_dim(value.shape, key)
    return torch.zeros(empty_shape, dtype=value.dtype)


def _create_preserve_mask(num_elems: int, num_elems_to_trim: int,
                          slices: List[slice]) -> List[bool]:
    """Randomly choose which elements survive the trimming.

    Returns a boolean tensor of ``num_elems`` entries where :obj:`False`
    marks an element to remove. One randomly chosen element of every
    non-empty slice is protected, so a graph never loses all its elements;
    at most ``num_elems_to_trim`` of the remaining elements are removed.
    """
    # Protect one random element in each non-empty slice.
    removable = torch.ones(num_elems, dtype=torch.bool)
    for graph_slice in slices:
        if graph_slice.start >= graph_slice.stop:
            continue
        graph_view = removable[graph_slice]
        protected = torch.randint(high=len(graph_view), size=(1, ))
        graph_view[protected] = False
    candidates = torch.arange(0, num_elems)[removable]

    # Shuffle the candidates and drop the first ``num_elems_to_trim``.
    shuffled = candidates[torch.randperm(len(candidates))]
    to_remove = shuffled[:num_elems_to_trim].type(torch.long)
    keep = torch.ones(num_elems, dtype=torch.bool)
    keep[to_remove] = False

    return keep


def data_slice_gen(num_list: List[int]) -> Generator[slice, None, None]:
    """Yield consecutive slices whose lengths are given by ``num_list``.

    The first slice starts at 0 and each following slice starts where the
    previous one stopped.
    """
    begin = 0
    for count in num_list:
        stop = begin + count
        yield slice(begin, stop)
        begin = stop


def create_slices_and_preserve_mask(
        max_num: int, num_list: List[int]
) -> Tuple[Generator[slice, None, None], List[bool]]:
    """Prepare the per-graph slices and the mask of elements to keep.

    ``num_list`` holds the number of elements of each graph and ``max_num``
    the maximum total allowed. When the total is strictly below ``max_num``
    there is nothing to prune and ``(None, None)`` is returned. Otherwise the
    result is the list of slices delimiting each graph and a mask, created by
    ``_create_preserve_mask``, in which the randomly chosen elements to
    remove are marked.
    """
    total = sum(num_list)
    if total < max_num:
        # There is nothing to prune.
        return None, None

    per_graph_slices = list(data_slice_gen(num_list))
    surplus = total - max_num

    # Prepare the mask of randomly chosen elements to remove.
    keep_mask = _create_preserve_mask(total, surplus, per_graph_slices)
    return per_graph_slices, keep_mask


@singledispatch
def _any_negative(value: int) -> bool:
    """Tell whether ``value`` is below zero (overload for plain values)."""
    is_negative = value < 0
    return is_negative


@_any_negative.register(dict)
def _(value: dict) -> bool:
    """Tell whether at least one value of the dictionary is below zero."""
    for item in value.values():
        if item < 0:
            return True
    return False


@singledispatch
def _any_positive(value: int) -> bool:
    """Tell whether ``value`` is above zero (overload for plain values)."""
    is_positive = value > 0
    return is_positive


@_any_positive.register(dict)
def _(value: dict) -> bool:
    """Tell whether at least one value of the dictionary is above zero."""
    for item in value.values():
        if item > 0:
            return True
    return False


@singledispatch
def _check_if_over_size(num_pad: int, num_total: int, type_str: str,
                        oversize_error: str):
    """Raise ``RuntimeError`` when the amount of padding is negative.

    ``oversize_error`` is a format string receiving the ``type_str``,
    ``trim_fn`` and ``type_value`` fields; ``num_total`` is reported as
    ``type_value``.
    """
    if not _any_negative(num_pad):
        return
    message = oversize_error.format(type_str=type_str,
                                    trim_fn=f"trim_{type_str}",
                                    type_value=num_total)
    raise RuntimeError(message)


@_check_if_over_size.register(dict)
def _(num_pad: dict, num_total: dict, type_str: str, oversize_error: str):
    """Dictionary overload: raise ``RuntimeError`` for the first entry of
    ``num_pad`` holding a negative value, reporting the matching entry of
    ``num_total``."""
    for elem_type, pad in num_pad.items():
        if not pad < 0:
            continue
        message = oversize_error.format(type_str=f"{elem_type} {type_str}",
                                        trim_fn=f"trim_{type_str}",
                                        type_value=num_total[elem_type])
        raise RuntimeError(message)


class FixedSizeCollater(Collater):
    r"""Collates a batch of graphs as a
    :py:class:`torch_geometric.data.Batch` of fixed-size tensors.

    Calling an instance of this class adds additional graphs with the
    necessary number of nodes and edges to pad the batch so that tensors have
    the size corresponding to the maximum numbers of graphs, nodes and edges
    specified during initialisation.

    Calling an instance of this class can result in :py:exc:`RuntimeError` if
    the number of graphs (if set), nodes or edges in the batch is larger than
    the requested limits.

    Args:
        fixed_size_options (FixedSizeOptions, optional): A
            :py:class:`poptorch_geometric.fixed_size_options.FixedSizeOptions`
            object which holds the maximum number of nodes, edges and other
            options required to pad the batches, produced by collater,
            to a fixed size.
        add_masks_to_batch (bool, optional): If set to :obj:`True`, mask
            objects are attached to the batch result. They represent three
            levels of padding:

            - :obj:`graphs_mask` - graph level mask
            - :obj:`nodes_mask`  - node level mask
            - :obj:`edges_mask`  - edge level mask

            Mask objects indicate which elements in the batch are real
            (represented by :obj:`True` value) and which were added as padding
            (represented by :obj:`False` value). (default: :obj:`False`)
        trim_nodes (bool, optional): If set to :obj:`True`, randomly prune
            nodes from batch to fulfill the condition of :obj:`num_nodes`.
            (default: :obj:`False`)
        trim_edges (bool, optional): If set to :obj:`True`, randomly prune
            edges from batch to fulfill the condition of :obj:`num_edges`.
            (default: :obj:`False`)
        follow_batch (list or tuple, optional): Creates assignment batch
            vectors for each key in the list. (default: :obj:`None`)
        exclude_keys (list or tuple, optional): The keys to exclude
            from the graphs in the output batch. (default: :obj:`None`)
    """

    def __init__(
            self,
            fixed_size_options: FixedSizeOptions,
            add_masks_to_batch: Optional[bool] = False,
            trim_nodes: Optional[bool] = False,
            trim_edges: Optional[bool] = False,
            follow_batch: Optional[Union[List[str], Tuple[str, ...]]] = None,
            exclude_keys: Optional[Union[List[str], Tuple[str, ...]]] = None,
    ) -> None:
        """Store the padding options and trimming/masking switches.

        ``follow_batch`` and ``exclude_keys`` are forwarded unchanged to the
        parent collater.
        """
        super().__init__(follow_batch, exclude_keys)
        # Remains None until `__call__` inspects a batch that carries `y`.
        self.labels_type = None
        self.opts = fixed_size_options
        self.trim_nodes, self.trim_edges = trim_nodes, trim_edges
        self.add_masks_to_batch = add_masks_to_batch

    class LabelsType(Enum):
        # Granularity of the `y` labels, as detected by `__call__` from the
        # first dimension of `y` across the graphs of a batch.

        # Every graph has `y.shape[0] == 1`.
        GRAPH_LVL = 0
        # Every graph has `y.shape[0] == num_nodes`.
        NODE_LVL = 1

    def __call__(self, data_list: List[BaseData]) -> Batch:
        """Collate ``data_list`` into one batch padded to the fixed sizes.

        The graphs are optionally trimmed (``trim_nodes`` / ``trim_edges``),
        padding graphs are created to absorb the unused nodes and edges, and
        the result is handed to the parent collater. The list passed by the
        caller is never modified.

        Args:
            data_list: The graphs to batch; all of the same data type.

        Returns:
            The collated batch, with mask attributes attached when
            ``add_masks_to_batch`` is set.

        Raises:
            TypeError: If ``data_list`` is not a list.
            AssertionError: If the labels ``y`` are neither consistently
                node-level nor consistently graph-level.
            RuntimeError: If the batch holds more graphs, nodes or edges than
                the fixed sizes allow, or if nodes/edges need padding but no
                padding graph is available.
        """
        # Validate the container before anything indexes into it.
        if not isinstance(data_list, list):
            raise TypeError(f'Expected list, got {type(data_list).__name__}.')

        first = data_list[0]
        if not self.opts.is_hetero() and isinstance(first, HeteroData):
            self.opts.to_hetero(first.node_types, first.edge_types)

        if (isinstance(first, Data) and hasattr(first, 'y')
                and first.y is not None):
            y0_equal_num_nodes = all(data.y.shape[0] == data.num_nodes
                                     for data in data_list)
            y0_equal_ones = all(data.y.shape[0] == 1 for data in data_list)

            if y0_equal_num_nodes and not y0_equal_ones:
                self.labels_type = self.LabelsType.NODE_LVL
            elif y0_equal_ones and not y0_equal_num_nodes:
                self.labels_type = self.LabelsType.GRAPH_LVL
            else:
                # Raised explicitly (rather than `assert False`) so the
                # validation is not stripped when running with `python -O`.
                raise AssertionError(
                    "Incorrect input data. The size of the shape"
                    " of labels `y` must be either the number"
                    " of nodes or the number of graphs")

        num_real_graphs = len(data_list)
        num_pad_graphs = self.opts.num_graphs - num_real_graphs

        if num_pad_graphs < 0:
            raise RuntimeError(
                "The maximum number of graphs requested doesn't allocate"
                " enough room for all the graphs in the batch plus at least"
                " one extra graph required for padding the batch to a fixed"
                " size. The number of graphs received for batching is"
                f" {num_real_graphs + 1}, including at least one padding"
                " graph, but space for only"
                f" {num_pad_graphs + num_real_graphs} graphs has been"
                " requested.")
        num_all_graphs = num_real_graphs + num_pad_graphs

        num_real_nodes, num_pad_nodes, num_real_edges, num_pad_edges = \
            self._calc_pad_limits(data_list)
        if self.trim_nodes and _any_negative(num_pad_nodes):
            data_list = self._prune_nodes(data_list)
            # Pruning changes the real counts, so the limits are recomputed.
            num_real_nodes, num_pad_nodes, num_real_edges, num_pad_edges = \
                self._calc_pad_limits(data_list)

        if self.trim_edges and _any_negative(num_pad_edges):
            data_list = self._prune_edges(data_list)
            num_real_nodes, num_pad_nodes, num_real_edges, num_pad_edges = \
                self._calc_pad_limits(data_list)

        # Only the graph counts are interpolated here; the remaining
        # `{...}` fields are filled in later by `_check_if_over_size`.
        oversize_error = (
            "The fixed sizes given don't allocate enough space for the"
            " number of {type_str} required to fit"
            f" {num_real_graphs} sample(s) into a batch"
            f" ({num_pad_graphs + num_real_graphs} including extra padded"
            " graph(s)). Increase the maximum number of {type_str}, currently"
            " set to {type_value}, or set `{trim_fn}` to remove any"
            " excess {type_str} to achieve the given maximum number of"
            " {type_str}.")

        _check_if_over_size(num_pad_nodes, self.opts.num_nodes, "nodes",
                            oversize_error)
        _check_if_over_size(num_pad_edges, self.opts.num_edges, "edges",
                            oversize_error)

        num_nodes_or_edges_positive = _any_positive(
            num_pad_nodes) or _any_positive(num_pad_edges)
        if num_pad_graphs == 0 and num_nodes_or_edges_positive:
            raise RuntimeError(
                f'Requested to pad a batch to {num_all_graphs} graphs but '
                f'collater got a list of {num_real_graphs} graphs and '
                'cannot create additional graphs to pad nodes and edges.')

        # The padding graphs are tracked explicitly: slicing them back out of
        # `data_list` with `[-num_pad_graphs:]` would return real graphs
        # whenever no padding graph was created.
        padded_data_list = []
        if num_pad_graphs and num_nodes_or_edges_positive:
            data = data_list[0]
            # Divide padding nodes and edges evenly between padding graphs.
            pad_nodes_by_graph, pad_edges_by_graph = _divide_evenly(
                data, num_pad_graphs, num_pad_nodes, num_pad_edges)

            data_to_pad_dict = self._create_structure_dict(data)
            padded_data_list = [
                self._create_padded_data(data_list, data_to_pad_dict, nodes,
                                         edges)
                for nodes, edges in zip(pad_nodes_by_graph,
                                        pad_edges_by_graph)
            ]
            # Build a new list so the caller's list is left untouched.
            data_list = data_list + padded_data_list

        batch = super().__call__(data_list)
        if self.add_masks_to_batch:
            self._add_masks(batch,
                            num_all_graphs,
                            num_real_graphs,
                            num_real_nodes=num_real_nodes,
                            num_real_edges=num_real_edges,
                            padded_data_list=padded_data_list)

        return batch

    @singledispatchmethod
    def _add_masks(self, batch, num_all_graphs, num_real_graphs, **kwargs):
        """Attach padding masks to ``batch``; dispatched on the batch type.

        This fallback rejects batch types without a registered overload.
        """
        raise ValueError(f'Unsupported data type: {type(batch)}')

    @_add_masks.register(DataBatch)
    def _(self, batch: DataBatch, num_all_graphs: int, num_real_graphs: int,
          **kwargs) -> None:  # num_real_nodes: int, num_real_edges: int
        """Set `graphs_mask`, `nodes_mask` and `edges_mask` on the batch.

        Each mask is True for the leading real entries and False for the
        trailing padding, sized by the configured fixed sizes.
        """
        real_nodes = kwargs['num_real_nodes']
        real_edges = kwargs['num_real_edges']
        # All masks are computed before any of them is attached.
        masks = {
            'graphs_mask': torch.arange(num_all_graphs) < num_real_graphs,
            'nodes_mask': torch.arange(self.opts.num_nodes) < real_nodes,
            'edges_mask': torch.arange(self.opts.num_edges) < real_edges,
        }
        for mask_name, mask in masks.items():
            setattr(batch, mask_name, mask)

    @_add_masks.register(HeteroDataBatch)
    def _(self, batch: HeteroDataBatch, num_all_graphs: int,
          num_real_graphs: int,
          **kwargs) -> None:  # padded_data_list: List[HeteroDataBatch]):
        """Set `graphs_mask` on the batch and per-store node/edge masks.

        The number of padded entries of each store is summed over the graphs
        of ``padded_data_list``, matching stores by position.
        """
        padding_graphs = kwargs['padded_data_list']
        batch.graphs_mask = torch.arange(num_all_graphs) < num_real_graphs

        def padded_totals(num_stores, stores_attr, count_attr):
            # Sum, per store position, the padded entries over all the
            # padding graphs.
            totals = [0] * num_stores
            for pad_graph in padding_graphs:
                for pos, store in enumerate(getattr(pad_graph, stores_attr)):
                    totals[pos] += getattr(store, count_attr)
            return totals

        def attach(stores, totals, count_attr, mask_attr):
            # Padded entries sit at the end of each store.
            for store, num_padded in zip(stores, totals):
                size = getattr(store, count_attr)
                setattr(store, mask_attr,
                        torch.arange(size) < (size - num_padded))

        padded_nodes = padded_totals(len(batch.node_stores), 'node_stores',
                                     'num_nodes')
        padded_edges = padded_totals(len(batch.edge_stores), 'edge_stores',
                                     'num_edges')

        attach(batch.node_stores, padded_nodes, 'num_nodes', 'nodes_mask')
        attach(batch.edge_stores, padded_edges, 'num_edges', 'edges_mask')

    def _calc_pad_limits(
            self, data_list: List[BaseData]
    ) -> Union[Tuple[int, int, int, int],
               Tuple[Dict[NodeType, int], Dict[NodeType, int],
                     Dict[NodeType, int], Dict[NodeType, int]]]:
        """Return real and padding counts of nodes and edges for the batch.

        The result is ``(real_nodes, pad_nodes, real_edges, pad_edges)``,
        computed by the overload matching the type of the first graph.
        """
        # Every graph must have the same type as its neighbour, i.e. the
        # whole list is of one type.
        kinds = list(map(type, data_list))
        assert all(left == right for left, right in zip(kinds, kinds[1:]))

        head = data_list[0]
        return self._calc_pad_limits_body(head, data_list)

    @singledispatchmethod
    def _calc_pad_limits_body(self, data, data_list):  # pylint: disable=unused-argument
        """Compute the padding limits; dispatched on the type of ``data``.

        This fallback rejects data types without a registered overload.
        """
        raise ValueError(f'Unsupported data type: {type(data)}')

    @_calc_pad_limits_body.register(Data)
    def _(self, _, data_list: List[Data]) -> Tuple[int, int, int, int]:
        """Return total and remaining node/edge counts for plain graphs."""
        node_limit = self.opts.num_nodes
        real_nodes = sum(graph.num_nodes for graph in data_list)
        pad_nodes = node_limit - real_nodes

        edge_limit = self.opts.num_edges
        real_edges = sum(graph.num_edges for graph in data_list)
        pad_edges = edge_limit - real_edges

        return real_nodes, pad_nodes, real_edges, pad_edges

    @_calc_pad_limits_body.register(HeteroData)
    def _(self, _, data_list: List[HeteroData]
          ) -> Tuple[Dict[NodeType, int], Dict[NodeType, int],
                     Dict[EdgeType, int], Dict[EdgeType, int]]:
        """Return per-type total and remaining node/edge counts.

        Node counts come from the first dimension of each store's ``x`` and
        edge counts from the second dimension of ``edge_index``.
        """

        def remaining(limit, key, used, kind, option):
            # `limit` is either one value shared by all types or a mapping
            # that must contain every type found in the data.
            if not isinstance(limit, dict):
                return limit - used
            assert key in limit, (
                f"{kind} type {key} exists in the data"
                " but not in the fixed size options. Ensure"
                f" your fixed size options specify a `{option}`"
                f" for {kind.lower()} type {key}.")
            return limit[key] - used

        node_totals = dict()
        node_spare = dict()
        edge_totals = dict()
        edge_spare = dict()
        for graph in data_list:
            for node_type in graph.node_types:
                used = (node_totals.get(node_type, 0) +
                        graph[node_type].x.shape[0])
                node_totals[node_type] = used
                node_spare[node_type] = remaining(self.opts.num_nodes,
                                                  node_type, used, "Node",
                                                  "num_nodes")

            for edge_type in graph.edge_types:
                used = (edge_totals.get(edge_type, 0) +
                        graph[edge_type].edge_index.shape[1])
                edge_totals[edge_type] = used
                edge_spare[edge_type] = remaining(self.opts.num_edges,
                                                  edge_type, used, "Edge",
                                                  "num_edges")

        return node_totals, node_spare, edge_totals, edge_spare

    def _create_padded_data(
            self, data_list: List[BaseData],
            data_to_pad_dict: Dict[Union[NodeType, EdgeType, str], Any],
            num_nodes: int, num_edges: int) -> BaseData:
        """Build one padding graph with ``num_nodes`` nodes and ``num_edges``
        edges, of the same data type as the first element of ``data_list``.
        """
        template = data_list[0]
        # Pick the generator matching the template's type explicitly.
        make_blank = _generate_data_to_pad.dispatch(type(template))
        blank = make_blank(data_to_pad_dict)

        padder = Pad(max_num_nodes=num_nodes,
                     max_num_edges=num_edges,
                     node_pad_value=self.opts.node_pad_value,
                     edge_pad_value=self.opts.edge_pad_value,
                     exclude_keys=self.exclude_keys)
        padded = padder(blank)

        # Graph-level values are not handled by the Pad op, so they are
        # filled in separately.
        self._pad_graph_values(padded, template)

        return padded

    def _prune_edges(self, data_list):
        """Trim edges from ``data_list`` using the overload that matches the
        type of its first graph.
        """
        head = data_list[0]
        return self._prune_edges_body(head, data_list)

    @singledispatchmethod
    def _prune_edges_body(self, data, data_list):  # pylint: disable=unused-argument
        """Trim edges to the configured limit; dispatched on ``data``.

        This fallback rejects data types without a registered overload.
        """
        raise ValueError(f'Unsupported data type: {type(data)}')

    @_prune_edges_body.register(Data)
    def _(self, _, data_list: List[Data]) -> List[Data]:
        """Return the graphs restricted to the edges selected for keeping."""
        limit = self.opts.num_edges
        spans, keep = create_slices_and_preserve_mask(
            limit, [graph.num_edges for graph in data_list])

        # Nothing to prune: hand back the very same list.
        if spans is None:
            return data_list

        # Each graph keeps the edges flagged in its own span of the mask.
        trimmed = []
        for graph, span in zip(data_list, spans):
            trimmed.append(graph.edge_subgraph(keep[span]))
        return trimmed

    @_prune_edges_body.register(HeteroData)
    def _(self, data: HeteroData,
          data_list: List[HeteroData]) -> List[HeteroData]:
        """Return the graphs with each edge type trimmed to its own limit.

        The edge types are taken from ``data``; types that need no pruning
        are left out of the subgraph selection.
        """
        edge_types = data.edge_types
        keep_by_type = dict()
        spans_by_type = dict()

        for edge_type in edge_types:
            limit = self.opts.num_edges[edge_type]
            counts = [g[edge_type].edge_index.shape[1] for g in data_list]
            spans, keep = create_slices_and_preserve_mask(limit, counts)
            keep_by_type[edge_type] = keep
            spans_by_type[edge_type] = spans

        trimmed = []
        for idx, graph in enumerate(data_list):
            selection = {}
            for edge_type in edge_types:
                spans = spans_by_type[edge_type]
                if spans is None:
                    continue
                selection[edge_type] = keep_by_type[edge_type][spans[idx]]
            trimmed.append(graph.edge_subgraph(selection))
        return trimmed

    def _prune_nodes(self, data_list):
        """Trim nodes from ``data_list`` using the overload that matches the
        type of its first graph.
        """
        head = data_list[0]
        return self._prune_nodes_body(head, data_list)

    @singledispatchmethod
    def _prune_nodes_body(self, data, data_list):  # pylint: disable=unused-argument
        """Trim nodes to the configured limit; dispatched on ``data``.

        This fallback rejects data types without a registered overload.
        """
        raise ValueError(f'Unsupported data type: {type(data)}')

    @_prune_nodes_body.register(Data)
    def _(self, _, data_list: List[BaseData]) -> List[BaseData]:
        """Return the graphs restricted to the nodes selected for keeping.

        Raises:
            RuntimeError: If the node limit is smaller than the number of
                graphs, which would leave some graph without nodes.
        """
        num_graphs_to_trim = len(data_list)
        if self.opts.num_nodes < num_graphs_to_trim:
            raise RuntimeError(
                f'The number of nodes to trim to ({self.opts.num_nodes})'
                ' is less than the number of graphs in the batch'
                f' ({num_graphs_to_trim}), which would result in empty'
                ' graphs.')

        nodes_slices, preserve_nodes_mask = create_slices_and_preserve_mask(
            self.opts.num_nodes, [d.num_nodes for d in data_list])

        # There is nothing to prune.
        if nodes_slices is None:
            return data_list

        # Apply the preservation masks to the data_list to finally trim nodes.
        return [
            data.subgraph(preserve_nodes_mask[node_slice])
            for data, node_slice in zip(data_list, nodes_slices)
        ]

    @_prune_nodes_body.register(HeteroData)
    def _(self, data: HeteroData,
          data_list: List[HeteroData]) -> List[HeteroData]:
        """Return the graphs with each node type trimmed to its own limit.

        The limits are computed for the node types of ``data``; types that
        need no pruning are left out of the subgraph selection.

        Raises:
            RuntimeError: If the limit of some node type is smaller than the
                number of graphs, which would leave some graph without nodes
                of that type.
        """
        node_types = data.node_types
        num_graphs_to_trim = len(data_list)
        preserve_nodes_masks_dict = dict()
        node_slices_dict = dict()

        for node_type in node_types:
            node_limit = self.opts.num_nodes[node_type]
            if node_limit < num_graphs_to_trim:
                # Report the limit of the offending type, not the whole
                # mapping of limits.
                raise RuntimeError(
                    f'The number of nodes to trim to ({node_limit})'
                    f' for node type {node_type} is less than the number'
                    f' of graphs in the batch ({num_graphs_to_trim}), which'
                    ' would result in empty graphs.')
            node_slices, preserve_nodes_mask = create_slices_and_preserve_mask(
                node_limit, [d[node_type].num_nodes for d in data_list])
            preserve_nodes_masks_dict[node_type] = preserve_nodes_mask
            node_slices_dict[node_type] = node_slices

        # Each graph is filtered over its own node types.
        return [
            graph.subgraph({
                node_type: preserve_nodes_masks_dict[node_type][
                    node_slices_dict[node_type][idx]]
                for node_type in graph.node_types
                if node_slices_dict[node_type] is not None
            }) for idx, graph in enumerate(data_list)
        ]

    @singledispatchmethod
    def _create_structure_dict(self, data):
        """Describe the layout of ``data`` as a dict of defaulted values.

        The keys mirror the keys of ``data``. This fallback rejects data
        types without a registered overload.
        """
        raise ValueError(f'Unsupported data type: {type(data)}')

    @_create_structure_dict.register(Data)
    def _(self, data: Data) -> Dict[NodeType, Any]:
        """Collect the reset node and edge attributes of a plain graph.

        With node-level labels, `y` is always included.
        """
        node_level_labels = self.labels_type == self.LabelsType.NODE_LVL

        def is_padded_key(key):
            # `y` is tested first so that, for node-level labels, the
            # attribute lookups below are skipped for it.
            if node_level_labels and key == 'y':
                return True
            return data.is_node_attr(key) or data.is_edge_attr(key)

        return {
            key: _reset_attr(val, key)
            for key, val in data.to_dict().items() if is_padded_key(key)
        }

    @_create_structure_dict.register(HeteroData)
    def _(self, data: HeteroData) -> Dict[Union[NodeType, EdgeType], Any]:
        """Collect the reset global attributes plus, for every node and edge
        store, zero tensors standing in for its tensor attributes.
        """
        structure = dict()
        global_items = data._global_store.to_dict().items()  # pylint: disable=protected-access
        for key, attr in global_items:
            structure[key] = _reset_attr(attr)

        for key, store in chain(data.node_items(), data.edge_items()):
            blanks = {}
            for name, value in store.to_dict().items():
                # Non-tensor attributes are not part of the structure.
                if not isinstance(value, torch.Tensor):
                    continue
                blanks[name] = torch.zeros(_reset_dim(value.shape, name),
                                           dtype=data[key][name].dtype)
            structure[key] = blanks
        return structure

    @singledispatchmethod
    def _pad_graph_values(self, padded_data, original_data):
        """Fill the graph-level values of ``padded_data`` based on the keys
        of ``original_data``; dispatched on the type of ``padded_data``.

        This fallback rejects data types without a registered overload.
        """
        raise ValueError(
            f'Unsupported pair of data types: {type(padded_data)}, '
            f'{type(original_data)}')

    @_pad_graph_values.register(Data)
    def _(self, padded_data: Data, original_data: Data) -> None:
        """Pad every key of ``original_data`` that is a graph-level value.

        A key is graph-level when it is neither a node nor an edge attribute
        of ``original_data``. With graph-level labels, `y` is always treated
        as graph-level, even when its size makes it look like a node or edge
        attribute of ``original_data``. Keys listed in ``exclude_keys`` are
        skipped.
        """
        # The special case applies to graph-level labels. (Testing for
        # NODE_LVL here while requiring GRAPH_LVL for `y` could never match.)
        graph_level_labels = self.labels_type == self.LabelsType.GRAPH_LVL
        # `exclude_keys` defaults to None in the constructor.
        excluded = self.exclude_keys or ()

        def is_graph_value(key):
            if graph_level_labels and key == 'y':
                return True
            return not (original_data.is_node_attr(key)
                        or original_data.is_edge_attr(key))

        for key, value in original_data():
            if key in excluded:
                continue
            if is_graph_value(key):
                self._pad_graph_values_body(padded_data, original_data, key,
                                            value)

    @_pad_graph_values.register(HeteroData)
    def _(self, padded_data: HeteroData, original_data: HeteroData) -> None:
        """Pad every key of the global store of ``original_data``, skipping
        the keys listed in ``exclude_keys``.
        """
        # `exclude_keys` defaults to None in the constructor.
        excluded = self.exclude_keys or ()
        for key, value in original_data._global_store.items():  # pylint: disable=protected-access
            if key in excluded:
                continue
            self._pad_graph_values_body(padded_data, original_data, key, value)

    def _pad_graph_values_body(self, padded_data: BaseData,
                               original_data: BaseData, key: Any,
                               value: Any) -> None:
        """Store the padding value for the graph-level ``key``.

        Tensors are replaced by a tensor of the same shape filled with
        ``graph_pad_value``. Other values use the entry of
        ``pad_graph_defaults`` for ``key``, falling back to the value held by
        ``original_data``.
        """
        if torch.is_tensor(value):
            shape = list(value.shape)
            fill = self.opts.graph_pad_value
            padded_data[key] = value.new_full(shape, fill)
            return

        defaults = self.opts.pad_graph_defaults
        padded_data[key] = defaults.get(key, original_data[key])


class CombinedBatchingCollater:
    r"""Collates a combined batch, whose size is :obj:`mini_batch_size *
    device_iterations * replication_factor * gradient_accumulation`, by
    collating each mini-batch separately and merging the results.

    This class is intended to be used in combination with the
    :class:`poptorch.DataLoader`.

    Args:
        collater (Collater): The collater transforming the list of
            :class:`torch_geometric.data.Data` objects to a
            :obj:`torch_geometric.data.Batch` object.
        mini_batch_size (int, optional): The size of a mini-batch. If not
            provided, the length of the list provided when calling an instance
            of this class is used. (default: :obj:`None`)
    """

    def __init__(
            self,
            collater: Collater,
            mini_batch_size: Optional[int] = None,
    ) -> None:
        super().__init__()
        self.mini_batch_size = mini_batch_size
        self.collater = collater
        self.parser = types.PyGArgsParser()

    def __call__(self, batch: List[BaseData]) -> Batch:
        """Collate ``batch`` one mini-batch at a time and merge the tensors.

        The number of items must be a multiple of the mini-batch size.
        """
        num_items = len(batch)
        mini_batch_size = self.mini_batch_size
        if mini_batch_size is None:
            # No explicit size: the whole input forms a single mini-batch.
            mini_batch_size = num_items

        assert num_items % mini_batch_size == 0, (
            'Invalid batch size. '
            f'Got {num_items} graphs and '
            f'`mini_batch_size={mini_batch_size}`.')

        num_mini_batches = num_items // mini_batch_size

        # Collate consecutive chunks of `mini_batch_size` items.
        collated = []
        for index in range(num_mini_batches):
            begin = index * mini_batch_size
            chunk = batch[begin:begin + mini_batch_size]
            collated.append(self.collater(chunk))

        # Gather the tensors of every collated mini-batch.
        tensors_per_batch = []
        for mini_batch in collated:
            tensors_per_batch.append(
                list(self.parser.yieldTensors(mini_batch)))

        # The first mini-batch serves as the template for the result.
        return self.parser.reconstruct(
            collated[0], combine_batch_tensors_gen(tensors_per_batch))


================================================
FILE: poptorch_geometric/python/common.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from torch_geometric.data import Batch, Data, HeteroData

# Concrete batch classes, obtained by instantiating `Batch` for each base
# class and taking the type of the resulting object. Presumably the classes
# are generated at runtime by PyG and cannot be imported directly - TODO
# confirm.
DataBatch = type(Batch(_base_cls=Data))
HeteroDataBatch = type(Batch(_base_cls=HeteroData))


def call_once(f):
    """Decorate ``f`` so that only its first invocation is executed.

    The first call runs ``f`` and returns its result; every later call
    returns ``None`` without running ``f``. The call is recorded before ``f``
    runs, so ``f`` is not retried even if the first invocation raises. The
    state is exposed as the ``has_run`` attribute of the returned wrapper.

    Args:
        f: The callable to guard.

    Returns:
        The wrapper, carrying the name and docstring of ``f``.
    """
    import functools  # Local import keeps this helper self-contained.

    # Preserve the metadata of `f` (name, docstring, ...) on the wrapper.
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        if not wrapper.has_run:
            wrapper.has_run = True
            return f(*args, **kwargs)
        return None

    wrapper.has_run = False
    return wrapper


================================================
FILE: poptorch_geometric/python/dataloader.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from __future__ import annotations  # noqa: F407

from typing import List, Optional, Tuple, Union

from torch_geometric.data import Dataset

import poptorch
from poptorch_geometric.collate import CombinedBatchingCollater
from poptorch_geometric.fixed_size_options import FixedSizeOptions
from poptorch_geometric.pyg_dataloader import DataLoader as PyGDataLoader
from poptorch_geometric.pyg_dataloader import FixedSizeDataLoader as PyGFixedSizeDataLoader
from poptorch_geometric.pyg_dataloader import FixedSizeStrategy, OverSizeStrategy


class DataLoader(PyGDataLoader, poptorch.DataLoader):
    r"""Data loader merging the data objects of a
    :py:class:`torch_geometric.data.Dataset` into mini-batches.

    The data objects can be of type :py:class:`~torch_geometric.data.Data` or
    :py:class:`~torch_geometric.data.HeteroData`.

    Args:
        dataset (Dataset): The dataset from which to load the data.
        batch_size (int, optional): How many samples per batch to load.
            (default: :obj:`1`)
        shuffle (bool, optional): If set to :obj:`True`, the data will be
            reshuffled at every epoch. (default: :obj:`False`)
        follow_batch (List[str], optional): Creates assignment batch
            vectors for each key in the list. (default: :obj:`None`)
        exclude_keys (List[str], optional): Will exclude each key in the
            list. (default: :obj:`None`)
        options (poptorch.Options, optional): The additional PopTorch options
            to be passed to :py:class:`poptorch.DataLoader`. Default options
            are created when omitted. (default: :obj:`None`)
        **kwargs (optional): Additional arguments of
            :py:class:`poptorch.DataLoader`.
    """

    def __init__(
            self,
            dataset: Dataset,
            batch_size: int = 1,
            shuffle: bool = False,
            follow_batch: Optional[Union[List[str], Tuple[str, ...]]] = None,
            exclude_keys: Optional[Union[List[str], Tuple[str, ...]]] = None,
            options: Optional[poptorch.Options] = None,
            **kwargs,
    ):
        # Assigned before the parent constructors run; `_create_collater`
        # reads it (presumably while the parents initialise - TODO confirm).
        self.batch_size = batch_size

        options = poptorch.Options() if options is None else options

        super().__init__(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            follow_batch=follow_batch,
            exclude_keys=exclude_keys,
            options=options,
            **kwargs,
        )

    def _create_collater(self, **collater_args):
        """Wrap the parent's collater so that it is applied per mini-batch
        of ``batch_size`` items.
        """
        inner = super()._create_collater(**collater_args)
        return CombinedBatchingCollater(mini_batch_size=self.batch_size,
                                        collater=inner)


class FixedSizeDataLoader(PyGFixedSizeDataLoader, poptorch.DataLoader):
    r"""Data loader merging the data objects of a dataset into mini-batches
    and padding node and edge features, so that the tensors of all the
    mini-batches have the same shapes.

    The data objects can be of type :py:class:`~torch_geometric.data.Data`
    or :py:class:`~torch_geometric.data.HeteroData`.

    Args:
        dataset (Dataset): The :class:`~torch_geometric.data.Dataset` instance
            from which to load the graph samples.
        batch_size (int, optional): The number of graph samples to load in each
            mini-batch. This should be at least :obj:`2` to allow for creating
            at least one padding graph. (default: :obj:`2`)
        shuffle (bool, optional): If :obj:`True`, the data will be
            reshuffled at every epoch. (default: :obj:`False`)
        fixed_size_options (FixedSizeOptions, optional): A
            :py:class:`poptorch_geometric.fixed_size_options.FixedSizeOptions`
            object which holds the maximum number of nodes, edges and other
            options required to pad the mini-batches, produced by the data
            loader, to a fixed size. If not specified, this will be determined
            from the provided dataset. (default: :obj:`None`)
        fixed_size_strategy (FixedSizeStrategy, optional): The
            strategy to use to achieve fixed-size mini-batches. By default,
            each mini-batch will contain a fixed number of real graphs
            (`batch_size` - 1) plus one single graph for padding.
            (default: `poptorch_geometric.FixedSizeStrategy.PadToMax`)
        over_size_strategy (OverSizeStrategy, optional): The
            behaviour if a sample cannot fit in the fixed-size mini-batch.
            By default, if the required number of samples cannot fit into the
            fixed-sized batch an error will be raised.
            (default: `poptorch_geometric.OverSizeStrategy.Error`)
        add_pad_masks (bool, optional): If :obj:`True`, mask objects
            are attached to the mini-batch result. They represent three
            levels of padding:

            - :obj:`graphs_mask` - graph level mask
            - :obj:`nodes_mask`  - node level mask
            - :obj:`edges_mask`  - edge level mask

            Mask objects indicate which elements in the mini-batch are real
            (represented by :obj:`True`) and which were added as
            padding (represented by :obj:`False`).
            (default: :obj:`True`)
        follow_batch (list or tuple, optional): Creates assignment batch
            vectors for each key in the list. (default: :obj:`None`)
        exclude_keys (list or tuple, optional): Keys to exclude from the
            batch. (default: :obj:`None`)
        options (poptorch.Options, optional): The additional PopTorch options
            to be passed to :py:class:`poptorch.DataLoader`. Default options
            are created when omitted. (default: :obj:`None`)
        **kwargs (optional): Additional arguments of
            :py:class:`poptorch.DataLoader`.
    """

    def __init__(
            self,
            dataset: Dataset,
            batch_size: int = 2,
            shuffle: bool = False,
            fixed_size_options: Optional[FixedSizeOptions] = None,
            fixed_size_strategy: FixedSizeStrategy = FixedSizeStrategy.
            PadToMax,
            over_size_strategy: OverSizeStrategy = OverSizeStrategy.Error,
            add_pad_masks: Optional[bool] = True,
            follow_batch: Optional[Union[List[str], Tuple[str, ...]]] = None,
            exclude_keys: Optional[Union[List[str], Tuple[str, ...]]] = None,
            options: Optional[poptorch.Options] = None,
            **kwargs,
    ):
        # Fall back to the default IPU options.
        options = poptorch.Options() if options is None else options

        super().__init__(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            fixed_size_options=fixed_size_options,
            fixed_size_strategy=fixed_size_strategy,
            over_size_strategy=over_size_strategy,
            add_pad_masks=add_pad_masks,
            follow_batch=follow_batch,
            exclude_keys=exclude_keys,
            options=options,
            **kwargs,
        )

    def _create_collater(self, **collater_args):
        """Wrap the parent's collater so that it is applied per mini-batch.

        When a batch sampler is set, no mini-batch size is passed on, so each
        list handed to the collater is collated as a single mini-batch.
        Otherwise a mini-batch holds ``padded_batch_size - 1`` items.
        """
        inner = super()._create_collater(**collater_args)
        size = (None if self.batch_sampler is not None else
                self.padded_batch_size - 1)
        return CombinedBatchingCollater(mini_batch_size=size, collater=inner)


================================================
FILE: poptorch_geometric/python/fixed_size_options.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
from typing import Any, Dict, List, Optional, Union

from torch.utils.data import DataLoader
from torch_geometric.data import Dataset, HeteroData
from torch_geometric.data.summary import Summary
from torch_geometric.typing import EdgeType, NodeType


def validate_num_graphs(num_graphs):
    """Check that a batch has room for a real graph and a padding graph.

    Args:
        num_graphs: The number of graphs in the (padded) batch.

    Raises:
        ValueError: If ``num_graphs`` is smaller than 2.
    """
    too_few = num_graphs < 2
    if too_few:
        message = ("The number of graphs in the batch must be"
                   " at least 2. This is to ensure the batch"
                   " contains at least 1 real graph and a graph"
                   " reserved for padding the batch to a fixed size.")
        raise ValueError(message)


class FixedSizeOptions:
    r"""Class that holds the specification of how a data loader can be
    padded up to a fixed size. This includes the number of nodes and
    edges to pad a batch, produced using this specification, to a
    maximum number.

    Args:
        num_nodes (int or dict): The number of nodes after
            padding a batch.
            In heterogeneous graphs, this can be a dictionary denoting
            the number of nodes for specific node types.
        num_edges (int or dict, optional): The number of edges after
            padding a batch.
            In heterogeneous graphs, this can be a dictionary denoting the
            number of edges for specific edge types.
            Any falsy value (:obj:`None`, :obj:`0`, an empty dict) selects
            the default, which assumes a fully connected graph over the
            total number of nodes.
            (default: :obj:`num_nodes * (num_nodes - 1)`)
        num_graphs (int, optional): The total number of graphs
            in the padded batch. This should be at least :obj:`2` to allow
            for creating at least one padding graph. The default value is
            :obj:`2` accounting for a single real graph and a single padded
            graph in a batch.
            (default: :obj:`2`)
        node_pad_value (float, optional): The fill value to use for node
            features. (default: :obj:`0.0`)
        edge_pad_value (float, optional): The fill value to use for edge
            features. (default: :obj:`0.0`)
        graph_pad_value (float, optional): The fill value to use for graph
            features. (default: :obj:`0.0`)
        pad_graph_defaults (dict, optional): The default values that
            will be assigned to the keys of types different to
            :class:`torch.Tensor` in the newly created padding graphs.
            (default: :obj:`None`)

    Raises:
        ValueError: If :obj:`num_graphs` is smaller than :obj:`2`.
    """

    def __init__(self,
                 num_nodes: Union[int, Dict[NodeType, int]],
                 num_edges: Optional[Union[int, Dict[EdgeType, int]]] = None,
                 num_graphs: int = 2,
                 node_pad_value: Optional[float] = None,
                 edge_pad_value: Optional[float] = None,
                 graph_pad_value: Optional[float] = None,
                 pad_graph_defaults: Optional[Dict[str, Any]] = None):
        self.num_nodes = num_nodes

        # This is deliberately kept as a truthiness test: ``0`` and an
        # empty dict are treated like ``None`` and select the default.
        if num_edges:
            self.num_edges = num_edges
        else:
            # Assume fully connected graph.
            total_num_nodes = (sum(num_nodes.values()) if isinstance(
                num_nodes, dict) else num_nodes)
            self.num_edges = total_num_nodes * (total_num_nodes - 1)

        validate_num_graphs(num_graphs)
        self.num_graphs = num_graphs

        # Lazily computed caches used by the ``total_num_*`` properties.
        self.total_num_nodes_hetero = None
        self.total_num_edges_hetero = None

        self.node_pad_value = 0.0 if node_pad_value is None else node_pad_value
        self.edge_pad_value = 0.0 if edge_pad_value is None else edge_pad_value
        self.graph_pad_value = (0.0 if graph_pad_value is None else
                                graph_pad_value)
        self.pad_graph_defaults = ({} if pad_graph_defaults is None else
                                   pad_graph_defaults)

    def is_hetero(self):
        """Returns whether the specified number of nodes and edges are
        in heterogeneous form, ie a number for each node and edge type."""
        return (isinstance(self.num_nodes, dict)
                and isinstance(self.num_edges, dict))

    def to_hetero(self, node_types: List[NodeType],
                  edge_types: List[EdgeType]):
        """Converts a single specified number of nodes and edges to
        a heterogeneous form, a number for each node and edge type.

        Values that are already dictionaries are left untouched. The
        object is modified in place and returned."""
        if not isinstance(self.num_nodes, dict):
            self.num_nodes = {k: self.num_nodes for k in node_types}
        if not isinstance(self.num_edges, dict):
            self.num_edges = {k: self.num_edges for k in edge_types}
        return self

    @property
    def total_num_nodes(self):
        """The total number of nodes summed for all the node types.

        In non-heterogeneous form this is simply ``num_nodes``."""
        if self.is_hetero():
            if self.total_num_nodes_hetero is None:
                self.total_num_nodes_hetero = sum(self.num_nodes.values())
            return self.total_num_nodes_hetero
        return self.num_nodes

    @property
    def total_num_edges(self):
        """The total number of edges summed for all the edge types.

        In non-heterogeneous form this is simply ``num_edges``."""
        if self.is_hetero():
            if self.total_num_edges_hetero is None:
                self.total_num_edges_hetero = sum(self.num_edges.values())
            return self.total_num_edges_hetero
        return self.num_edges

    @classmethod
    def from_dataset(cls,
                     dataset: Dataset,
                     batch_size: int,
                     sample_limit: Optional[int] = None,
                     progress_bar: Optional[bool] = None):
        """Returns a `FixedSizeOptions` object which is a valid set of
        options for the given dataset, ensuring that the number of nodes
        and edges allocated are enough for the dataset given a particular
        batch size.

        Args:
            dataset: The dataset to summarise.
            batch_size: The number of graphs in a padded batch; must be at
                least 2.
            sample_limit: Accepted for interface compatibility only. It is
                not used: the summary is always computed over the whole
                dataset.
            progress_bar: Forwarded to ``Summary.from_dataset``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 2.
        """

        validate_num_graphs(batch_size)

        dataset_summary = Summary.from_dataset(dataset,
                                               progress_bar=progress_bar)

        def get_max_for_batch_size(batch_size, sample_max):
            # ``batch_size - 1`` samples of the largest size seen, plus
            # one extra element.
            return int(sample_max) * (batch_size - 1) + 1

        if dataset_summary.num_nodes_per_type:
            max_nodes_per_batch = {
                k: get_max_for_batch_size(batch_size, v.max)
                for k, v in dataset_summary.num_nodes_per_type.items()
            }
        else:
            max_nodes_per_batch = get_max_for_batch_size(
                batch_size, dataset_summary.num_nodes.max)

        if dataset_summary.num_edges_per_type:
            max_edges_per_batch = {
                k: get_max_for_batch_size(batch_size, v.max)
                for k, v in dataset_summary.num_edges_per_type.items()
            }
        else:
            max_edges_per_batch = get_max_for_batch_size(
                batch_size, dataset_summary.num_edges.max)

        # Build through ``cls`` so that subclasses get their own type back.
        return cls(
            num_nodes=max_nodes_per_batch,
            num_edges=max_edges_per_batch,
            num_graphs=batch_size,
        )

    @classmethod
    def from_loader(cls, loader: DataLoader, sample_limit: int = 1000):
        """Returns a `FixedSizeOptions` object which is a valid set of
        options for the given data loader, ensuring that the number of nodes
        and edges allocated are approximately enough for the mini-batches
        produced by this data loader. As the underlying loader is unlikely
        to produce an exhaustive combination of samples in a mini-batch,
        the `FixedSizeOptions` returned can only be an approximation of the
        maximum values required.

        Args:
            loader: The data loader to sample mini-batches from. It is
                iterated repeatedly until ``sample_limit`` mini-batches
                have been inspected.
            sample_limit: The number of mini-batches to inspect.
        """

        is_hetero_data = isinstance(next(iter(loader)), HeteroData)

        max_num_graphs = 0
        max_num_nodes = {} if is_hetero_data else 0
        max_num_edges = {} if is_hetero_data else 0

        def loop_with_limit(loader, limit):
            count = 0
            while True:
                produced = False
                for sample in loader:
                    if count >= limit:
                        return
                    count += 1
                    produced = True
                    yield sample
                if not produced:
                    # A pass that yields nothing (e.g. an exhausted one-shot
                    # iterator) can never reach the limit; stop instead of
                    # spinning forever.
                    return

        for data in loop_with_limit(loader, sample_limit):
            if is_hetero_data:
                for node_type in data.node_types:
                    max_num_nodes[node_type] = max(
                        max_num_nodes.get(node_type, 0),
                        data[node_type].num_nodes)
                for edge_type in data.edge_types:
                    max_num_edges[edge_type] = max(
                        max_num_edges.get(edge_type, 0),
                        data[edge_type].num_edges)
            else:
                max_num_nodes = max(max_num_nodes, data.num_nodes)
                max_num_edges = max(max_num_edges, data.num_edges)

            # A sample without ``num_graphs`` counts as a single graph; keep
            # a running maximum rather than resetting the value.
            max_num_graphs = max(max_num_graphs,
                                 getattr(data, "num_graphs", 1))

        # Allocate space for padding
        max_num_graphs += 1
        if is_hetero_data:
            max_num_nodes = {k: v + 1 for k, v in max_num_nodes.items()}
            max_num_edges = {k: v + 1 for k, v in max_num_edges.items()}
        else:
            max_num_nodes += 1
            max_num_edges += 1

        # Build through ``cls`` so that subclasses get their own type back.
        return cls(
            num_nodes=max_num_nodes,
            num_edges=max_num_edges,
            num_graphs=max_num_graphs,
        )

    def __repr__(self):
        return (f"{self.__class__.__name__}("
                f"num_nodes={self.num_nodes}"
                " (At least one node reserved for padding), "
                f"num_edges={self.num_edges}"
                " (At least one edge reserved for padding), "
                f"num_graphs={self.num_graphs}"
                " (At least one graph reserved for padding), "
                f"node_pad_value={self.node_pad_value}, "
                f"edge_pad_value={self.edge_pad_value}, "
                f"graph_pad_value={self.graph_pad_value}, "
                f"pad_graph_defaults={self.pad_graph_defaults})")


================================================
FILE: poptorch_geometric/python/masker.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
"""
Provides an interface that reduces coupling between padding which
happens in the dataloader and the masking which needs to happen in
the model.

The idea is fairly simple: the dataloader defines the masking strategy
for nodes, edges, and graphs. The IPU GNNs consume that interface, and
it is easy to make the mask operations no-ops for compatibility with other
hardware.

### Expected usage pattern

```python
import torch_geometric as pyg
from torch import nn
import poptorch

class IpuGNN(pyg.SomeGNN):

    def __init__(self, masker: Masker = NoMasker(), *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.masker = masker  # New line to support this pattern
        self.loss = nn.CrossEntropyLoss()

    def forward(self, node_mask, y, *args, **kwargs):
        '''Common poptorch usage pattern of needing to put the loss in the
        forward'''
        node_prediction = super().forward(*args, **kwargs)
        # clear interface for model code to program to
        masked_pred = self.masker.node_masker(node_prediction, node_mask)
        if self.training:
            return masked_pred, self.loss(masked_pred, y)
        return masked_pred


options = poptorch.Options()
dataloader = poptorch_geometric.create_dataloader(
    dataset=dataset,
    num_nodes=6000,
    options=options,
    fixed_size=True,
    collater_args={
        "num_edges": 12000,
    },
)

model = IpuGNN(dataloader.masker)
train_model = poptorch.TrainingModel(model, options=options, ...)

for data in dataloader:
    # Need to pass the mask as an extra argument.
    train_model(data.node_mask, data.y, ...)
```

### Expected benefit

The big benefit is it pushes the responsibility of writing the masking
functions to the same piece of code that also implements the padding and
generates the node mask.

It means consumers of a dataloader don't have to worry about implementation
details.
"""
import abc
from typing import Callable, Optional, Tuple, Union

import torch

# Either a single tensor or a tuple of tensors that are masked together.
Entries = Union[torch.Tensor, Tuple[torch.Tensor, ...]]
# An optional mask tensor; every masker method defaults it to ``None``.
Mask = Optional[torch.Tensor]
# A layer: a callable taking a tensor and returning a tensor.
Layer = Callable[[torch.Tensor], torch.Tensor]
# Return type of the ``LayerMasker`` methods; same alias as ``Layer``.
DecoratedLayer = Callable[[torch.Tensor], torch.Tensor]


class Masker(abc.ABC):
    """
    The masker provides a way to decouple the model from the
    implementation of the dataloading. We provide a stable interface
    for masking padded data and graphs.

    Dataloaders that implement padding should also generate masking
    functions for you, either by implementing this :class:`Masker` interface
    or by composing a `layer_mask` attribute into the class. Models which are
    compatible can then use those masks as intermediate layers before the loss
    or before pooling operations to avoid the back propagation:

    ```python
    class Net(Module):
        def __init__(self, layer_mask):
            self.node_layer = pyg.GraphConv()
            self.masker = layer_mask
            self.loss = nn.loss()

        def forward(self, x, y, mask):
            x = self.node_layer(x)
            x = self.node_layer(x)
            x = self.node_layer(x)
            pred = self.masker.node_masker(x, mask=mask)
            return loss(y, pred)
    ```

    By implementing this interface we let the user change their dataloading
    pipeline without having to go into the code of the model.

    Note:
        Code in the node, edge and graph masker will be run on the IPU and
        needs to be compatible with torch.jit.trace.
    """

    @abc.abstractmethod
    def node_masker(self, node_entries: Entries, mask: Mask = None) -> Entries:
        """Masks out nodes which were added by padding/batching/clustering.

        Args:
            node_entries: A tensor, or a tuple of tensors, to mask.
            mask: The optional mask tensor to apply.

        Returns:
            The masked entries.
        """

    @abc.abstractmethod
    def edge_masker(self, edge_entries: Entries, mask: Mask = None) -> Entries:
        """Masks out edges which were added by padding/batching/clustering.

        Args:
            edge_entries: A tensor, or a tuple of tensors, to mask.
            mask: The optional mask tensor to apply.

        Returns:
            The masked entries.
        """

    @abc.abstractmethod
    def graph_masker(self, graph_entries: Entries,
                     mask: Mask = None) -> Entries:
        """Masks out graphs which were added by padding/batching/clustering.

        Args:
            graph_entries: A tensor, or a tuple of tensors, to mask.
            mask: The optional mask tensor to apply.

        Returns:
            The masked entries.
        """


class NoMasker(Masker):
    """A null op masker to give when masking is unnecessary.

    Every method returns its entries unchanged and ignores ``mask``.
    """

    def node_masker(self, node_entries: Entries, mask: Mask = None) -> Entries:
        """Return ``node_entries`` untouched; ``mask`` is ignored."""
        return node_entries

    def edge_masker(self, edge_entries: Entries, mask: Mask = None) -> Entries:
        """Return ``edge_entries`` untouched; ``mask`` is ignored."""
        return edge_entries

    def graph_masker(self, graph_entries: Entries,
                     mask: Mask = None) -> Entries:
        """Return ``graph_entries`` untouched; ``mask`` is ignored."""
        return graph_entries


class LayerMasker(abc.ABC):
    """
    Decorates layers so that their inputs are masked before the layer runs,
    keeping the model independent of how the dataloading pads its data
    and graphs.

    Note:
        This is an alternative proposal to the :class:`Masker` above.
        Instead of calling a masker in between layers, the layers
        themselves are decorated.

        Decoration can help when a lot of masking is needed: layers
        defined in the `__init__` of a `Module` are wrapped there, so the
        forward method does not have to change.

    The default implementations below are sufficient for layers whose
    tensor arguments are all masked according to the same attribute (node,
    edge, or graph). They do not handle a layer which needs two tensors,
    one related to edges and one related to nodes.
    """

    def __init__(self, masker: Masker) -> None:
        super().__init__()
        # The masker whose node/edge/graph methods are applied to the
        # positional arguments of every decorated layer.
        self.masker = masker

    @abc.abstractmethod
    def node_masker(self, layer: Layer) -> DecoratedLayer:
        """Wrap ``layer`` so its positional arguments are node-masked."""

        def masked_layer(*args, mask=None):
            masked_args = self.masker.node_masker(args, mask=mask)
            return layer(*masked_args)

        return masked_layer

    @abc.abstractmethod
    def edge_masker(self, layer: Layer) -> DecoratedLayer:
        """Wrap ``layer`` so its positional arguments are edge-masked."""

        def masked_layer(*args, mask=None):
            masked_args = self.masker.edge_masker(args, mask=mask)
            return layer(*masked_args)

        return masked_layer

    @abc.abstractmethod
    def graph_masker(self, layer: Layer) -> DecoratedLayer:
        """Wrap ``layer`` so its positional arguments are graph-masked."""

        def masked_layer(*args, mask=None):
            masked_args = self.masker.graph_masker(args, mask=mask)
            return layer(*masked_args)

        return masked_layer


class PreLayerMasker(LayerMasker):
    """Simplest Layer masker.

    Each method delegates to the default implementation that
    :class:`LayerMasker` provides for its abstract methods.
    """

    # pylint: disable=useless-super-delegation
    def node_masker(self, layer: Layer) -> DecoratedLayer:
        """Return ``layer`` wrapped by the parent's node masking."""
        return super().node_masker(layer)

    def edge_masker(self, layer: Layer) -> DecoratedLayer:
        """Return ``layer`` wrapped by the parent's edge masking."""
        return super().edge_masker(layer)

    def graph_masker(self, layer: Layer) -> DecoratedLayer:
        """Return ``layer`` wrapped by the parent's graph masking."""
        return super().graph_masker(layer)


================================================
FILE: poptorch_geometric/python/neighbor_loader.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from typing import Callable, Dict, List, Optional, Tuple, Union

import torch

from torch_geometric.data import Data, FeatureStore, GraphStore, HeteroData
from torch_geometric.data.data import BaseData
from torch_geometric.loader import NeighborLoader
from torch_geometric.loader.utils import get_input_nodes
from torch_geometric.sampler import NeighborSampler
from torch_geometric.sampler.base import SubgraphType
from torch_geometric.typing import EdgeType, InputNodes, OptTensor

import poptorch

from poptorch_geometric.collate import FixedSizeCollater
from poptorch_geometric.collate import CombinedBatchingCollater
from poptorch_geometric import OverSizeStrategy
from poptorch_geometric.fixed_size_options import FixedSizeOptions


class PyGFixedSizeNeighborLoader(torch.utils.data.DataLoader):
    """Neighbour-sampling data loader producing fixed-size mini-batches.

    The loader iterates over a range of input-node indices. Each batch of
    indices is passed to a wrapped
    :class:`torch_geometric.loader.NeighborLoader` to sample a sub-graph,
    which is then padded (and, depending on ``over_size_strategy``, trimmed)
    by a :class:`FixedSizeCollater`.

    If ``fixed_size_options`` is ``None`` the sizes are estimated with
    :py:meth:`FixedSizeOptions.from_loader` on the wrapped loader.
    ``options`` is only forwarded to the parent constructor when it is not
    ``None``. Remaining ``**kwargs`` are passed both to the wrapped
    ``NeighborLoader`` and to the parent constructor.
    """

    def __init__(
            self,
            data: Union[Data, HeteroData, Tuple[FeatureStore, GraphStore]],
            num_neighbors: Union[List[int], Dict[EdgeType, List[int]]],
            input_nodes: InputNodes = None,
            input_time: OptTensor = None,
            replace: bool = False,
            directed: bool = True,
            disjoint: bool = False,
            temporal_strategy: str = 'uniform',
            time_attr: Optional[str] = None,
            transform: Optional[Callable] = None,
            transform_sampler_output: Optional[Callable] = None,
            is_sorted: bool = False,
            filter_per_worker: bool = True,
            subgraph_type: SubgraphType = SubgraphType.directional,
            batch_size: int = 1,
            neighbor_sampler: Optional[NeighborSampler] = None,
            over_size_strategy: OverSizeStrategy = OverSizeStrategy.
            TrimNodesAndEdges,
            fixed_size_options: Optional[FixedSizeOptions] = None,
            add_pad_masks: Optional[bool] = False,
            follow_batch: Optional[Union[List[str], Tuple[str, ...]]] = None,
            exclude_keys: Optional[Union[List[str], Tuple[str, ...]]] = None,
            options: Optional[poptorch.Options] = None,
            **kwargs,
    ):
        kwargs['batch_size'] = batch_size
        self.neighbour_loader = NeighborLoader(
            data,
            num_neighbors,
            input_nodes=input_nodes,
            input_time=input_time,
            replace=replace,
            subgraph_type=subgraph_type,
            directed=directed,
            disjoint=disjoint,
            temporal_strategy=temporal_strategy,
            time_attr=time_attr,
            transform=transform,
            transform_sampler_output=transform_sampler_output,
            is_sorted=is_sorted,
            filter_per_worker=filter_per_worker,
            neighbor_sampler=neighbor_sampler,
            **kwargs)
        self.input_type, input_nodes = get_input_nodes(data, input_nodes)

        if fixed_size_options is None:
            fixed_size_options = FixedSizeOptions.from_loader(
                self.neighbour_loader)

        trim_nodes = over_size_strategy in (
            OverSizeStrategy.TrimNodes, OverSizeStrategy.TrimNodesAndEdges)
        trim_edges = over_size_strategy in (
            OverSizeStrategy.TrimEdges, OverSizeStrategy.TrimNodesAndEdges)
        collater_args = {
            'fixed_size_options': fixed_size_options,
            'add_masks_to_batch': add_pad_masks,
            'follow_batch': follow_batch,
            'exclude_keys': exclude_keys,
            'trim_nodes': trim_nodes,
            'trim_edges': trim_edges,
        }

        # Only forward `options` when one was supplied. A plain
        # `torch.utils.data.DataLoader` parent has no such parameter, so
        # passing `options=None` unconditionally would raise a TypeError
        # when this class is used on its own.
        if options is not None:
            kwargs['options'] = options
        collater = self._create_collater(**collater_args)
        super().__init__(dataset=range(input_nodes.size(0)),
                         collate_fn=collater,
                         **kwargs)

    def __collate__(self, index):
        """Sample a sub-graph for ``index`` and pad it to the fixed size."""
        out = self.nativeCollate(index)
        out = self.fixedSizeCollate(out)
        return out

    def _create_collater(self, **collater_args):
        """Create the fixed-size collater and return the collate callable."""
        self.fixed_size_collater = FixedSizeCollater(**collater_args)
        return self.__collate__

    def nativeCollate(self, index):
        """Call the wrapped ``NeighborLoader`` with ``index``."""
        out = self.neighbour_loader(index)
        return out

    def fixedSizeCollate(self, data_list: BaseData):
        """Pad a single sampled sub-graph to the fixed size.

        Despite its name, ``data_list`` is one data object (the output of
        :py:meth:`nativeCollate`); it is wrapped in a list before being
        handed to the fixed-size collater. Its ``batch_size`` and
        ``input_id`` entries are popped from it and re-attached to the
        padded result.
        """
        # Some keys are not handled correctly by FixedSizeCollater
        # so they need to be temporarily removed
        source = data_list[self.input_type] if self.input_type else data_list
        sample_batch_size = source.pop("batch_size")
        input_id = source.pop("input_id")

        out = self.fixed_size_collater([data_list])

        # Restore previously removed keys
        target = out[self.input_type] if self.input_type else out
        target.batch_size = sample_batch_size
        target.input_id = input_id
        return out


class FixedSizeNeighborLoader(PyGFixedSizeNeighborLoader, poptorch.DataLoader):
    r"""A data loader which merges data objects from a
    :py:class:`torch_geometric.loader.NeighborLoader` to a mini-batch and pads
    node and edge features so tensors across all batches have constant shapes.

    Args:
        data (Any): A :class:`~torch_geometric.data.Data`,
            :class:`~torch_geometric.data.HeteroData`, or
            (:class:`~torch_geometric.data.FeatureStore`,
            :class:`~torch_geometric.data.GraphStore`) data object.
        num_neighbors (List[int] or Dict[Tuple[str, str, str], List[int]]): The
            number of neighbours to sample for each node in each iteration.
            An entry of :obj:`-1` includes all neighbours.
            For heterogeneous graphs a dictionary may be given instead,
            holding the number of neighbours to sample per edge type.
        input_nodes (torch.Tensor or str or Tuple[str, torch.Tensor]): The
            indices of the nodes whose neighbourhoods are sampled to form
            mini-batches, given as a :obj:`torch.LongTensor` or a
            :obj:`torch.BoolTensor`.
            :obj:`None` selects all nodes.
            For heterogeneous graphs pass a tuple of the node type and the
            node indices. (default: :obj:`None`)
        input_time (torch.Tensor, optional): Values overriding the timestamps
            of the nodes given in :obj:`input_nodes`. When unset, the
            timestamps in :obj:`time_attr` are used (if present).
            :obj:`time_attr` must be set for this to work.
            (default: :obj:`None`)
        subgraph_type (SubgraphType or str, optional): The type of the returned
            subgraph.
            :obj:`"directional"`: only the sampled (directed) edges needed
            to compute representations for the sampled seed nodes are kept.
            :obj:`"bidirectional"`: sampled edges are converted to
            bidirectional edges.
            :obj:`"induced"`: the induced subgraph of all sampled nodes is
            returned.
            (default: :obj:`"directional"`)
        replace (bool, optional): If :obj:`True`, sample with replacement.
            (default: :obj:`False`)
        directed (bool, optional): Forwarded unchanged to the underlying
            :obj:`NeighborLoader`. (default: :obj:`True`)
        disjoint (bool, optional): If :obj:`True`, each seed node creates
            its own disjoint subgraph and mini-batch outputs carry a
            :obj:`batch` vector mapping nodes to their subgraph.
            This is automatically set to :obj:`True` for temporal sampling.
            (default: :obj:`False`)
        temporal_strategy (str, optional): The sampling strategy for temporal
            sampling (:obj:`"uniform"`, :obj:`"last"`).
            :obj:`"uniform"` samples uniformly across the neighbours that
            fulfill the temporal constraints.
            :obj:`"last"` samples the last `num_neighbors` that fulfill the
            temporal constraints.
            (default: :obj:`"uniform"`)
        time_attr (str, optional): The name of the attribute holding the
            timestamps of the nodes in the graph.
            If set, temporal sampling is used so that neighbours are
            guaranteed to fulfill temporal constraints; that is, neighbours
            have an earlier or equal timestamp than the centre node.
            (default: :obj:`None`)
        transform (callable, optional): A function/transform taking a
            sampled mini-batch and returning a transformed version.
            (default: :obj:`None`)
        transform_sampler_output (callable, optional): A function/transform
            taking a :class:`torch_geometric.sampler.SamplerOutput` and
            returning a transformed version. (default: :obj:`None`)
        is_sorted (bool, optional): If :obj:`True`, :obj:`edge_index` is
            assumed to be sorted by column.
            If :obj:`time_attr` is set, rows must additionally be sorted
            by time within individual neighbourhoods.
            This avoids internal re-sorting of the data and can improve
            runtime and memory efficiency. (default: :obj:`False`)
        filter_per_worker (bool, optional): Kept only for argument
            compatibility with :obj:`NeighborLoader`. The passed value is
            ignored, FixedSizeNeighborLoader acts like filter_per_worker=True
        batch_size (int, optional): The number of nodes per mini-batch to
            load. (default: :obj:`1`)
        neighbor_sampler (NeighborSampler, optional): Forwarded unchanged
            to the underlying :obj:`NeighborLoader`. (default: :obj:`None`)
        over_size_strategy (OverSizeStrategy, optional): The
            behaviour if a sample cannot fit in the fixed-size mini-batch.
            By default, nodes and edges are removed from the mini-batch
            until it reaches the specified fixed size.
            (default: `poptorch_geometric.OverSizeStrategy.TrimNodesAndEdges`)
        fixed_size_options (FixedSizeOptions, optional): A
            :py:class:`poptorch_geometric.fixed_size_options.FixedSizeOptions`
            object which holds the maximum number of nodes, edges and other
            options required to pad the mini-batches, produced by the data
            loader, to a fixed size. (default: :obj:`None`)
        add_pad_masks (bool, optional): If :obj:`True`, mask objects
            are attached to the mini-batch result. They represent three
            levels of padding:

            - :obj:`graphs_mask`: graph level mask
            - :obj:`nodes_mask`: node level mask
            - :obj:`edges_mask`: edge level mask

            Mask objects indicate which elements in the mini-batch are real
            (represented by :obj:`True`) and which were added as
            padding (represented by :obj:`False`).
            (default: :obj:`True`)
        follow_batch (list or tuple, optional): Passed to the fixed-size
            collater as its :obj:`follow_batch` argument.
            (default: :obj:`None`)
        exclude_keys (list or tuple, optional): The keys to exclude
            from the graphs in the output batch. (default: :obj:`None`)
        options (poptorch.Options, optional): The additional PopTorch options
            to be passed to :py:class:`poptorch.DataLoader`. When
            :obj:`None`, a default :py:class:`poptorch.Options` is created.
            (default: :obj:`None`)
        **kwargs (optional): Additional arguments of
            :class:`torch.utils.data.DataLoader`, such as :obj:`shuffle`,
            :obj:`drop_last` or :obj:`num_workers`.

    """

    def __init__(
            self,
            data: Union[Data, HeteroData, Tuple[FeatureStore, GraphStore]],
            num_neighbors: Union[List[int], Dict[EdgeType, List[int]]],
            input_nodes: InputNodes = None,
            input_time: OptTensor = None,
            subgraph_type: SubgraphType = SubgraphType.directional,
            replace: bool = False,
            directed: bool = True,
            disjoint: bool = False,
            temporal_strategy: str = 'uniform',
            time_attr: Optional[str] = None,
            transform: Optional[Callable] = None,
            transform_sampler_output: Optional[Callable] = None,
            is_sorted: bool = False,
            filter_per_worker: bool = True,  # Ignored
            batch_size: int = 1,
            neighbor_sampler: Optional[NeighborSampler] = None,
            over_size_strategy: OverSizeStrategy = OverSizeStrategy.
            TrimNodesAndEdges,
            fixed_size_options: FixedSizeOptions = None,
            add_pad_masks: Optional[bool] = True,
            follow_batch: Optional[Union[List[str], Tuple[str, ...]]] = None,
            exclude_keys: Optional[Union[List[str], Tuple[str, ...]]] = None,
            options: Optional[poptorch.Options] = None,
            **kwargs,
    ):
        # Must be assigned before the parent constructor runs: that
        # constructor calls `_create_collater`, which reads it.
        self.batch_size = batch_size

        # Fall back to the IPU default options when none were supplied.
        ipu_options = poptorch.Options() if options is None else options

        super().__init__(
            data,
            num_neighbors,
            input_nodes=input_nodes,
            input_time=input_time,
            replace=replace,
            directed=directed,
            disjoint=disjoint,
            temporal_strategy=temporal_strategy,
            time_attr=time_attr,
            transform=transform,
            transform_sampler_output=transform_sampler_output,
            is_sorted=is_sorted,
            # The caller's `filter_per_worker` is deliberately not used.
            filter_per_worker=True,
            subgraph_type=subgraph_type,
            batch_size=batch_size,
            neighbor_sampler=neighbor_sampler,
            over_size_strategy=over_size_strategy,
            fixed_size_options=fixed_size_options,
            add_pad_masks=add_pad_masks,
            follow_batch=follow_batch,
            exclude_keys=exclude_keys,
            options=ipu_options,
            **kwargs,
        )

    def _create_collater(self, **collater_args):
        """Wrap the parent's collater in a `CombinedBatchingCollater` that
        uses ``self.batch_size`` as its mini-batch size."""
        inner = super()._create_collater(**collater_args)
        return CombinedBatchingCollater(collater=inner,
                                        mini_batch_size=self.batch_size)


================================================
FILE: poptorch_geometric/python/ops/__init__.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from .aggregation_base import Aggregation
from .cluster_gcn_conv import ClusterGCNConv
from .hetero_linear import HeteroLinear
from .instance_norm import InstanceNorm
from .knn import knn
from .knn_graph import knn_graph
from .knn_interpolate import knn_interpolate
from .mf_conv import MFConv
from .radius import radius, radius_graph

# Public API of this package: every name imported above is re-exported.
__all__ = [
    'Aggregation',
    'ClusterGCNConv',
    'HeteroLinear',
    'InstanceNorm',
    'knn',
    'knn_graph',
    'knn_interpolate',
    'MFConv',
    'radius',
    'radius_graph',
]


================================================
FILE: poptorch_geometric/python/ops/aggregation_base.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
from typing import Optional
from torch import Tensor
import torch_geometric


class Aggregation(torch_geometric.nn.aggr.Aggregation):
    """Aggregation base class whose ``assert_sorted_index`` does nothing.

    NOTE(review): presumably this replaces a data-dependent validation in the
    upstream PyTorch Geometric class - confirm against the parent.
    """

    def assert_sorted_index(self, index: Optional[Tensor]):
        """Accept ``index`` and return ``None`` without inspecting it."""


================================================
FILE: poptorch_geometric/python/ops/cluster_gcn_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#
# This file includes content from PyTorch Geometric which
# has been modified by Graphcore Ltd.
#
# Copyright (c) 2023 PyG Team <team@pyg.org>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# Modified version of ClusterGCNConv that does not use dynamic shapes.
# Original pytorch_geometric version will be replaced by this code.
import torch
from torch import Tensor

from torch_geometric.nn.conv import MessagePassing
from torch_geometric.nn.dense.linear import Linear
from torch_geometric.typing import Adj, OptTensor, SparseTensor, torch_sparse
from torch_geometric.utils import (
    add_self_loops,
    degree,
    remove_self_loops,
    spmm,
)


# pylint: disable=abstract-method, arguments-differ, no-value-for-parameter
class ClusterGCNConv(MessagePassing):
    r"""ClusterGCN graph convolution from the `"Cluster-GCN: An Efficient
    Algorithm for Training Deep and Large Graph Convolutional Networks"
    <https://arxiv.org/abs/1905.07953>`_ paper.

    .. math::
        \mathbf{X}^{\prime} = \left( \mathbf{\hat{A}} + \lambda \cdot
        \textrm{diag}(\mathbf{\hat{A}}) \right) \mathbf{X} \mathbf{W}_1 +
        \mathbf{X} \mathbf{W}_2

    where :math:`\mathbf{\hat{A}} = {(\mathbf{D} + \mathbf{I})}^{-1}(\mathbf{A}
    + \mathbf{I})`.

    The diagonal enhancement is applied with a dense ``eq``/``mul``/``add``
    sequence over all edges rather than boolean-mask indexing.

    Args:
        in_channels (int): Size of each input sample, or :obj:`-1` to derive
            the size from the first input(s) to the forward method.
        out_channels (int): Size of each output sample.
        diag_lambda (float, optional): Diagonal enhancement value
            :math:`\lambda`. (default: :obj:`0.`)
        add_self_loops (bool, optional): If set to :obj:`False`, will not add
            self-loops to the input graph. (default: :obj:`True`)
        bias (bool, optional): If set to :obj:`False`, the layer will not learn
            an additive bias. (default: :obj:`True`)
        **kwargs (optional): Additional arguments of
            :class:`torch_geometric.nn.conv.MessagePassing`.

    Shapes:
        - **input:**
          node features :math:`(|\mathcal{V}|, F_{in})`,
          edge indices :math:`(2, |\mathcal{E}|)`
        - **output:** node features :math:`(|\mathcal{V}|, F_{out})`
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 diag_lambda: float = 0.,
                 add_self_loops: bool = True,
                 bias: bool = True,
                 **kwargs):
        # Sum aggregation unless the caller explicitly picks another one.
        kwargs.setdefault('aggr', 'add')
        super().__init__(**kwargs)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.diag_lambda = diag_lambda
        self.add_self_loops = add_self_loops

        # lin_out acts on the aggregated neighbourhood (W_1, carries the
        # optional bias); lin_root acts on the node's own features (W_2).
        shared = dict(weight_initializer='glorot')
        self.lin_out = Linear(in_channels, out_channels, bias=bias, **shared)
        self.lin_root = Linear(in_channels, out_channels, bias=False, **shared)

        self.reset_parameters()

    def reset_parameters(self):
        """Reset the parent's state and re-initialise both linear layers."""
        super().reset_parameters()
        self.lin_out.reset_parameters()
        self.lin_root.reset_parameters()

    def _diag_enhanced_weight(self, deg_inv: Tensor, row: Tensor,
                              col: Tensor) -> Tensor:
        """Return per-edge weights ``deg_inv[col]`` plus the diagonal boost.

        Edges with ``row == col`` additionally receive
        ``diag_lambda * deg_inv[row]``; all other edges receive ``+ 0``.
        """
        base_weight = deg_inv[col]
        on_diagonal = torch.eq(row, col)
        boost = torch.index_select(self.diag_lambda * deg_inv, 0, row)
        return torch.add(base_weight, torch.mul(on_diagonal.float(), boost))

    def forward(self, x: Tensor, edge_index: Adj) -> Tensor:
        """Apply the convolution to ``x`` over the graph ``edge_index``.

        Args:
            x (torch.Tensor): Node features.
            edge_index: Either an edge-index tensor or a ``SparseTensor``.
                Any other type is propagated as-is with no edge weights.

        Returns:
            torch.Tensor: ``lin_out(propagated) + lin_root(x)``.
        """
        edge_weight: OptTensor = None

        if isinstance(edge_index, Tensor):
            num_nodes = x.size(self.node_dim)
            if self.add_self_loops:
                # Drop existing self-loops first so each node ends up with
                # exactly one.
                edge_index, _ = remove_self_loops(edge_index)
                edge_index, _ = add_self_loops(edge_index, num_nodes=num_nodes)

            row, col = edge_index[0], edge_index[1]
            # Degrees are clamped to >= 1 before inverting.
            deg_inv = 1. / degree(col, num_nodes=num_nodes).clamp_(1.)
            edge_weight = self._diag_enhanced_weight(deg_inv, row, col)

        elif isinstance(edge_index, SparseTensor):
            if self.add_self_loops:
                edge_index = torch_sparse.set_diag(edge_index)

            col, row, _ = edge_index.coo()  # Transposed.
            deg_inv = 1. / torch_sparse.sum(edge_index, dim=1).clamp_(1.)
            edge_weight = self._diag_enhanced_weight(deg_inv, row, col)
            # Store the weights as the sparse values of the adjacency.
            edge_index = edge_index.set_value(edge_weight, layout='coo')

        # propagate_type: (x: Tensor, edge_weight: OptTensor)
        aggregated = self.propagate(edge_index,
                                    x=x,
                                    edge_weight=edge_weight,
                                    size=None)
        return self.lin_out(aggregated) + self.lin_root(x)

    def message(self, x_j: Tensor, edge_weight: Tensor) -> Tensor:
        """Scale each source-node feature row by its edge weight."""
        return edge_weight.view(-1, 1) * x_j

    def message_and_aggregate(self, adj_t: SparseTensor, x: Tensor) -> Tensor:
        """Fused message + aggregation as a sparse-dense matmul."""
        return spmm(adj_t, x, reduce=self.aggr)

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return (f'{cls_name}({self.in_channels}, '
                f'{self.out_channels}, diag_lambda={self.diag_lambda})')


================================================
FILE: poptorch_geometric/python/ops/hetero_linear.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#
# This file includes content from PyTorch Geometric which
# has been modified by Graphcore Ltd.
#
# Copyright (c) 2023 PyG Team <team@pyg.org>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import torch
import torch_geometric
from torch import Tensor


class HeteroLinear(torch_geometric.nn.dense.linear.HeteroLinear):
    r"""Applies separate linear transformations to the incoming data according
    to types

    .. math::
        \mathbf{x}^{\prime}_{\kappa} = \mathbf{x}_{\kappa}
        \mathbf{W}^{\top}_{\kappa} + \mathbf{b}_{\kappa}

    for type :math:`\kappa`.
    It supports lazy initialization and customizable weight and bias
    initialization.

    The forward pass below evaluates every type on the full input and selects
    the matching rows with :func:`torch.where`. It never sorts or
    boolean-indexes the input and does not consult :obj:`is_sorted`.

    Args:
        in_channels (int): Size of each input sample. Will be initialized
            lazily in case it is given as :obj:`-1`.
        out_channels (int): Size of each output sample.
        num_types (int): The number of types.
        is_sorted (bool, optional): If set to :obj:`True`, assumes that
            :obj:`type_vec` is sorted. Accepted by the parent constructor;
            the forward pass defined here does not read it.
            (default: :obj:`False`)
        **kwargs (optional): Additional arguments of
            :class:`torch_geometric.nn.Linear`.

    Shapes:
        - **input:**
          features :math:`(*, F_{in})`,
          type vector :math:`(*)`
        - **output:** features :math:`(*, F_{out})`
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, x: Tensor, type_vec: Tensor) -> Tensor:
        r"""
        Args:
            x (torch.Tensor): The input features.
            type_vec (torch.Tensor): A vector that maps each entry to a type.

        Returns:
            torch.Tensor: One output row of size :obj:`out_channels` per row
            of :obj:`x`. Rows whose type matches no index in
            ``range(num_types)`` keep the zero initial value (before bias).
        """
        # Zero-initialise rather than `new_empty`: rows whose type id matches
        # none of the loop iterations are never overwritten and would
        # otherwise expose uninitialised memory.
        out = x.new_zeros(x.size(0), self.out_channels)
        for i in range(self.num_types):
            # Column mask selecting the rows that belong to type i.
            mask = torch.eq(type_vec, i).view(-1, 1)
            # Zero the rows of other types, transform everything with this
            # type's weight, then keep only the rows owned by type i.
            x_type_i = torch.where(mask, x, 0.0)
            out_type_i = torch.nn.functional.linear(x_type_i, self.weight[i].T)
            out = torch.where(mask, out_type_i, out)

        if self.bias is not None:
            # Per-row bias gathered by type id.
            out += self.bias[type_vec]
        return out


================================================
FILE: poptorch_geometric/python/ops/instance_norm.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#
# This file includes content from PyTorch Geometric which
# has been modified by Graphcore Ltd.
#
# Copyright (c) 2023 PyG Team <team@pyg.org>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

from typing import Optional

import torch.nn.functional as F
from torch import Tensor
from torch.nn.modules.instancenorm import _InstanceNorm

from torch_geometric.typing import OptTensor
from torch_geometric.utils import degree, scatter


# pylint: disable=abstract-method, arguments-differ, useless-super-delegation
class InstanceNorm(_InstanceNorm):
    r"""Instance normalization over each individual example in a batch of
    node features, as described in the `"Instance Normalization: The Missing
    Ingredient for Fast Stylization" <https://arxiv.org/abs/1607.08022>`_
    paper

    .. math::
        \mathbf{x}^{\prime}_i = \frac{\mathbf{x} -
        \textrm{E}[\mathbf{x}]}{\sqrt{\textrm{Var}[\mathbf{x}] + \epsilon}}
        \odot \gamma + \beta

    The mean and standard-deviation are calculated per-dimension separately for
    each object in a mini-batch.

    Args:
        in_channels (int): Size of each input sample.
        eps (float, optional): A value added to the denominator for numerical
            stability. (default: :obj:`1e-5`)
        momentum (float, optional): The value used for the running mean and
            running variance computation. (default: :obj:`0.1`)
        affine (bool, optional): If set to :obj:`True`, this module has
            learnable affine parameters :math:`\gamma` and :math:`\beta`.
            (default: :obj:`False`)
        track_running_stats (bool, optional): If set to :obj:`True`, this
            module tracks the running mean and variance, and when set to
            :obj:`False`, this module does not track such statistics and always
            uses instance statistics in both training and eval modes.
            (default: :obj:`False`)
    """

    def __init__(
            self,
            in_channels: int,
            eps: float = 1e-5,
            momentum: float = 0.1,
            affine: bool = False,
            track_running_stats: bool = False,
    ):
        super().__init__(in_channels, eps, momentum, affine,
                         track_running_stats)

    def reset_parameters(self):
        r"""Resets all learnable parameters of the module."""
        super().reset_parameters()

    def forward(self,
                x: Tensor,
                batch: OptTensor = None,
                batch_size: Optional[int] = None) -> Tensor:
        r"""
        Args:
            x (torch.Tensor): The source tensor.
            batch (torch.Tensor, optional): The batch vector
                :math:`\mathbf{b} \in {\{ 0, \ldots, B-1\}}^N`, which assigns
                each element to a specific example. (default: :obj:`None`)
            batch_size (int, optional): The number of examples :math:`B`.
                Automatically calculated if not given. (default: :obj:`None`)
        """
        # Statistics come from the input itself while training, or whenever
        # no running statistics are tracked.
        use_input_stats = self.training or not self.track_running_stats

        if batch is None:
            # Single example: delegate to the functional op on a
            # (1, channels, nodes) view and undo the reshaping afterwards.
            normalized = F.instance_norm(x.t().unsqueeze(0),
                                         self.running_mean, self.running_var,
                                         self.weight, self.bias,
                                         use_input_stats, self.momentum,
                                         self.eps)
            return normalized.squeeze(0).t()

        if batch_size is None:
            batch_size = int(batch.max()) + 1

        mean = var = unbiased_var = x  # Dummies.

        if use_input_stats:
            # Per-example node counts, clamped to at least one.
            count = degree(batch, batch_size, dtype=x.dtype).clamp_(min=1)
            count = count.view(-1, 1)
            unbiased_count = (count - 1).clamp_(min=1)

            total = scatter(x, batch, dim=0, dim_size=batch_size, reduce='sum')
            mean = total / count

            # Centre every node on its own example's mean.
            x = x - mean.index_select(0, batch)

            squared_sum = scatter(x * x,
                                  batch,
                                  dim=0,
                                  dim_size=batch_size,
                                  reduce='sum')
            unbiased_var = squared_sum / unbiased_count
            var = squared_sum / count

            # Moving-average update of the tracked buffers, written in place.
            momentum = self.momentum
            keep = 1 - momentum
            if self.running_mean is not None:
                updated_mean = keep * self.running_mean + momentum * mean.mean(
                    0)
                self.running_mean.copy_(updated_mean)
            if self.running_var is not None:
                updated_var = (keep * self.running_var +
                               momentum * unbiased_var.mean(0))
                self.running_var.copy_(updated_var)
        else:
            # Broadcast the tracked statistics to one row per example so they
            # can be gathered with `batch` like the per-example ones.
            if self.running_mean is not None:
                mean = self.running_mean.view(1, -1).expand(batch_size, -1)
            if self.running_var is not None:
                var = self.running_var.view(1, -1).expand(batch_size, -1)

            x = x - mean.index_select(0, batch)

        std = (var + self.eps).sqrt()
        out = x / std.index_select(0, batch)

        # The affine transform is applied only when both parameters exist.
        if self.weight is not None and self.bias is not None:
            out = out * self.weight.view(1, -1) + self.bias.view(1, -1)

        return out

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f'{cls_name}({self.num_features})'


================================================
FILE: poptorch_geometric/python/ops/knn.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#
# This file includes content from PyTorch Cluster which
# has been modified by Graphcore Ltd.
#
# Copyright (c) 2020 Matthias Fey <matthias.fey@tu-dortmund.de>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

from typing import Optional

import torch


def knn(x: torch.Tensor,
        y: torch.Tensor,
        k: int,
        batch_x: Optional[torch.Tensor] = None,
        batch_y: Optional[torch.Tensor] = None,
        *args,
        **kwargs):
    # pylint: disable=unused-argument, keyword-arg-before-vararg
    r"""Finds for each element in `y` the `k` nearest points in `x`.

    The result always has exactly ``y.size(0) * k`` columns. Pairs whose
    neighbor lies in a different example than the query are not dropped;
    both of their entries are set to ``-1`` instead.

    Args:
        x (torch.Tensor): Node feature matrix
        y (torch.Tensor): Node feature matrix
        k (int): The number of neighbors.
        batch_x (torch.Tensor, optional): Batch vector which assigns each
            node to a specific example. (default: :obj:`None`)
        batch_y (torch.Tensor, optional): Batch vector which assigns each
            node to a specific example. (default: :obj:`None`)
        *args, **kwargs: Accepted for call compatibility and ignored.

    Returns:
        A tensor of shape ``(2, y.size(0) * k)``. Row 0 holds indices into
        `y`, row 1 the matching neighbor indices into `x`, ordered by
        increasing distance within each query.

    :rtype: :class:`LongTensor`

    .. testsetup::

        import torch
        from poptorch_geometric.ops.knn import knn

    .. testcode::

        >>> x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
        >>> batch_x = torch.tensor([0, 0, 0, 0])
        >>> y = torch.Tensor([[-1, 0], [1, 0]])
        >>> batch_y = torch.tensor([0, 0])
        >>> assign_index = knn(x, y, 2, batch_x, batch_y)
    """

    if batch_x is None:
        batch_x = x.new_zeros(x.size(0), dtype=torch.int32)

    if batch_y is None:
        batch_y = y.new_zeros(y.size(0), dtype=torch.int32)

    x = x.view(-1, 1) if x.dim() == 1 else x
    y = y.view(-1, 1) if y.dim() == 1 else y

    assert x.dim() == 2 and batch_x.dim() == 1
    assert y.dim() == 2 and batch_y.dim() == 1
    assert x.size(1) == y.size(1)
    assert x.size(0) == batch_x.size(0)
    assert y.size(0) == batch_y.size(0)

    # Feature count before the batch column is appended; it sets both the
    # batch offset and the masking threshold below.
    num_features = x.size(1)

    # Rescale x and y.
    min_xy = torch.min(torch.min(x), torch.min(y))
    x, y = x - min_xy, y - min_xy

    max_xy = torch.max(torch.max(x), torch.max(y))
    # If every coordinate is identical the range is zero; dividing by it
    # would turn all features (and hence all distances) into NaN.
    max_xy = torch.where(max_xy > 0, max_xy, torch.ones_like(max_xy))
    x, y, = x / max_xy, y / max_xy

    # Concat batch/features to ensure no cross-links between examples exist.
    x = torch.cat([
        x, 2 * num_features * batch_x.view(
            -1, 1).to(torch.int32 if x.dtype == torch.long else x.dtype)
    ],
                  dim=-1)
    y = torch.cat([
        y, 2 * num_features * batch_y.view(
            -1, 1).to(torch.int32 if y.dtype == torch.long else y.dtype)
    ],
                  dim=-1)

    # Pairwise distances via broadcasting: (M, N, D + 1) -> (M, N).
    x_expanded = x.expand(y.size(0), *x.shape)
    y_expanded = y.reshape(y.size(0), 1, y.size(1))

    dist, col = torch.topk(torch.norm(x_expanded - y_expanded, dim=-1),
                           k=k,
                           dim=-1,
                           largest=False,
                           sorted=True)
    row = torch.arange(col.size(0), dtype=torch.long).view(-1, 1).repeat(1, k)

    # Features are scaled into [0, 1], so points of the same example are at
    # most sqrt(D) <= D apart, while points of different examples are at
    # least 2 * D apart. A threshold strictly between the two separates them
    # for every D >= 1 (the previous bound D + 1 equals 2 * D when D == 1).
    distance_upper_bound = 1.5 * num_features

    row = torch.where(dist > distance_upper_bound, -1, row).view(-1)
    col = torch.where(dist > distance_upper_bound, -1, col).view(-1)

    return torch.stack([row, col], dim=0)


================================================
FILE: poptorch_geometric/python/ops/knn_graph.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

# This file includes content from PyTorch Cluster which
# has been modified by Graphcore Ltd.
#
# Copyright (c) 2020 Matthias Fey <matthias.fey@tu-dortmund.de>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import torch

from torch_geometric.typing import OptTensor

from poptorch_geometric.ops.knn import knn


def knn_graph(x: torch.Tensor,
              k: int,
              batch: OptTensor = None,
              loop: bool = False,
              flow: str = 'source_to_target',
              cosine: bool = False,
              num_workers: int = 1) -> torch.Tensor:
    r"""Computes graph edges to the nearest :obj:`k` points.

    .. code-block:: python

        import torch
        from poptorch_geometric.ops import knn_graph

        x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
        batch = torch.tensor([0, 0, 0, 0])
        edge_index = knn_graph(x, k=2, batch=batch, loop=False)

    Args:
        x (torch.Tensor): Node feature matrix
            :math:`\mathbf{X} \in \mathbb{R}^{N \times F}`.
        k (int): The number of neighbors.
        batch (torch.Tensor, optional): Batch vector
            :math:`\mathbf{b} \in {\{ 0, \ldots, B-1\}}^N`, which assigns each
            node to a specific example. (default: :obj:`None`)
        loop (bool, optional): If :obj:`True`, the graph will contain
            self-loops. (default: :obj:`False`)
        flow (str, optional): The flow direction when using in combination with
            message passing (:obj:`"source_to_target"` or
            :obj:`"target_to_source"`). (default: :obj:`"source_to_target"`)
        cosine (bool, optional): Forwarded to :obj:`knn` as an extra
            positional argument. (default: :obj:`False`)
        num_workers (int, optional): Forwarded to :obj:`knn` as an extra
            positional argument. (default: :obj:`1`)

    :rtype: :class:`torch.Tensor`
    """

    assert flow in ['source_to_target', 'target_to_source']

    # Without self-loops one extra neighbour is requested, because every
    # node is queried against the very set it belongs to.
    num_neighbors = k if loop else k + 1
    assignment = knn(x, x, num_neighbors, batch, batch, cosine, num_workers)
    query_idx, neighbor_idx = assignment[0], assignment[1]

    # Orient each edge according to the requested message-passing direction.
    if flow == 'source_to_target':
        row, col = neighbor_idx, query_idx
    else:
        row, col = query_idx, neighbor_idx

    if loop:
        return torch.stack([row, col], dim=0)

    # Keep only the pairs whose two endpoints differ.
    keep = row != col
    return torch.stack([row[keep], col[keep]], dim=0)


================================================
FILE: poptorch_geometric/python/ops/knn_interpolate.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#
# This file includes content from PyTorch Geometric which
# has been modified by Graphcore Ltd.
#
# Copyright (c) 2023 PyG Team <team@pyg.org>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import torch

from torch_geometric.typing import OptTensor
from torch_geometric.utils import scatter

from poptorch_geometric.ops.knn import knn


def knn_interpolate(x: torch.Tensor,
                    pos_x: torch.Tensor,
                    pos_y: torch.Tensor,
                    batch_x: OptTensor = None,
                    batch_y: OptTensor = None,
                    k: int = 3,
                    num_workers: int = 1,
                    *args,
                    **kwargs):
    # pylint: disable=unused-argument, keyword-arg-before-vararg
    r"""The k-NN interpolation from the `"PointNet++: Deep Hierarchical
    Feature Learning on Point Sets in a Metric Space"
    <https://arxiv.org/abs/1706.02413>`_ paper.
    For each point :math:`y` with position :math:`\mathbf{p}(y)`, its
    interpolated features :math:`\mathbf{f}(y)` are given by

    .. math::
        \mathbf{f}(y) = \frac{\sum_{i=1}^k w(x_i) \mathbf{f}(x_i)}{\sum_{i=1}^k
        w(x_i)} \textrm{, where } w(x_i) = \frac{1}{d(\mathbf{p}(y),
        \mathbf{p}(x_i))^2}

    and :math:`\{ x_1, \ldots, x_k \}` denoting the :math:`k` nearest points
    to :math:`y`.

    The neighbor search and the weights are computed without gradient
    tracking; the gather of :obj:`x` and the weighted sums are tracked.
    Neighbor slots that :obj:`knn` marks with ``-1`` get weight zero.

    Args:
        x (torch.Tensor): Node feature matrix
            :math:`\mathbf{X} \in \mathbb{R}^{N \times F}`.
        pos_x (torch.Tensor): Node position matrix
            :math:`\in \mathbb{R}^{N \times d}`.
        pos_y (torch.Tensor): Upsampled node position matrix
            :math:`\in \mathbb{R}^{M \times d}`.
        batch_x (torch.Tensor, optional): Batch vector
            :math:`\mathbf{b_x} \in {\{ 0, \ldots, B-1\}}^N`, which assigns
            each node from :math:`\mathbf{X}` to a specific example.
            (default: :obj:`None`)
        batch_y (torch.Tensor, optional): Batch vector
            :math:`\mathbf{b_y} \in {\{ 0, \ldots, B-1\}}^N`, which assigns
            each node from :math:`\mathbf{Y}` to a specific example.
            (default: :obj:`None`)
        k (int, optional): Number of neighbors. (default: :obj:`3`)
        num_workers (int, optional): Forwarded to :obj:`knn` as a keyword
            argument. (default: :obj:`1`)
        *args, **kwargs: Accepted for call compatibility and ignored.

    Returns:
        torch.Tensor: Interpolated features with one row per row of
        :obj:`pos_y`.
    """

    with torch.no_grad():
        assign_index = knn(pos_x,
                           pos_y,
                           k,
                           batch_x=batch_x,
                           batch_y=batch_y,
                           num_workers=num_workers)
        y_idx, x_idx = assign_index[0], assign_index[1]

        # Redirect the -1 padding slots to an extra all-zero row appended
        # to each table, so every gather below stays in range.
        extended_y_idx = torch.where(y_idx == -1, pos_y.size(0), y_idx)
        extended_x_idx = torch.where(x_idx == -1, pos_x.size(0), x_idx)
        posx_zeros = torch.zeros_like(pos_x[:1])
        extended_diff_x = torch.cat((pos_x, posx_zeros))
        posy_zeros = torch.zeros_like(pos_y[:1])
        extended_diff_y = torch.cat((pos_y, posy_zeros))

        diff = torch.index_select(extended_diff_x, 0,
                                  extended_x_idx) - torch.index_select(
                                      extended_diff_y, 0, extended_y_idx)
        squared_distance = (diff * diff).sum(dim=-1, keepdim=True)
        weights = 1.0 / torch.clamp(squared_distance, min=1e-16)

        # Padding slots compare two zero rows, so their distance is 0 and
        # their weight would be 1 / 1e-16. That would swamp the denominator
        # and pull the interpolated value to ~0; give them no weight instead.
        is_real_neighbor = (x_idx != -1).view(-1, 1)
        weights = torch.where(is_real_neighbor, weights,
                              torch.zeros_like(weights))

    # Built outside the no_grad block so gradients can flow back into `x`.
    x_zeros = torch.zeros_like(x[:1])
    extended_x = torch.cat((x, x_zeros))

    # knn emits exactly k slots per query, in query order.
    scatter_y_idx = torch.arange(pos_y.size(0),
                                 dtype=torch.long).view(-1,
                                                        1).repeat(1,
                                                                  k).view(-1)
    y = scatter(torch.index_select(extended_x, 0, extended_x_idx) * weights,
                scatter_y_idx,
                0,
                pos_y.size(0),
                reduce='sum')
    return y / scatter(weights, scatter_y_idx, 0, pos_y.size(0), reduce='sum')


================================================
FILE: poptorch_geometric/python/ops/mf_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#
# This file includes content from PyTorch Geometric which
# has been modified by Graphcore Ltd.
#
# Copyright (c) 2023 PyG Team <team@pyg.org>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

from typing import Union

import torch
import torch_geometric
from torch_geometric.typing import Adj, OptPairTensor, Size, SparseTensor
from torch_geometric.utils import degree
from torch import Tensor


# pylint: disable=abstract-method
class MFConv(torch_geometric.nn.conv.MFConv):
    r"""Molecular-fingerprint graph convolution, as introduced in
    `"Convolutional Networks on Graphs for Learning Molecular Fingerprints"
    <https://arxiv.org/abs/1509.09292>`_

    .. math::
        \mathbf{x}^{\prime}_i = \mathbf{W}^{(\deg(i))}_1 \mathbf{x}_i +
        \mathbf{W}^{(\deg(i))}_2 \sum_{j \in \mathcal{N}(i)} \mathbf{x}_j

    Each vertex degree owns a distinct pair of weight matrices.

    The per-degree contributions are combined with boolean masks and
    :func:`torch.where`, so every intermediate tensor keeps the shape of the
    full node set regardless of how the degrees are distributed.

    Args:
        in_channels (int or tuple): Size of each input sample, or :obj:`-1` to
            derive the size from the first input(s) to the forward method.
            A tuple gives the source and target dimensionalities.
        out_channels (int): Size of each output sample.
        max_degree (int, optional): Largest node degree that gets its own
            weights; higher degrees are clamped to it.
            (default: :obj:`10`)
        bias (bool, optional): If set to :obj:`False`, the layer will not learn
            an additive bias. (default: :obj:`True`)
        **kwargs (optional): Additional arguments of
            :class:`torch_geometric.nn.conv.MessagePassing`.

    Shapes:
        - **inputs:**
          node features :math:`(|\mathcal{V}|, F_{in})` or
          :math:`((|\mathcal{V_s}|, F_{s}), (|\mathcal{V_t}|, F_{t}))`
          if bipartite,
          edge indices :math:`(2, |\mathcal{E}|)`
        - **outputs:** node features :math:`(|\mathcal{V}|, F_{out})` or
          :math:`(|\mathcal{V_t}|, F_{out})` if bipartite
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self,
                x: Union[Tensor, OptPairTensor],
                edge_index: Adj,
                size: Size = None) -> Tensor:
        """Applies the convolution to `x` over the edges in `edge_index`.

        A single tensor `x` is used for both the source and the target nodes.
        """
        if isinstance(x, Tensor):
            x: OptPairTensor = (x, x)
        x_src, x_r = x[0], x[1]

        # Placeholder; replaced below for the supported `edge_index` types.
        deg = x_src
        if isinstance(edge_index, SparseTensor):
            deg = edge_index.storage.rowcount()
        elif isinstance(edge_index, Tensor):
            # Number of target nodes: the target features take precedence,
            # then the explicit `size`, then the source features.
            if x_r is not None:
                num_nodes = x_r.size(self.node_dim)
            elif size is not None:
                num_nodes = size[1]
            else:
                num_nodes = x_src.size(self.node_dim)
            target_row = 1 if self.flow == 'source_to_target' else 0
            deg = degree(edge_index[target_row], num_nodes, dtype=torch.long)
        deg.clamp_(max=self.max_degree)

        # propagate_type: (x: OptPairTensor)
        h = self.propagate(edge_index, x=x, size=size)

        out = h.new_empty(list(h.shape[:-1]) + [self.out_channels])
        for cur_deg, (lin_l, lin_r) in enumerate(
                zip(self.lins_l, self.lins_r)):
            # Column mask selecting the nodes whose clamped degree is cur_deg.
            selected = (deg == cur_deg).view(-1, 1)

            # Rows of other degrees are zeroed before the linear layer and
            # their results are discarded by the final `where`.
            contribution = lin_l(torch.where(selected, h, 0.0))
            if x_r is not None:
                contribution = contribution + lin_r(
                    torch.where(selected, x_r, 0.0))

            out = torch.where(selected, contribution, out)

        return out


================================================
FILE: poptorch_geometric/python/ops/radius.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#
# This file includes content from PyTorch Cluster which
# has been modified by Graphcore Ltd.
#
# Copyright (c) 2020 Matthias Fey <matthias.fey@tu-dortmund.de>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import torch

from torch import Tensor
from torch_geometric.typing import OptTensor


def radius(
        x: Tensor,
        y: Tensor,
        r: float,
        batch_x: OptTensor = None,
        batch_y: OptTensor = None,
        max_num_neighbors: int = 32,
        *args,
        **kwargs,
) -> Tensor:
    # pylint: disable=unused-argument, keyword-arg-before-vararg
    r"""Finds, for each element in :obj:`y`, the points in :obj:`x` that lie
    within distance :obj:`r`, returning a result of static shape.

    .. code-block:: python

        import torch

        x = torch.tensor([[-1., -1.], [-1., 1.], [1., -1.], [1., 1.]])
        batch_x = torch.tensor([0, 0, 0, 0])
        y = torch.tensor([[-1., 0.], [1., 0.]])
        batch_y = torch.tensor([0, 0])
        assign_index = radius(x, y, 1.5, batch_x, batch_y)

    Args:
        x (torch.Tensor): Node feature matrix
            :math:`\mathbf{X} \in \mathbb{R}^{N \times F}`.
        y (torch.Tensor): Node feature matrix
            :math:`\mathbf{Y} \in \mathbb{R}^{M \times F}`.
        r (float): The radius.
        batch_x (torch.Tensor, optional): Batch vector
            :math:`\mathbf{b} \in {\{ 0, \ldots, B-1\}}^N`, which assigns each
            node of :obj:`x` to a specific example. All nodes belong to
            example 0 if omitted. (default: :obj:`None`)
        batch_y (torch.Tensor, optional): Batch vector
            :math:`\mathbf{b} \in {\{ 0, \ldots, B-1\}}^M`, which assigns each
            node of :obj:`y` to a specific example. All nodes belong to
            example 0 if omitted. (default: :obj:`None`)
        max_num_neighbors (int, optional): The maximum number of neighbors to
            return for each element in :obj:`y`. When more points are in
            range, the closest ones are kept. (default: :obj:`32`)
        *args, **kwargs: Accepted for call compatibility and ignored.

    :rtype: :class:`torch.Tensor` of shape :math:`(2, M \cdot k)` with
            :math:`k = \min(N, \text{max\_num\_neighbors})`. The first row
            indexes :obj:`y`, the second row indexes :obj:`x`; slots for which
            no neighbour was found hold -1 in both rows.
    """
    if batch_x is None:
        batch_x = x.new_zeros(x.size(0), dtype=torch.long)

    if batch_y is None:
        batch_y = y.new_zeros(y.size(0), dtype=torch.long)

    if not torch.is_floating_point(x):
        x = x.float()

    if not torch.is_floating_point(y):
        y = y.float()

    x = x.view(-1, 1) if x.dim() == 1 else x
    y = y.view(-1, 1) if y.dim() == 1 else y

    assert x.dim() == 2 and batch_x.dim() == 1
    assert y.dim() == 2 and batch_y.dim() == 1
    assert x.size(1) == y.size(1)
    assert x.size(0) == batch_x.size(0)
    assert y.size(0) == batch_y.size(0)

    # Append the batch index, scaled by 2 * r, as an extra coordinate: points
    # of different examples are then at least 2 * r apart and can never be
    # reported as neighbours.
    x = torch.cat([x, 2 * r * batch_x.view(-1, 1).to(x.dtype)], dim=-1)
    y = torch.cat([y, 2 * r * batch_y.view(-1, 1).to(y.dtype)], dim=-1)
    distance_upper_bound = r + 1e-8

    dist = torch.cdist(y, x)
    k = min(dist.size(-1), max_num_neighbors)
    # The k closest candidates per query point, nearest first.
    dist, col = torch.topk(dist, k=k, dim=-1, largest=False, sorted=True)

    # Query index of every candidate slot: [0]*k + [1]*k + ... Built with a
    # single arange so that it also works when `y` has no rows.
    row = torch.arange(col.size(0)).view(-1, 1).repeat(1, k).view(-1)

    # Candidates outside the radius become (-1, -1) padding.
    col = torch.where(dist < distance_upper_bound, col, -1)
    col = torch.flatten(col)
    row = torch.where(col == -1, -1, row)

    return torch.stack([row, col], dim=0)


def radius_graph(
        x: torch.Tensor,
        r: float,
        batch: OptTensor = None,
        loop: bool = False,
        max_num_neighbors: int = 32,
        flow: str = 'source_to_target',
        *args,
        **kwargs,
) -> torch.Tensor:
    # pylint: disable=unused-argument, keyword-arg-before-vararg
    r"""Builds the edges connecting every point of :obj:`x` to the points of
    :obj:`x` that lie within distance :obj:`r` of it.

    Args:
        x (Tensor): Node feature matrix
            :math:`\mathbf{X} \in \mathbb{R}^{N \times F}`.
        r (float): The radius.
        batch (LongTensor, optional): Batch vector
            :math:`\mathbf{b} \in {\{ 0, \ldots, B-1\}}^N`, which assigns each
            node to a specific example. (default: :obj:`None`)
        loop (bool, optional): If :obj:`True`, the graph will contain
            self-loops. (default: :obj:`False`)
        max_num_neighbors (int, optional): The maximum number of neighbors to
            return for each element. One extra candidate is requested from
            :func:`radius` when :obj:`loop` is :obj:`False`, since each point
            is a candidate neighbour of itself. (default: :obj:`32`)
        flow (string, optional): The flow direction when used in combination
            with message passing (:obj:`"source_to_target"` or
            :obj:`"target_to_source"`). (default: :obj:`"source_to_target"`)
        *args, **kwargs: Forwarded unchanged to :func:`radius`.

    :rtype: :class:`LongTensor`. With :obj:`loop=False` every column whose
            two entries are equal is dropped; this removes self-loops as well
            as the (-1, -1) padding columns produced by :func:`radius`. With
            :obj:`loop=True` the padding columns are kept.

    .. code-block:: python

        import torch

        x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
        batch = torch.tensor([0, 0, 0, 0])
        edge_index = radius_graph(x, r=1.5, batch=batch, loop=False)
    """

    assert flow in ['source_to_target', 'target_to_source']

    neighbor_budget = max_num_neighbors if loop else max_num_neighbors + 1
    edge_index = radius(x, x, r, batch, batch, neighbor_budget, *args,
                        **kwargs)

    # `radius` returns (query, neighbour) pairs; swap them for the default
    # flow direction.
    first, second = edge_index[0], edge_index[1]
    if flow == 'source_to_target':
        row, col = second, first
    else:
        row, col = first, second

    if loop:
        return torch.stack([row, col], dim=0)

    keep = row != col
    return torch.stack([row[keep], col[keep]], dim=0)


================================================
FILE: poptorch_geometric/python/override.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import functools
import importlib

import torch_geometric
import poptorch
from poptorch_geometric import ops

from poptorch_geometric.common import call_once


class _TorchGeometricOpsSubstitutionManager:
    """Temporarily swaps selected PyTorch Geometric operations for the
    implementations provided by `poptorch_geometric.ops`.

    The manager can be driven explicitly through `replace()` / `restore()`
    or used as a context manager. While active it also switches on the
    `disable_dynamic_shapes` experimental mode of PyTorch Geometric.

    One `restore()` call undoes every patch applied by this instance, however
    many times `replace()` was called beforehand.
    """

    # Maps a module or class to {attribute name: replacement callable}.
    # NOTE: the attribute name is misspelled, but it is looked up under this
    # name elsewhere in this module, so it has to stay as it is.
    subsitutions = {
        torch_geometric.nn: {
            "knn_interpolate": ops.knn_interpolate
        },
        torch_geometric.nn.aggr.base.Aggregation: {
            "assert_sorted_index": ops.Aggregation.assert_sorted_index
        },
        torch_geometric.nn.ClusterGCNConv: {
            "forward": ops.ClusterGCNConv.forward
        },
        torch_geometric.nn.conv.edge_conv: {  # pylint: disable=no-member
            "knn": ops.knn
        },
        torch_geometric.nn.conv.gravnet_conv: {  # pylint: disable=no-member
            "knn": ops.knn
        },
        torch_geometric.nn.conv.x_conv: {  # pylint: disable=no-member
            "knn_graph": ops.knn_graph
        },
        torch_geometric.nn.dense.HeteroLinear: {
            "forward": ops.HeteroLinear.forward
        },
        torch_geometric.nn.InstanceNorm: {
            "forward": ops.InstanceNorm.forward
        },
        torch_geometric.nn.conv.MFConv: {
            "forward": ops.MFConv.forward
        },
        torch_geometric.nn.unpool: {
            "knn_interpolate": ops.knn_interpolate
        },
        torch_geometric.nn.pool: {
            "knn": ops.knn,
            "knn_graph": ops.knn_graph,
            "radius": ops.radius,
            "radius_graph": ops.radius_graph,
        }
    }

    def __init__(self):
        # Originals of the attributes currently patched by this instance:
        # {module or class: {attribute name: original object}}.
        self.overrides = {}

    def __enter__(self):
        self.replace()
        return self

    def __exit__(self, exc_type, value, traceback):
        self.restore()

    def replace(self):
        """Installs every substitution and remembers the originals.

        Attributes already patched by this instance are left alone, so that
        the saved originals are never overwritten by a wrapper. If anything
        fails part-way, the patches applied so far are rolled back before
        the exception is re-raised.
        """
        torch_geometric.experimental.set_experimental_mode(
            True, 'disable_dynamic_shapes')

        def create_wrapper(f, replacement_f):
            # Keeps the metadata (name, docstring, ...) of the original
            # while delegating the call to the replacement.
            @functools.wraps(f)
            def _wrapper(*args, **kwargs):
                return replacement_f(*args, **kwargs)

            return _wrapper

        try:
            for mod, replacement_map in self.subsitutions.items():
                saved = self.overrides.setdefault(mod, {})
                for op_name, replacement in replacement_map.items():
                    if op_name in saved:
                        # Already patched: `getattr` would return the
                        # wrapper and the real original would be lost.
                        continue
                    func = getattr(mod, op_name)
                    saved[op_name] = func
                    setattr(mod, op_name, create_wrapper(func, replacement))
        except BaseException:
            # `__exit__` is not invoked when `__enter__` raises, so undo the
            # partial patching here.
            self.restore()
            raise

    def restore(self):
        """Puts back every original saved by `replace()` and switches the
        `disable_dynamic_shapes` experimental mode off."""
        for mod, replacement_map in self.overrides.items():
            for op_name, func in replacement_map.items():
                setattr(mod, op_name, func)
        # Forget the originals so that a later `replace()` captures them
        # afresh.
        self.overrides.clear()

        torch_geometric.experimental.set_experimental_mode(
            False, 'disable_dynamic_shapes')


@call_once
def registerOptionalOverrides():
    """Registers the substitutions for `torch_cluster` if it is installed.

    `torch_cluster` is an optional dependency: when it cannot be found,
    nothing is registered.
    """
    # `import importlib` alone does not guarantee that the `util` submodule
    # is bound, so import the helper explicitly.
    # pylint: disable=import-outside-toplevel
    from importlib.util import find_spec

    torch_cluster_spec = find_spec("torch_cluster")
    if torch_cluster_spec is None or torch_cluster_spec.loader is None:
        return

    # `import_module` returns the instance from `sys.modules` when the
    # package is already imported, unlike the deprecated
    # `loader.load_module()`.
    torch_cluster = importlib.import_module("torch_cluster")
    torch_cluster_overrides = \
        _TorchGeometricOpsSubstitutionManager.subsitutions.setdefault(
            torch_cluster, {})
    torch_cluster_overrides["knn"] = ops.knn
    torch_cluster_overrides["knn_graph"] = ops.knn_graph
    torch_cluster_overrides["radius"] = ops.radius
    torch_cluster_overrides["radius_graph"] = ops.radius_graph
    torch_cluster_overrides["nearest"] = poptorch.nearest


registerOptionalOverrides()


================================================
FILE: poptorch_geometric/python/py.typed
================================================
# Marker file for PEP 561.


================================================
FILE: poptorch_geometric/python/pyg_cluster_loader.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from typing import Optional

import torch
from torch_geometric.loader import ClusterData, ClusterLoader

from poptorch_geometric.collate import FixedSizeCollater
from poptorch_geometric.fixed_size_options import FixedSizeOptions
from poptorch_geometric.pyg_dataloader import OverSizeStrategy


class FixedSizeClusterLoader(torch.utils.data.DataLoader):
    r"""Data loader producing fixed-size mini-batches of clusters.

    The entries of a :class:`torch_geometric.loader.ClusterData` selected for
    a mini-batch are first merged by the collate function of a
    :class:`torch_geometric.loader.ClusterLoader`; the merged result is then
    handed to a :class:`poptorch_geometric.collate.FixedSizeCollater` so that
    tensors across all batches have constant shapes.

    Args:
        cluster_data (ClusterData): The cluster from which to load the data.
        fixed_size_options (FixedSizeOptions): A
            :py:class:`poptorch_geometric.fixed_size_options.FixedSizeOptions`
            object which holds the maximum number of nodes, edges and other
            options required to pad the mini-batches, produced by the data
            loader, to a fixed size. Its :obj:`num_graphs` must be :obj:`2`.
        batch_size (int, optional): Number of entries of
            :obj:`cluster_data` merged into each mini-batch.
            (default: :obj:`1`)
        over_size_strategy (OverSizeStrategy, optional): The
            behaviour if a sample cannot fit in the fixed-size mini-batch.
            Only the trimming strategies have an effect here: they select
            whether nodes, edges or both are removed from the mini-batch to
            achieve the specified fixed size.
            (default: `poptorch_geometric.OverSizeStrategy.TrimNodesAndEdges`)
        add_pad_masks  (bool, optional): If :obj:`True`, mask objects
            are attached to mini-batch result. They represent three levels of
            padding:

            - :obj:`graphs_mask` - graph level mask
            - :obj:`nodes_mask`  - node level mask
            - :obj:`edges_mask`  - edge level mask

            Mask objects indicate which elements in the mini-batch are real
            (represented by :obj:`True`) and which were added as
            padding (represented by :obj:`False`).
            (default: :obj:`True`)
        **kwargs (optional): The additional arguments of
            :class:`torch.utils.data.DataLoader`. :obj:`collate_fn`,
            :obj:`batch_sampler`, :obj:`shuffle` and :obj:`exclude_keys` are
            rejected.
    """

    def __init__(
            self,
            cluster_data: ClusterData,
            fixed_size_options: FixedSizeOptions,
            batch_size: int = 1,
            over_size_strategy: OverSizeStrategy = OverSizeStrategy.
            TrimNodesAndEdges,
            add_pad_masks: Optional[bool] = True,
            **kwargs,
    ):
        num_graphs = fixed_size_options.num_graphs
        assert num_graphs == 2, (
            "The number of graphs in a batch specified by the fixed sized"
            f" options must be 2 when using the {self.__class__.__name__},"
            " currently it is set to"
            f" {num_graphs}")

        unsupported = set(kwargs) & {
            'collate_fn', 'batch_sampler', 'shuffle', 'exclude_keys'
        }
        assert not unsupported, \
            '`FixedSizeClusterLoader` does not support the following ' \
            f'arguments: {unsupported}.'

        # Both attributes are read by `_create_collater` below.
        self.cluster_data = cluster_data
        self.batch_size = batch_size

        node_trimming = (OverSizeStrategy.TrimNodes,
                         OverSizeStrategy.TrimNodesAndEdges)
        edge_trimming = (OverSizeStrategy.TrimEdges,
                         OverSizeStrategy.TrimNodesAndEdges)
        collater = self._create_collater(
            fixed_size_options=fixed_size_options,
            add_masks_to_batch=add_pad_masks,
            trim_nodes=over_size_strategy in node_trimming,
            trim_edges=over_size_strategy in edge_trimming)

        # The loader iterates over indices into `cluster_data`; the collater
        # turns a list of indices into the actual mini-batch.
        super().__init__(dataset=range(len(cluster_data)),
                         batch_size=batch_size,
                         collate_fn=collater,
                         **kwargs)

    def _collate(self, batch):
        """Merges the selected clusters, then pads the result to the fixed
        size."""
        merged = self.cluster_collater(batch)
        return self.fixed_size_collater([merged])

    def _create_collater(self, **collater_args):
        """Prepares the two collate stages and returns the function chaining
        them."""
        self.fixed_size_collater = FixedSizeCollater(**collater_args)
        # A `ClusterLoader` is built only to borrow its collate function.
        helper_loader = ClusterLoader(self.cluster_data,
                                      batch_size=self.batch_size)
        self.cluster_collater = helper_loader._collate  # pylint: disable=protected-access

        return self._collate


================================================
FILE: poptorch_geometric/python/pyg_collate.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from torch_geometric.loader.dataloader import Collater as PyGCollater


# TODO: Upstream that change (default arguments) to PyG when upstreaming
# DataLoaders.
class Collater(PyGCollater):
    """PyG collater whose constructor arguments are optional."""

    def __init__(self, follow_batch=None, exclude_keys=None):
        # `None` (or any other falsy value) stands for "no keys".
        super().__init__(follow_batch if follow_batch else [],
                         exclude_keys if exclude_keys else [])


================================================
FILE: poptorch_geometric/python/pyg_dataloader.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
# Note: The content of this file is going to be upstreamed to PyG.
from enum import Enum
from typing import List, Optional, Sequence, Tuple, Union

import torch.utils.data
from torch.utils.data.sampler import RandomSampler, SequentialSampler
from torch_geometric.data import Dataset, HeteroData
from torch_geometric.data.data import BaseData

from poptorch_geometric.collate import FixedSizeCollater
from poptorch_geometric.fixed_size_options import FixedSizeOptions
from poptorch_geometric.pyg_collate import Collater
from poptorch_geometric.stream_packing_sampler import StreamPackingSampler


class FixedSizeStrategy(Enum):
    """How the data loader arrives at mini-batches of a fixed size.

    - ``PadToMax``: A fixed number of real graphs is placed in each
      mini-batch, followed by one single graph used for padding.
    - ``StreamPack``: Samples are added to the mini-batch for as long as the
      next one still fits. The number of real graphs therefore varies from
      one mini-batch to the next, but less of the mini-batch is wasted on
      padding.
    """
    # Fixed number of real graphs plus one padding graph.
    PadToMax = 0
    # Greedily pack as many samples as fit.
    StreamPack = 1


class OverSizeStrategy(Enum):
    """What to do when the required number of samples does not fit into the
    fixed-size mini-batch.

    - ``Error``: An error will be thrown.
    - ``Skip``: The samples that cannot fit will be skipped.
    - ``TrimNodes``: The samples are added regardless and nodes are then
      removed from the mini-batch until it has the fixed size. This can
      cause a loss of information in the samples of the mini-batch.
    - ``TrimEdges``: The samples are added regardless and edges are then
      removed from the mini-batch until it has the fixed size. This can
      cause a loss of information in the samples of the mini-batch.
    - ``TrimNodesAndEdges``: The samples are added regardless and both nodes
      and edges are then removed from the mini-batch until it has the fixed
      size. This can cause a loss of information in the samples of the
      mini-batch.
    """
    # Fail on an over-sized mini-batch.
    Error = 0
    # Leave out the samples that do not fit.
    Skip = 1
    # Lossy strategies: drop nodes, edges, or both.
    TrimNodes = 2
    TrimEdges = 3
    TrimNodesAndEdges = 4


# ==== Copied from PyG and changed to add a `_create_collater` method and
# to pass the arguments of `__init__` as keyword arguments.
class DataLoader(torch.utils.data.DataLoader):
    r"""Data loader that collates data objects of a
    :class:`torch_geometric.data.Dataset` into mini-batches.
    The data objects may be of type :class:`~torch_geometric.data.Data` or
    :class:`~torch_geometric.data.HeteroData`.

    Args:
        dataset (Dataset): The dataset from which to load the data.
        batch_size (int, optional): How many samples per batch to load.
            (default: :obj:`1`)
        shuffle (bool, optional): If set to :obj:`True`, the data will be
            reshuffled at every epoch. (default: :obj:`False`)
        follow_batch (List[str], optional): Creates assignment batch
            vectors for each key in the list. (default: :obj:`None`)
        exclude_keys (List[str], optional): Will exclude each key in the
            list. (default: :obj:`None`)
        **kwargs (optional): Additional arguments of
            :class:`torch.utils.data.DataLoader`. A :obj:`collate_fn` entry
            is discarded.
    """

    def __init__(
            self,
            dataset: Union[Dataset, Sequence[BaseData]],
            batch_size: int = 1,
            shuffle: bool = False,
            follow_batch: Optional[List[str]] = None,
            exclude_keys: Optional[List[str]] = None,
            **kwargs,
    ):
        # The collate function is always the one built below.
        kwargs.pop('collate_fn', None)

        # Save for PyTorch Lightning < 1.6:
        self.follow_batch = follow_batch
        self.exclude_keys = exclude_keys

        super().__init__(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=self._create_collater(follow_batch=follow_batch,
                                             exclude_keys=exclude_keys),
            **kwargs,
        )

    def _create_collater(self, **collater_args):
        """Builds the collate function; a hook for subclasses to override."""
        return Collater(**collater_args)


# ==== End of copied code


class FixedSizeDataLoader(torch.utils.data.DataLoader):
    r"""A data loader which merges data objects from
    :class:`torch_geometric.data.Dataset` to a mini-batch and pads node and
    edge features so tensors across all batches have the same shapes.

    Data objects can be either of type :py:class:`~torch_geometric.data.Data` or
    :py:class:`~torch_geometric.data.HeteroData`.

    Args:
        dataset (Dataset): The :class:`~torch_geometric.data.Dataset` instance
            from which to load the graph samples.
        batch_size (int, optional): The number of graph samples to load in each
            mini-batch. This should be at least :obj:`2` to allow for creating
            at least one padding graph. (default: :obj:`2`)
        shuffle (bool, optional): If :obj:`True`, the data will be
            reshuffled at every epoch. (default: :obj:`False`)
        fixed_size_options (FixedSizeOptions, optional): A
            :py:class:`poptorch_geometric.fixed_size_options.FixedSizeOptions`
            object which holds the maximum number of nodes, edges and other
            options required to pad the mini-batches, produced by the data
            loader, to a fixed size. Its :obj:`num_graphs` must be equal to
            :obj:`batch_size`. If not specified, this will be determined
            from the provided dataset. (default: :obj:`None`)
        fixed_size_strategy (FixedSizeStrategy, optional): The
            strategy to use to achieve fixed-size mini-batches. By default,
            each mini-batch will contain a fixed number of real graphs
            (`batch_size` - 1) plus one single graph for padding.
            (default: `poptorch_geometric.FixedSizeStrategy.PadToMax`)
        over_size_strategy (OverSizeStrategy, optional): The
            behaviour if a sample cannot fit in the fixed-size mini-batch.
            By default, if the required number of samples cannot fit into the
            fixed-sized batch an error will be raised.
            (default: `poptorch_geometric.OverSizeStrategy.Error`)
        add_pad_masks  (bool, optional): If :obj:`True`, mask objects
            are attached to mini-batch result. They represent three levels of
            padding:

            - :obj:`graphs_mask` - graph level mask
            - :obj:`nodes_mask`  - node level mask
            - :obj:`edges_mask`  - edge level mask

            Mask objects indicate which elements in the mini-batch are real
            (represented by :obj:`True`) and which were added as
            padding (represented by :obj:`False`).
            (default: :obj:`True`)
        follow_batch (list or tuple, optional): Creates assignment batch
            vectors for each key in the list. (default: :obj:`None`)
        exclude_keys (list or tuple, optional): Keys to exclude from the
            batch. (default: :obj:`None`)
        **kwargs (optional): Additional arguments of
            :class:`torch.utils.data.DataLoader`. :obj:`collate_fn` is not
            accepted.

    Raises:
        ValueError: If a :obj:`batch_sampler` is given together with the
            ``StreamPack`` strategy.
        NotImplementedError: If :obj:`fixed_size_strategy` is not one of the
            supported strategies.
    """

    def __init__(
            self,
            dataset: Dataset,
            batch_size: int = 2,
            shuffle: bool = False,
            fixed_size_options: Optional[FixedSizeOptions] = None,
            fixed_size_strategy: FixedSizeStrategy = FixedSizeStrategy.
            PadToMax,
            over_size_strategy: OverSizeStrategy = OverSizeStrategy.Error,
            add_pad_masks: Optional[bool] = True,
            follow_batch: Optional[Union[List[str], Tuple[str, ...]]] = None,
            exclude_keys: Optional[Union[List[str], Tuple[str, ...]]] = None,
            **kwargs,
    ) -> None:

        if fixed_size_options is None:
            self.fixed_size_options = FixedSizeOptions.from_dataset(
                dataset, batch_size)
        else:
            self.fixed_size_options = fixed_size_options

        # Fetch the first sample only once: indexing a dataset may be costly.
        first_sample = dataset[0]
        if (isinstance(first_sample, HeteroData)
                and not self.fixed_size_options.is_hetero()):
            self.fixed_size_options.to_hetero(first_sample.node_types,
                                              first_sample.edge_types)

        assert batch_size == self.fixed_size_options.num_graphs, (
            "`num_graphs` in fixed size options must match"
            " provided batch size in dataloader. `num_graphs`"
            f" is {self.fixed_size_options.num_graphs} but batch"
            f" size is {batch_size}.")
        self.padded_batch_size = batch_size

        batch_sampler = kwargs.pop("batch_sampler", None)

        if fixed_size_strategy == FixedSizeStrategy.StreamPack:
            if batch_sampler is not None:
                raise ValueError(
                    f"Fixed size strategy {fixed_size_strategy} is"
                    " incompatible with the provided batch_sampler"
                    f" {batch_sampler}. Either use a different strategy"
                    " or set `batch_sampler` to `None`.")
            base_sampler = RandomSampler(
                dataset) if shuffle else SequentialSampler(dataset)

            # Leave space for padding.
            # The limits are read from `self.fixed_size_options`: the
            # `fixed_size_options` argument is `None` when the options are
            # derived from the dataset.
            sampler_graphs = batch_size - 1
            sampler_nodes = self.fixed_size_options.total_num_nodes - 1
            sampler_edges = self.fixed_size_options.total_num_edges - 1
            batch_sampler = StreamPackingSampler(
                dataset,
                sampler_graphs,
                max_num_nodes=sampler_nodes,
                max_num_edges=sampler_edges,
                base_sampler=base_sampler,
                allow_skip_data=(over_size_strategy == OverSizeStrategy.Skip))
        elif fixed_size_strategy != FixedSizeStrategy.PadToMax:
            raise NotImplementedError(
                f"Fixed size strategy {fixed_size_strategy} is not a supported"
                f" strategy for {self.__class__.__name__}")

        if batch_sampler is not None:
            # The `torch.DataLoader` class expects batch size to be `1`
            # and shuffle to be `None` when `batch_sampler` is provided.
            torch_dataloader_batch_size = 1
            shuffle = None
        else:
            # One slot of the padded mini-batch is reserved for the padding
            # graph.
            torch_dataloader_batch_size = batch_size - 1

        self.batch_sampler = batch_sampler

        assert 'collate_fn' not in kwargs, \
            f'Cannot set `collate_fn` with `{self.__class__.__name__}`. ' \
            'Consider attaching a torch_geometric.transform.Pad transform' \
            ' after  your collate_fn and use with' \
            ' `torch.utils.dataloader.DataLoader`  to achieve fixed sized' \
            ' batches.'

        collater = self._create_collater(
            fixed_size_options=self.fixed_size_options,
            add_masks_to_batch=add_pad_masks,
            trim_nodes=(
                over_size_strategy in (OverSizeStrategy.TrimNodes,
                                       OverSizeStrategy.TrimNodesAndEdges)),
            trim_edges=(
                over_size_strategy in (OverSizeStrategy.TrimEdges,
                                       OverSizeStrategy.TrimNodesAndEdges)),
            follow_batch=follow_batch,
            exclude_keys=exclude_keys)

        super().__init__(dataset=dataset,
                         batch_size=torch_dataloader_batch_size,
                         shuffle=shuffle,
                         batch_sampler=batch_sampler,
                         collate_fn=collater,
                         **kwargs)

    def _create_collater(self, **collater_args):
        """Builds the collate function; a hook for subclasses to override."""
        return FixedSizeCollater(**collater_args)


================================================
FILE: poptorch_geometric/python/stream_packing_sampler.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.

from functools import lru_cache
from typing import Iterable, Iterator, List, Optional, Union

from torch.utils.data.sampler import RandomSampler, Sampler, SequentialSampler
from torch_geometric.data import Dataset
from torch_geometric.data.data import BaseData

__all__ = ['StreamPackingSampler']


class StreamPackingSampler(Sampler[List[int]]):
    r"""Wraps a sampler to generate mini-batches of graphs with potentially
    varying batch sizes.
    :py:class:`StreamPackingSampler` creates batches by adding one graph at
    a time to the current batch without exceeding the maximum number
    of nodes, edges, or graphs. This gives similar results to packing without
    requiring the dataset to be preprocessed.

    Args:
        data_source (torch_geometric.data.Dataset): The data source to
            process.
        max_num_graphs (int): The maximum number of graphs to include in a
            batch.
        max_num_nodes (int, optional): The maximum number of nodes allowed in a
            batch. If :obj:`None`, it is set to the largest ``num_nodes``
            found in :obj:`data_source` multiplied by :obj:`max_num_graphs`.
            (default: :obj:`None`)
        max_num_edges (int, optional): The maximum number of edges allowed in a
            batch. If :obj:`None`, it is set to the largest ``num_edges``
            found in :obj:`data_source` multiplied by :obj:`max_num_graphs`.
            (default: :obj:`None`)
        base_sampler (Sampler or Iterable, optional): The base sampler used to
            sample the graphs before packing them into a batch. This can be any
            iterable object. (default: SequentialSampler(data_source))
        allow_skip_data (bool, optional): If true, a :obj:`data_source` item
            that does not fit into an empty batch is skipped. Otherwise, a
            :py:exc:`RuntimeError` will be thrown when the sampler is not able
            to form a single item batch from :obj:`data_source`, because
            the iterated data exceeds the maximum batch requirements.
            (default :obj:`False`)
    """

    def __init__(
            self,
            data_source: Dataset,
            max_num_graphs: int,
            max_num_nodes: Optional[int] = None,
            max_num_edges: Optional[int] = None,
            base_sampler: Optional[Union[Sampler[int], Iterable[int]]] = None,
            allow_skip_data: Optional[bool] = False) -> None:
        super().__init__(data_source)
        self._validate(base_sampler, max_num_nodes, max_num_edges,
                       max_num_graphs)

        self.data_source = data_source
        self.max_num_graphs = max_num_graphs

        # When a limit is not given, derive it from the largest sample so
        # that `max_num_graphs` samples of that size would fit.
        self.max_num_nodes = max_num_nodes
        if max_num_nodes is None:
            self.max_num_nodes = max(data.num_nodes
                                     for data in data_source) * max_num_graphs

        self.max_num_edges = max_num_edges
        if max_num_edges is None:
            self.max_num_edges = max(data.num_edges
                                     for data in data_source) * max_num_graphs

        self.base_sampler = base_sampler if base_sampler is not None else \
            SequentialSampler(data_source)
        self.allow_skip_data = allow_skip_data

        # Number of batches, computed lazily by `__len__` and stored on the
        # instance so that no class-level cache keeps the sampler alive.
        self._cached_len: Optional[int] = None

    def _validate(self, sampler, max_num_nodes, max_num_edges, max_num_graphs):
        """Check the constructor arguments.

        Raises:
            ValueError: if `sampler` is sized and empty, if `max_num_graphs`
                is None or below 1, or if `max_num_nodes` / `max_num_edges`
                is given and below `max_num_graphs`.
        """
        # The base sampler may be any iterable, so only check the length of
        # objects that actually define one.
        if sampler is not None and hasattr(sampler, '__len__') \
                and len(sampler) == 0:
            raise ValueError(
                f'The `sampler` {sampler} provided is invalid,'
                ' the length of the sampler must be greater than 0.')

        def validate_batch_limit(param, param_name, limit=1):
            if param is not None and param < limit:
                raise ValueError(
                    f'Invalid value for `{param_name}` parameter, '
                    f'{param_name} should be at least greater '
                    f' than {limit}.')

        if max_num_graphs is None:
            raise ValueError('Invalid value for `max_num_graphs` parameter.'
                             ' `max_num_graphs` must be an integer of at least'
                             ' 1, it is None.')

        validate_batch_limit(max_num_graphs, 'max_num_graphs', 1)
        validate_batch_limit(max_num_nodes, 'max_num_nodes', max_num_graphs)
        validate_batch_limit(max_num_edges, 'max_num_edges', max_num_graphs)

    class _Batch:
        """Running totals for the batch currently being assembled."""

        def __init__(self) -> None:
            self.indices: List[int] = []
            self.num_nodes = 0
            self.num_edges = 0
            self.num_graphs = 0

        def append(self, idx: int, data: BaseData) -> None:
            """Add sample `idx` and account for its nodes and edges."""
            self.indices.append(idx)
            self.num_nodes += data.num_nodes
            self.num_edges += data.num_edges
            self.num_graphs += 1

        def empty(self) -> bool:
            """Return True if no sample has been added yet."""
            return len(self.indices) == 0

        def __repr__(self) -> str:
            return f'Batch{{ indices: {self.indices}, ' \
                   f'num_nodes: {self.num_nodes}, ' \
                   f'num_edges: {self.num_edges}, ' \
                   f'num_graphs: {self.num_graphs} }}'

    def __iter__(self) -> Iterator[List[int]]:
        """Yield lists of dataset indices, one list per batch.

        Raises:
            RuntimeError: if a sample does not fit into an empty batch and
                `allow_skip_data` is false.
        """
        batch = self._Batch()
        for idx in self.base_sampler:
            data = self.data_source[idx]

            fits = self._has_space(batch, data)
            if not fits and not batch.empty():
                # The current batch is full: emit it and retry the sample
                # against a fresh, empty batch.
                yield batch.indices
                batch = self._Batch()
                fits = self._has_space(batch, data)

            if fits:
                batch.append(idx, data)
            elif not self.allow_skip_data:
                # The sample does not even fit into an empty batch.
                raise RuntimeError(
                    'The maximum number of graphs, nodes or edges'
                    ' specified is too small to fit in the single sample'
                    f' {idx} with {data.num_nodes} nodes and'
                    f' {data.num_edges} edges. The maximum number of graphs'
                    f' specified is {self.max_num_graphs}, the maximum'
                    f' number of nodes is {self.max_num_nodes} and the'
                    f' maximum number of edges is {self.max_num_edges}.'
                    ' If this is intended, use `allow_skip_data` to'
                    ' enable this sample to be completely skipped'
                    f' from batching. The sample is {data}.')

        if not batch.empty():
            yield batch.indices

    def _has_space(self, batch: _Batch, data: BaseData) -> bool:
        """Return True if `data` can be added to `batch` within the limits."""
        next_nodes = data.num_nodes
        next_edges = data.num_edges

        nodes_left = self.max_num_nodes - (batch.num_nodes + next_nodes)
        edges_left = self.max_num_edges - (batch.num_edges + next_edges)
        graphs_left = self.max_num_graphs - (batch.num_graphs + 1)

        graph_fits = nodes_left >= 0 and edges_left >= 0 and \
            graphs_left >= 0
        # Require at least one spare node and one spare edge for every graph
        # slot that would remain unused in the batch.
        has_space_for_padding = nodes_left >= graphs_left and \
            edges_left >= graphs_left

        has_space = graph_fits and has_space_for_padding
        return has_space

    def __len__(self) -> int:
        """Return the number of batches produced by one full iteration.

        The value is computed by iterating once and is then remembered on
        the instance.

        Raises:
            NotImplementedError: if the base sampler is a `RandomSampler`.
        """
        if isinstance(self.base_sampler, RandomSampler):
            raise NotImplementedError(
                f'{self.__class__.__name__} length (`__len__`) cannot'
                ' be determined. The base sampler used is an instance of'
                '`RandomSampler`, which will result in'
                f' {self.__class__.__name__} producing a nondeterministic'
                ' number of batches. When using this sampler with stream'
                ' packing avoid requiring the length.')

        # `getattr` keeps this working for instances that were created
        # without running `__init__` (for example by unpickling old state).
        cached_len = getattr(self, '_cached_len', None)
        if cached_len is None:
            cached_len = sum(1 for _ in self)
            self._cached_len = cached_len
        return cached_len


================================================
FILE: poptorch_geometric/python/types.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from itertools import chain
try:
    from functools import singledispatchmethod
except ImportError:
    from singledispatchmethod import singledispatchmethod
from typing import Any, Generator, Union, Iterable, List

import torch
from torch_geometric.data import Batch, Data, HeteroData
from torch_geometric.data.storage import BaseStorage
from torch_geometric.data.data import BaseData

from poptorch import ICustomArgParser, registerCustomArgParser

from poptorch_geometric.common import DataBatch, HeteroDataBatch, call_once


class PyGArgsParser(ICustomArgParser):
    """Argument parser that flattens PyG data objects into tensors and back.

    ``yieldTensors`` walks a data object and yields its tensors;
    ``reconstruct`` builds a new object from a stream of tensors. Both must
    visit the tensors in the same order, so any change to one traversal has
    to be mirrored in the other.
    """

    @staticmethod
    def _sortedTensorKeys(struct: Union[Data, DataBatch]) -> Iterable[str]:
        """Return the keys of `struct` holding tensors, in sorted order."""
        return (key for key in sorted(struct.keys)
                if isinstance(struct[key], torch.Tensor))

    @singledispatchmethod
    def yieldTensors(self, struct) -> Generator[torch.Tensor, None, None]:
        """Yield the tensors stored in `struct` (dispatches on its type).

        Raises:
            ValueError: if no implementation is registered for the type.
        """
        raise ValueError(f'Unsupported data type: {type(struct)}')

    # The dispatch type is given explicitly: an annotation such as
    # `Data or DataBatch` evaluates to `Data` only, and `Union` annotations
    # are not accepted for registration on older Python versions.
    # NOTE(review): `DataBatch` instances reach this overload only if
    # `DataBatch` is a subclass of `Data` - confirm against its definition.
    @yieldTensors.register(Data)
    def _(self, struct: Union[Data, DataBatch]
          ) -> Generator[torch.Tensor, None, None]:
        for k in self._sortedTensorKeys(struct):
            yield struct[k]

    # NOTE(review): as above, `HeteroDataBatch` is assumed to subclass
    # `HeteroData` - confirm.
    @yieldTensors.register(HeteroData)
    def _(self, struct: Union[HeteroData, HeteroDataBatch]
          ) -> Generator[torch.Tensor, None, None]:
        def isTensor(val):
            return isinstance(val, torch.Tensor)

        # Global attributes first, then every node store, then every edge
        # store. `reconstruct` consumes tensors in this exact order.
        yield from filter(isTensor, struct._global_store.values())  # pylint: disable=protected-access
        for attr in chain(struct.node_stores, struct.edge_stores):
            if isinstance(attr, BaseStorage):
                yield from filter(isTensor, attr.values())

    @staticmethod
    def _setup_num_fields(
            batch: Union[DataBatch, HeteroDataBatch],
            original_structure: Union[DataBatch, HeteroDataBatch]):
        """Copy the graph/node/edge counts of `original_structure` to `batch`.
        """
        if hasattr(original_structure, '_num_graphs'):
            batch._num_graphs = original_structure._num_graphs  # pylint: disable=protected-access

        num_nodes = original_structure.num_nodes
        num_edges = original_structure.num_edges
        batch['num_nodes'] = num_nodes
        batch['num_edges'] = num_edges
        if isinstance(batch, HeteroDataBatch):
            # We need to override properties getters, to make them return the
            # proper (device iterations independent) `num_nodes` and `num_edges`
            # The general idea is to return values from `num_nodes` or
            # `num_edges` fields (if defined) in the first place.
            # Note that the properties are installed on the class itself, so
            # they affect every `HeteroDataBatch` instance.
            def nodes_fget(sub_self):
                if 'num_nodes' in sub_self._global_store:  # pylint: disable=protected-access
                    return sub_self['num_nodes']
                return super(type(sub_self), sub_self).num_nodes

            HeteroDataBatch.num_nodes = property(fget=nodes_fget)

            def edges_fget(sub_self):
                if 'num_edges' in sub_self._global_store:  # pylint: disable=protected-access
                    return sub_self['num_edges']
                return super(type(sub_self), sub_self).num_edges

            HeteroDataBatch.num_edges = property(fget=edges_fget)

    @staticmethod
    def _add_next(tensor_iterator: Iterable[List[Any]],
                  original_struct_val: Any) -> Any:
        """Return the next tensor if the original value was a tensor.

        Non-tensor values are returned unchanged and nothing is consumed
        from `tensor_iterator`.
        """
        if isinstance(original_struct_val, torch.Tensor):
            return next(tensor_iterator)
        return original_struct_val

    @singledispatchmethod
    def reconstruct(self, original_structure,
                    tensor_iterator: Iterable[torch.Tensor]) -> Any:  # pylint: disable=unused-argument
        """Rebuild an object like `original_structure` from new tensors.

        Raises:
            ValueError: if no implementation is registered for the type.
        """
        raise ValueError(f'Unsupported data type: {type(original_structure)}')

    @reconstruct.register(Data)
    def _(self, original_structure: Union[Data, DataBatch],
          tensor_iterator: Iterable[torch.Tensor]) -> Union[Data, DataBatch]:
        """
        Create a new instance with the same class type as the
        original_structure. This new instance will be initialized with tensors
        from the provided iterator and uses the same sorted keys from the
        yieldTensors() implementation.
        """
        kwargs = {
            key: self._add_next(tensor_iterator, original_structure[key])
            for key in self._sortedTensorKeys(original_structure)
        }

        cls = original_structure.__class__
        if cls is DataBatch:
            batch = Batch(**kwargs, _base_cls=Data)
            self._setup_num_fields(batch, original_structure)
            return batch

        return Data(**kwargs)

    @reconstruct.register(HeteroData)
    def _(self, original_structure: Union[HeteroData, HeteroDataBatch],
          tensor_iterator: Iterable[torch.Tensor]
          ) -> Union[HeteroData, HeteroDataBatch]:
        """
        Create a new instance with the same class type as the
        original_structure. This new instance will be initialized with tensors
        from the provided iterator, consumed in the same order in which the
        yieldTensors() implementation produces them.
        """
        kwargs = {}

        for key, attr in original_structure._global_store.items():  # pylint: disable=protected-access
            kwargs[key] = self._add_next(tensor_iterator, attr)

        for key, attr in chain(original_structure.node_items(),
                               original_structure.edge_items()):
            if isinstance(attr, BaseStorage):
                kwargs[key] = {
                    k: self._add_next(tensor_iterator, v)
                    for k, v in attr.items()
                }
            else:
                # yieldTensors() emits nothing for entries that are not
                # storages, so no tensor may be consumed here: keep the
                # original value untouched.
                kwargs[key] = attr

        cls = original_structure.__class__
        if cls is HeteroDataBatch:
            batch = Batch(kwargs, _base_cls=HeteroData)
            self._setup_num_fields(batch, original_structure)
            return batch

        return HeteroData(kwargs)


# PyG uses the BaseData object as the root for data and batch objects.
@call_once
def registerCustomArgParsers():
    """Register a separate ``PyGArgsParser`` for each PyG container type."""
    for data_type in (BaseData, DataBatch, HeteroDataBatch):
        registerCustomArgParser(data_type, PyGArgsParser())


# Perform the registration as a side effect of importing this module.
registerCustomArgParsers()


================================================
FILE: poptorch_geometric/python/utils.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import time
from copy import deepcopy

import torch
from torch.testing import assert_close
from torch_geometric.nn import MessagePassing

import poptorch


def set_aggregation_dim_size(model: torch.nn.Module, dim_size: int):
    """Force a fixed ``dim_size`` in the aggregate step of message passing.

    A pre-hook is registered on the aggregation of every ``MessagePassing``
    sub-module of ``model``; the hook overwrites the ``dim_size`` entry of
    the aggregation keyword arguments.

    The dim_size will need to be at least as large as the total number of
    nodes in the batch.
    """

    def set_dim_size_hook(module, inputs):  # pylint: disable=unused-argument
        # The keyword arguments of the aggregation are the last input.
        kwargs = inputs[-1]
        kwargs['dim_size'] = dim_size
        return kwargs

    message_passing_layers = (layer for layer in model.modules()
                              if isinstance(layer, MessagePassing))
    for layer in message_passing_layers:
        layer.register_aggregate_forward_pre_hook(set_dim_size_hook)


class TrainingStepper:
    """
    Test utility for comparing training runs between IPU and CPU.

    The model is deep-copied twice: one copy is trained on the CPU, the
    other is wrapped with PopTorch. After every step the outputs, the losses
    and the parameters of both copies are compared with
    ``torch.testing.assert_close``. The model's forward is expected to
    return an ``(output, loss)`` pair when training.

    Usage:

        model = ...
        batch = ...
        model.train()
        stepper = TrainingStepper(model)
        stepper.run(10, batch)
    """

    def __init__(self,
                 model,
                 lr=0.001,
                 optimizer=poptorch.optim.Adam,
                 options=None,
                 rtol=None,
                 atol=None,
                 enable_fp_exception=True,
                 equal_nan=False):
        """
        Args:
            model: The model to train; it is switched to training mode.
            lr: Learning rate passed to both optimizers.
            optimizer: Optimizer class, called as ``optimizer(params, lr=lr)``.
            options: ``poptorch.Options`` to use (a default instance is
                created when None).
            rtol, atol, equal_nan: Forwarded to
                ``torch.testing.assert_close``.
            enable_fp_exception: Enable floating point exceptions in the
                PopTorch options.
        """
        super().__init__()
        model.train()
        self.lr = lr
        self.rtol = rtol
        self.atol = atol
        self.equal_nan = equal_nan
        self.enable_fp_exception = enable_fp_exception
        self.options = poptorch.Options() if options is None else options
        self.training_model = None
        self.inference_model = None
        self.setup_cpu(model, optimizer)
        self.setup_ipu(model, optimizer)
        self.check_parameters()

    def setup_cpu(self, model, optimizer):
        """Create the CPU copy of the model and, if it has parameters, its
        optimizer."""
        self.cpu_model = deepcopy(model)
        parameters = list(self.cpu_model.parameters())
        if parameters:
            self.optimizer = optimizer(parameters, lr=self.lr)

    def setup_ipu(self, model, optimizer):
        """Create the IPU copy of the model and wrap it with PopTorch.

        The training wrapper is only created when the model has parameters;
        the inference wrapper is always created.
        """
        self.ipu_model = deepcopy(model)
        options = self.options
        if self.enable_fp_exception:
            options.Precision.enableFloatingPointExceptions(True)

        parameters = list(self.ipu_model.parameters())
        if parameters:
            ipu_optimizer = optimizer(parameters, lr=self.lr)
            self.training_model = poptorch.trainingModel(
                self.ipu_model, optimizer=ipu_optimizer, options=options)

        self.inference_model = poptorch.inferenceModel(self.ipu_model,
                                                       options=options)

    def check_parameters(self):
        """Assert that the CPU and IPU parameters are close, pair by pair."""
        for cpu, ipu in zip(self.cpu_model.named_parameters(),
                            self.ipu_model.named_parameters()):
            name, cpu = cpu
            ipu = ipu[1]
            self.assert_close(actual=ipu, expected=cpu, id=name)

    def cpu_step(self, batch):
        """Run one optimisation step on the CPU; return ``(out, loss)``."""
        self.optimizer.zero_grad()
        out, loss = self.cpu_model(*batch)
        loss.backward()
        self.optimizer.step()
        return out, loss

    def ipu_step(self, batch, copy_weights=True):
        """Run one training step on the IPU; return ``(out, loss)``.

        When `copy_weights` is true the weights are copied back to the host
        afterwards.
        """
        out, loss = self.training_model(*batch)
        if copy_weights:
            self.training_model.copyWeightsToHost()
        return out, loss

    def run(self, *args):
        """Train on CPU and IPU and compare the results after every step.

        Call as ``run(num_steps, batch)`` to feed the same batch to both
        devices, or as ``run(num_steps, cpu_batch, ipu_batch)`` to feed a
        different batch to each.

        Raises:
            TypeError: if called with any other number of arguments.
        """
        assert self.training_model, 'Training model was not created.'
        if len(args) not in (2, 3):
            raise TypeError(f"Wrong number of args ({len(args)}!)")

        self.cpu_model.train()
        if len(args) == 2:
            self._run_common_input(*args)
        else:
            self._run_separate_inputs(*args)

    def run_inference(self, batch):
        """Run the model in eval mode on CPU and IPU and compare outputs."""
        self.cpu_model.eval()
        with torch.no_grad():
            cpu_out = self.cpu_model(*batch)

        # NOTE(review): the IPU result is unpacked as a pair while the CPU
        # result is compared as returned - confirm what the model returns in
        # eval mode.
        ipu_out, _ = self.inference_model(*batch)
        self.assert_close(actual=ipu_out, expected=cpu_out, id="inference")

    def _run_common_input(self, num_steps, batch):
        """Step both devices `num_steps` times with the same batch."""
        cpu_loss = torch.empty(num_steps)
        ipu_loss = torch.empty(num_steps)

        for i in range(num_steps):
            cpu_out, cpu_loss[i] = self.cpu_step(batch)
            ipu_out, ipu_loss[i] = self.ipu_step(batch)
            self.assert_close(actual=ipu_out, expected=cpu_out, id="Output")
            self.check_parameters()

        self.assert_close(actual=ipu_loss, expected=cpu_loss, id="loss")

    def _run_separate_inputs(self, num_steps, cpu_batch, ipu_batch):
        """Step both devices `num_steps` times with device-specific batches.

        Only the leading rows common to both outputs are compared.
        """
        cpu_loss = torch.empty(num_steps)
        ipu_loss = torch.empty(num_steps)

        for i in range(num_steps):
            cpu_out, cpu_loss[i] = self.cpu_step(cpu_batch)
            ipu_out, ipu_loss[i] = self.ipu_step(ipu_batch)
            min_shape = min(cpu_out.shape[0], ipu_out.shape[0])
            self.assert_close(actual=ipu_out[:min_shape],
                              expected=cpu_out[:min_shape],
                              id="Output")
            self.check_parameters()
        self.assert_close(actual=ipu_loss, expected=cpu_loss, id="loss")

    def assert_close(self, actual, expected, id):
        """Compare with ``torch.testing.assert_close`` using this stepper's
        tolerances; `id` is included in the failure message."""

        def msg_fn(msg):
            return f"{id} was not equal:\n\n{msg}\n"

        assert_close(actual=actual,
                     expected=expected,
                     msg=msg_fn,
                     rtol=self.rtol,
                     atol=self.atol,
                     equal_nan=self.equal_nan)

    def benchmark(self, num_steps, batch, devices=('ipu', )):
        """Time `num_steps` training steps on the requested devices.

        One untimed warm-up step is executed per device first.

        Args:
            num_steps: Number of timed steps.
            batch: Inputs passed to the model.
            devices: Collection of device names; 'ipu' and 'cpu' are
                supported.

        Returns:
            Dict with 'ipu_time' and / or 'cpu_time', in seconds.

        Raises:
            NotImplementedError: if 'gpu' is requested.
        """
        # Reject unsupported devices before spending time on the others.
        if 'gpu' in devices:
            raise NotImplementedError('GPU benchmarking currently unsupported')

        results = {}
        if 'ipu' in devices:
            _, _ = self.ipu_step(batch, copy_weights=False)
            t_start = time.perf_counter()
            for _ in range(num_steps):
                _, _ = self.ipu_step(batch, copy_weights=False)
            t_end = time.perf_counter()
            results['ipu_time'] = t_end - t_start
        if 'cpu' in devices:
            _, _ = self.cpu_step(batch)
            t_start_cpu = time.perf_counter()
            for _ in range(num_steps):
                _, _ = self.cpu_step(batch)
            t_end_cpu = time.perf_counter()
            results['cpu_time'] = t_end_cpu - t_start_cpu
        return results


================================================
FILE: poptorch_geometric/requirements.txt
================================================
# Install pre-built wheels for PyTorch Geometric that are compatible with
# poptorch which is currently pinned to torch 2.0.1
--find-links https://data.pyg.org/whl/torch-2.0.1+cpu.html

pyg-nightly==2.4.0.dev20230613

torch-scatter==2.1.1+pt20cpu
torch-sparse==0.6.17+pt20cpu
torch-cluster==1.6.1+pt20cpu
torch-spline-conv==1.2.2+pt20cpu

pytest-benchmark==4.0.0
pytest-cov==4.0.0
nbconvert==7.2.9
nbformat==5.7.3
pandas==2.0.1

singledispatchmethod==1.0; python_version < '3.8'


================================================
FILE: poptorch_geometric/setup.cfg
================================================
[metadata]
license_files =
   License.txt
   poptorch_geometric_third_party_licenses.txt


================================================
FILE: poptorch_geometric/setup.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import sys

from setuptools import setup, find_packages

# NOTE(review): the `@NAME@` strings in this file look like placeholders
# that are substituted before this script is run - confirm against the
# build system.
REQUIRES = [
    '@PYG_DEPENDENCY@',
    '@POPTORCH_DEPENDENCY@',
    '@TORCH_SCATTER_DEPENDENCY@',
    '@TORCH_SPARSE_DEPENDENCY@',
]

# "major.minor" version of the interpreter running this script.
python_version = f'{sys.version_info.major}.{sys.version_info.minor}'

# Extra dependency that is only added when building with Python 3.7.
if python_version == '3.7':
    REQUIRES.append('singledispatchmethod==1.0')

VERSION = '@VERSION@'

LONG_DESCRIPTION = (
    'PopTorch Geometric is a set of extensions for PyTorch Geometric, enabling '
    'GNN models to be trained, evaluated and used on the Graphcore IPU.')

# The same text is used for both the short and the long description, and the
# package is restricted to the Python minor version used to run this script.
setup(name='poptorch_geometric',
      version=VERSION,
      description=LONG_DESCRIPTION,
      long_description=LONG_DESCRIPTION,
      long_description_content_type='text/markdown',
      license='MIT License',
      license_files=('License.txt',
                     'poptorch_geometric_third_party_licenses.txt'),
      author='Graphcore Ltd.',
      author_email='contact@graphcore.ai',
      url='http://graphcore.ai',
      classifiers=[
          'Development Status :: 3 - Alpha',
          'Intended Audience :: Developers',
          'Intended Audience :: Science/Research',
          'Topic :: Scientific/Engineering',
          'Topic :: Scientific/Engineering :: Artificial Intelligence',
          'License :: OSI Approved :: MIT License',
          'Programming Language :: Python :: 3',
      ],
      platforms='@PLATFORM@',
      install_requires=REQUIRES,
      python_requires=f'=={python_version}.*',
      packages=find_packages())


================================================
FILE: poptorch_logging/CMakeLists.txt
================================================
# Build script for the `poptorch_logging` static library.
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(poptorch_logging)

set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Exactly version 1.8.0 of spdlog is required.
find_package(spdlog 1.8.0 EXACT REQUIRED)

# Packages provided by Poplar
find_package(libpvti REQUIRED)
find_package(gccs REQUIRED)

add_library(poptorch_logging STATIC
  "source/Error.cpp"
  "source/Logging.cpp"
  "source/Tracepoint.cpp")

# Every file under include/ whose name matches *.hpp* is treated as a public
# header of the target.
file(GLOB_RECURSE poptorch_logging_public_headers "${CMAKE_CURRENT_SOURCE_DIR}/include/*.hpp*")

set_target_properties(poptorch_logging PROPERTIES
  CXX_STANDARD 14
  PUBLIC_HEADER "${poptorch_logging_public_headers}")

# Different include paths are exported for the build tree and for an
# installed copy of the library.
target_include_directories(poptorch_logging SYSTEM
  PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:include>)

# Unfortunately, there seems to be an issue with using the `spdlog::*` targets
# directly with `target_link_libraries()`, which breaks dependencies of
# `poptorch_logging` adding any other include directories. Instead, we'll
# manually add spdlog's include directories and compile definitions here.
target_include_directories(poptorch_logging SYSTEM
  PUBLIC
    $<TARGET_PROPERTY:spdlog::spdlog_header_only,INTERFACE_INCLUDE_DIRECTORIES>)
target_compile_definitions(poptorch_logging
  PUBLIC
    $<TARGET_PROPERTY:spdlog::spdlog_header_only,INTERFACE_COMPILE_DEFINITIONS>)

target_link_libraries(poptorch_logging
  PRIVATE
    libpvti
    gccs_stacktrace)

# NOTE(review): the target is a STATIC library, which CMake installs as an
# ARCHIVE artifact; the LIBRARY destination below may therefore not be the
# one that is applied to it - confirm.
install(TARGETS poptorch_logging
  LIBRARY
    DESTINATION ${CMAKE_INSTALL_LIBDIR}
  PUBLIC_HEADER
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/poptorch_logging)


================================================
FILE: poptorch_logging/include/poptorch_logging/Error.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_LOGGING_ERROR_HPP
#define INCLUDE_POPTORCH_LOGGING_ERROR_HPP

#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>

namespace poptorch {
namespace logging {

namespace detail {
struct LogContextImpl;
struct ErrorImpl;
} // namespace detail

// Remove everything before the last occurrence of "/poptorch/" in a string
// For example given an absolute path like:
// /a/b/c/poptorch/d/e/f.cpp -> poptorch/d/e/f.cpp
const char *shortPoptorchFilename(const char *filename);

// Branch prediction hint: tells the compiler that `var` is expected to be 0.
#define UNLIKELY(var) __builtin_expect(var, 0)

// Evaluates `var` and discards the result.
#define UNUSED(var) (void)(var)

// Throws an InternalError carrying the current file and line. `msg` is
// streamed into a std::stringstream, so it may be a chain of `<<` operands.
#define ERROR(msg)                                                             \
  do {                                                                         \
    std::stringstream __error_msg;                                             \
    __error_msg << msg; /* NOLINT */                                           \
    throw ::poptorch::logging::InternalError(__error_msg.str().c_str(),        \
                                             __FILE__, __LINE__);              \
  } while (0)

// Calls ERROR(msg) when `condition` converts to true.
#define ERROR_ON_MSG(condition, msg)                                           \
  do {                                                                         \
    if (UNLIKELY(static_cast<bool>(condition))) {                              \
      ERROR(msg);                                                              \
    }                                                                          \
  } while (0)

// Same as ERROR_ON_MSG, using the text of the condition as the message.
#define ERROR_ON(condition) ERROR_ON_MSG(condition, #condition)

/**
 * Exception class for poptorch.
 *
 * The what() method returns both the error message and the
 * stacktrace.
 * To have the error without the stacktrace use message().
 */
// NOTE(review): uint64_t is used below, but <cstdint> is not among the
// headers included directly by this file - confirm it is reliably provided
// by one of the other includes.
class Error : public std::runtime_error {
public:
  // `s` is the error message; `file` and `line` identify where the error was
  // raised (the ERROR macro passes __FILE__ and __LINE__).
  explicit Error(const char *s, const char *file, uint64_t line);
  Error(Error &&e);
  // Accessors for the location passed to the constructor.
  const char *file() const;
  uint64_t line() const;
  // The error message without the stacktrace
  const char *message() const;
  ~Error() override;

private:
  // Opaque implementation; detail::ErrorImpl is only forward-declared in
  // this header.
  std::unique_ptr<detail::ErrorImpl> _impl;
};

/**
 * Exception class specific to internal errors.
 * This should be used like an assert: for states which the user should not
 * have been able to create.
 * It adds nothing to Error beyond inheriting its constructors.
 */
class InternalError : public Error {
public:
  using Error::Error;
};

/* Context stack used to attach extra information to exceptions when they're
 * raised. All context changes can be printed by enabling the info mode.
 */
class LogContext {
public:
  // Current context stack as a string
  static std::unique_ptr<char[]> context();
  // NOTE(review): presumably empties the whole context stack - confirm
  // against the implementation.
  static void resetContext();
  // NOTE(review): presumably pushes a context without tying it to the
  // lifetime of a LogContext object - confirm against the implementation.
  static void push(const char *);

  LogContext();
  // Push the context at the top of the context stack.
  explicit LogContext(const std::string &context)
      : LogContext(context.c_str()) {}
  explicit LogContext(const char *context);

  // Replace the top of the context stack with new_context.
  void updateContext(const std::string &new_context);

  // Pop the top of the context stack.
  void clear();
  // Implicitly pop the top of the context stack if clear() hasn't been
  // explicitly called.
  ~LogContext();

private:
  // Opaque implementation; detail::LogContextImpl is only forward-declared
  // in this header.
  std::unique_ptr<detail::LogContextImpl> _impl;
};

} // namespace logging
} // namespace poptorch

#endif // INCLUDE_POPTORCH_LOGGING_ERROR_HPP


================================================
FILE: poptorch_logging/include/poptorch_logging/Logging.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_LOGGING_H
#define INCLUDE_POPTORCH_LOGGING_H

#include <spdlog/fmt/fmt.h>
#include <spdlog/fmt/ostr.h>
#include <string>
#include <utility>

#include "poptorch_logging/LoggingLight.hpp"

/// This is a simple logging system for poptorch based on spdlog. The easiest
/// way to use it is to simply call `logging::<level>()` where <level> is one
/// of trace, debug, info, warn or err. For example:
///
///   #include "poptorch_logging/Logging.hpp"
///
///   void foo(int i) {
///     logging::info("foo({}) called", i);
///   }
///
/// logging can be configured by the methods below, or by environment
/// variables, eg
/// POPTORCH_LOG_LEVEL=ERR
/// POPTORCH_LOG_DEST=Mylog.txt
///
/// Formatting is done using the `fmt` library. It supports {}-style and %-style
/// format specification strings. See https://github.com/fmtlib/fmt for details.

namespace poptorch {
namespace logging {

// Format a message with the `fmt` C++ library and log it at level `l`.
// See https://github.com/fmtlib/fmt for the format syntax. In most cases the
// helpers generated by MAKE_LOG_TEMPLATE are more convenient, e.g.
// logging::debug("The answer is: {}", 42).
template <typename... Args>
void log(Level l, const char *s, const Args &...args) {
  // Nothing to do, and no formatting cost, when this level is disabled.
  if (!shouldLog(l)) {
    return;
  }
  const std::string formatted = fmt::format(s, args...);
  log(l, formatted.c_str());
}

// Create a bit of syntactic sugar which allows log statements
// of the form logging::debug("Msg").
//
// Two overloads are generated for each level:
//  - fnName(format, args...): logs the formatted message at that level.
//  - fnName(dedup_count, format, args...): the same, but rate limited by a
//    caller-owned counter compared against repeatLimit(). While the counter
//    is below the limit the message is logged; when it equals the limit a
//    single "...repeated messages suppressed..." line is logged instead;
//    once it is above the limit nothing is logged. The counter is
//    incremented on every call that logs something.
#define MAKE_LOG_TEMPLATE(fnName, lvl)                                         \
  template <typename... Args>                                                  \
  inline void fnName(const char *s, const Args &...args) {                     \
    log(Level::lvl, s, std::forward<const Args>(args)...);                     \
  }                                                                            \
                                                                               \
  template <typename... Args>                                                  \
  inline void fnName(std::uint64_t &dedup_count, const char *s,                \
                     const Args &...args) {                                    \
    std::uint64_t rlimit = repeatLimit();                                      \
    if (dedup_count > rlimit) {                                                \
      return;                                                                  \
    }                                                                          \
    if (dedup_count < rlimit) {                                                \
      log(Level::lvl, s, std::forward<const Args>(args)...);                   \
    } else {                                                                   \
      log(Level::lvl, "...repeated messages suppressed...");                   \
    }                                                                          \
    dedup_count++;                                                             \
  }

// One pair of helpers per log level.
MAKE_LOG_TEMPLATE(trace, Trace)
MAKE_LOG_TEMPLATE(debug, Debug)
MAKE_LOG_TEMPLATE(info, Info)
MAKE_LOG_TEMPLATE(warn, Warn)
MAKE_LOG_TEMPLATE(err, Err)

// The generator macro is not needed past this point.
#undef MAKE_LOG_TEMPLATE

// Convenience macro to create a log entry prefixed with function name e.g.:
//    void someFunc(int i) {
//      FUNC_LOGGER(info, " with i := {}", i);
//    }
// Then the log entry would be something like:
// 14:30:31.00 [I] void someFunc(int): with i := 42
// NOTE: Because of the limitations of __VA_ARGS__ this log entry must have at
// least one parameter.
// NOTE(review): the macro is #undef'd immediately after its definition, so
// as written it is not available to code that includes this header - confirm
// whether that is intentional.
#define FUNC_LOGGER(lvl, fmtStr, ...)                                          \
  logging::lvl("{}: " fmtStr, __PRETTY_FUNCTION__, __VA_ARGS__)

#undef FUNC_LOGGER

} // namespace logging
} // namespace poptorch

#endif // INCLUDE_POPTORCH_LOGGING_H


================================================
FILE: poptorch_logging/include/poptorch_logging/LoggingLight.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_LOGGING_LIGHT_H
#define INCLUDE_POPTORCH_LOGGING_LIGHT_H

#include <string>
#include <utility>

// This header is a lighter version of poptorch_logging which doesn't require
// spdlog and therefore doesn't support formatting.
//
// For the full version of the logging API use
// poptorch_logging/Logging.hpp instead.
namespace poptorch {
namespace logging {

// Log levels accepted by the logging API, listed by increasing severity, plus
// Off. The numeric values are converted with a plain cast to spdlog's level
// enum by the implementation, so they must stay in sync with spdlog.
enum class Level {
  Trace = 0,
  Debug = 1,
  Info = 2,
  Warn = 3,
  Err = 4,
  // level 5 is "critical" in spdlog, which we don't use so isn't exposed here.
  Off = 6,
};

// Set the current log level to one of the above levels. The initial
// log level is read from the POPTORCH_LOG_LEVEL environment variable
// and defaults to WARN when that variable is not set.
void setLogLevel(Level l);

// Return true if the passed log level is currently enabled.
bool shouldLog(Level l);

// Return true if the Popart IR should be dumped.
bool outputPopartIR();

// Return number of times logs should be allowed to repeat
std::uint64_t repeatLimit();

void setRepeatLimit(std::uint64_t limit);

// Flush the log. By default it is only flushed when the underlying libc
// decides to.
void flush();

// Log a message. You should probably use the MAKE_LOG_TEMPLATE macros
// instead, e.g. logging::debug("A debug message").
void log(Level l, const char *msg);

} // namespace logging
} // namespace poptorch

#endif // INCLUDE_POPTORCH_LOGGING_LIGHT_H


================================================
FILE: poptorch_logging/include/poptorch_logging/Tracepoint.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#ifndef SOURCE_INCLUDE_POPTORCH_TRACEPOINT_HPP
#define SOURCE_INCLUDE_POPTORCH_TRACEPOINT_HPP

#include <algorithm>
#include <memory>
#include <string>

namespace poptorch {
namespace logging {

namespace detail {
class TracepointImpl;
}

/** RAII class to create tracepoints
 *
 * The constructor creates the implementation object for `label` and the
 * destructor releases it, so the tracepoint covers the lifetime of the
 * Tracepoint instance.
 */
class Tracepoint {
public:
  // `label` is copied during construction; it does not need to outlive the
  // constructor call.
  explicit Tracepoint(const char *label);
  ~Tracepoint();

  // Manually begin / end a tracepoint named `label` without creating a
  // Tracepoint object. Both forward to the underlying tracing library.
  static void begin(const char *label);
  static void end(const char *label);

private:
  // Opaque implementation, defined in the source file.
  std::unique_ptr<detail::TracepointImpl> _impl;
};

// Extract the "namespace(s)::class::method" part of a __PRETTY_FUNCTION__
// string: everything between the last space that precedes the argument list
// and the opening parenthesis of that list.
//
// If the string has no '(' the text after the last space is returned; if
// there is no space before the arguments the text starts at the beginning.
inline std::string formatPrettyFunction(const char *c) {
  const std::string signature(c);

  // Where the argument list starts (or the end of the string if there is
  // none).
  const std::size_t paren = signature.find('(');
  const std::size_t args_pos =
      (paren == std::string::npos) ? signature.size() : paren;

  // The name starts just after the last space preceding the arguments.
  // PRETTY_FUNCTION can return "virtual void poptorch::...." so there may be
  // several spaces before the name.
  std::size_t name_pos = 0;
  if (args_pos > 0) {
    const std::size_t last_space = signature.rfind(' ', args_pos - 1);
    if (last_space != std::string::npos) {
      name_pos = last_space + 1;
    }
  }

  return signature.substr(name_pos, args_pos - name_pos);
}

// Create a Tracepoint labelled with the name of the enclosing function
// (as extracted by formatPrettyFunction). The macro declares a local variable
// named `tp`, so it can be used at most once per scope and the tracepoint
// lasts until the end of that scope.
#define POPTORCH_TRACEPOINT()                                                  \
  poptorch::logging::Tracepoint tp {                                           \
    poptorch::logging::formatPrettyFunction(__PRETTY_FUNCTION__).c_str()       \
  }

// Same as POPTORCH_TRACEPOINT() but the label is followed by
// " (<debug_info>)". The macro expands to several statements and declares the
// locals `ss` and `tp` in the enclosing scope. It uses std::stringstream,
// which this header does not include: the including file must provide
// <sstream>.
#define POPTORCH_TRACEPOINT_WITH_DEBUG_INFO(debug_info)                        \
  std::stringstream ss;                                                        \
  ss << poptorch::logging::formatPrettyFunction(__PRETTY_FUNCTION__) << " ("   \
     << (debug_info) << ")";                                                   \
  poptorch::logging::Tracepoint tp { ss.str().c_str() }

} // namespace logging
} // namespace poptorch

#endif // SOURCE_INCLUDE_POPTORCH_TRACEPOINT_HPP


================================================
FILE: poptorch_logging/source/Error.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "poptorch_logging/Error.hpp"

#include <vector>

// This is a wrapper for boost::stacktrace without exposing Boost.
#include <gccs/StackTrace.hpp>

#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace logging {

namespace {
using Context = std::vector<std::string>;

// Return the calling thread's stack of context descriptions. The storage is
// thread_local, so each thread has its own, initially empty, stack.
Context &getContext() {
  static thread_local Context log_context{};
  return log_context;
}

std::string singleLineContext() {
  std::stringstream ss;
  std::string sep{};
  for (const auto &lvl : getContext()) {
    ss << sep << lvl;
    sep = " -> ";
  }
  return ss.str();
}

// Return the current stack trace as text, prefixed with "\nStacktrace:\n" so
// that it can be appended directly to an error message.
std::string getStackTrace() {
  std::stringstream out;
  // 3 to get out of gccs + getStackTrace + Error constructor
  constexpr size_t num_frames_to_skip = 3;
  // Presumably the maximum number of frames reported; see
  // gccs::getStackTrace for the exact meaning.
  constexpr size_t max_depth = 100;
  out << "\nStacktrace:\n"
      << gccs::getStackTrace(num_frames_to_skip, max_depth);
  return out.str();
}

} // namespace

// Shorten a source path so that it starts at its last "poptorch/" directory
// component, e.g. "/a/b/poptorch/source/X.cpp" -> "poptorch/source/X.cpp".
// The returned pointer points inside `filename`; the original pointer is
// returned unchanged when the path contains no "/poptorch/".
const char *shortPoptorchFilename(const char *filename) {
  const std::string path(filename);
  const std::size_t marker = path.rfind("/poptorch/");
  // Skip the leading '/' of the marker.
  return marker == std::string::npos ? filename
                                     : filename + marker + 1; // NOLINT
}

namespace detail {
// Private state of a LogContext.
struct LogContextImpl {
  LogContextImpl() = default;
  // True while this object has no entry of its own on the context stack.
  bool cleared{true};
  // Whether context start / end events should be logged. Evaluated once,
  // during static initialisation.
  static bool trace_enabled;
};

// Context tracing is only enabled when POPTORCH_LOG_LEVEL is exactly
// "TRACE_ALL".
bool LogContextImpl::trace_enabled = []() {
  const char *requested = std::getenv("POPTORCH_LOG_LEVEL");
  return requested != nullptr && std::string(requested) == "TRACE_ALL";
}();

// Data carried by an Error in addition to its what() string.
struct ErrorImpl {
  // Source file reported for the error (stored in shortened form by the
  // Error constructor).
  std::string file;
  // The message as passed to the Error constructor, without the stack trace.
  std::string message;
  // Line number passed to the Error constructor.
  uint64_t line;
};

} // namespace detail

Error::~Error() = default;

// Move constructor: the what() text is copied into the new runtime_error base
// and ownership of the implementation data is transferred from `e`.
Error::Error(Error &&e)
    : std::runtime_error(e.what()), _impl(std::move(e._impl)) {}

// Create an error with message `s` raised from `file`:`line`.
// what() returns the message followed by the current stack trace, whereas
// message() returns the message alone.
Error::Error(const char *s, const char *file, uint64_t line)
    : std::runtime_error(std::string(s) + getStackTrace()),
      _impl(std::make_unique<detail::ErrorImpl>()) {
  // Only keep the part of the path starting at the poptorch directory.
  _impl->file = logging::shortPoptorchFilename(file);
  _impl->line = line;
  _impl->message = s;
}

// The accessors below dereference _impl, which is empty in an Error that has
// been moved from.
const char *Error::message() const { return _impl->message.c_str(); }

const char *Error::file() const { return _impl->file.c_str(); }

uint64_t Error::line() const { return _impl->line; }

// Create a LogContext which doesn't own any entry of the context stack yet.
LogContext::LogContext() : _impl(std::make_unique<detail::LogContextImpl>()) {}

// Create a LogContext and immediately push `context` onto the calling
// thread's context stack.
LogContext::LogContext(const char *context) : LogContext() {
  updateContext(context);
}

// Replace the entry owned by this object: the previous one (if any) is
// released by clear() and `new_context` is pushed onto the calling thread's
// context stack. When context tracing is enabled the full stack is logged at
// trace level with a "Start" suffix.
void LogContext::updateContext(const std::string &new_context) {
  clear();
  getContext().push_back(new_context);
  _impl->cleared = false;
  if (detail::LogContextImpl::trace_enabled) {
    logging::trace("[{}] Start", singleLineContext());
  }
}

// Release the entry owned by this object, if any, by popping the top of the
// calling thread's context stack. Does nothing if the object is already
// cleared.
void LogContext::clear() {
  if (!_impl->cleared) {
    // Don't restore the saved context if we're handling an exception
    // we might want to recover the context later.
    if (std::uncaught_exceptions() == 0) {
      if (detail::LogContextImpl::trace_enabled && !getContext().empty()) {
        logging::trace("[{}] End", singleLineContext());
      }
      // Don't restore the saved context if the context has been cleared.
      if (!getContext().empty()) {
        getContext().pop_back();
      }
    }
    // The object is marked as cleared even if nothing was popped, so a later
    // call (e.g. from the destructor) will not pop the stack.
    _impl->cleared = true;
  }
}

LogContext::~LogContext() { clear(); }

// Render the calling thread's context stack as a NUL-terminated string with
// one "  [depth] entry" line per entry, the most recently pushed entry first
// (depth 0). Returns a null pointer when the stack is empty.
/* static */ std::unique_ptr<char[]> LogContext::context() {
  const auto &stack = getContext();
  if (stack.empty()) {
    return nullptr;
  }

  std::stringstream rendered;
  std::size_t depth = 0;
  for (auto entry = stack.rbegin(); entry != stack.rend(); ++entry, ++depth) {
    rendered << "  [" << depth << "] " << *entry << std::endl;
  }

  // Copy the text into a heap buffer owned by the caller.
  const std::string text = rendered.str();
  std::unique_ptr<char[]> buffer(new char[text.size() + 1]);
  text.copy(buffer.get(), text.size());
  buffer[text.size()] = '\0';
  return buffer;
}

// Drop every entry of the calling thread's context stack.
/* static */ void LogContext::resetContext() { return getContext().clear(); }
// Push `new_context` onto the calling thread's context stack. The entry is
// not owned by any LogContext object, so nothing pops it automatically.
/* static */ void LogContext::push(const char *new_context) {
  getContext().push_back(new_context);
}

} // namespace logging
} // namespace poptorch


================================================
FILE: poptorch_logging/source/Logging.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "poptorch_logging/Logging.hpp"

#include <spdlog/spdlog.h>

#include <spdlog/fmt/fmt.h>
#include <spdlog/sinks/ansicolor_sink.h>
#include <spdlog/sinks/basic_file_sink.h>
#include <spdlog/sinks/null_sink.h>
#include <spdlog/sinks/ostream_sink.h>

#include <iostream>
#include <string>

namespace poptorch {
namespace logging {

namespace {

// Check our enums match (in case spdlog changes under us).
// Only the first (Trace) and last (Off) values are checked.
static_assert(static_cast<spdlog::level::level_enum>(Level::Trace) ==
                  spdlog::level::trace,
              "Logging enum mismatch");
static_assert(static_cast<spdlog::level::level_enum>(Level::Off) ==
                  spdlog::level::off,
              "Logging enum mismatch");

// Translate to a spdlog log level. This is a plain cast: it relies on both
// enums using the same numeric values.
spdlog::level::level_enum translate(Level l) {
  return static_cast<spdlog::level::level_enum>(l);
}

// Stores the logging object needed by spdlog.
struct LoggingContext {
  // Creates the logger from the POPTORCH_LOG_DEST and POPTORCH_LOG_LEVEL
  // environment variables.
  LoggingContext();
  std::shared_ptr<spdlog::logger> logger;
  // Set by the constructor when POPTORCH_LOG_LEVEL is "DEBUG_IR".
  bool output_popart_ir{false};
  // Value returned by repeatLimit(); 4 unless changed by setRepeatLimit().
  std::uint64_t repeat_limit{4u};
};

// Return the process-wide logging state, creating it on first use.
LoggingContext &context() {
  // A function-local static avoids the static initialisation order fiasco,
  // but doesn't solve the deinitialisation order. Who logs in destructors
  // anyway?
  //
  // The instance is deliberately shared by every thread. A thread_local
  // instance would give each thread its own logger, level and repeat limit:
  // setLogLevel() / setRepeatLimit() would only affect the calling thread,
  // and with a file destination every new thread would try to register the
  // "graphcore" file logger again.
  static LoggingContext logging_context;
  return logging_context;
}

// Convert the textual value of POPTORCH_LOG_LEVEL into a Level.
//
// TRACE_ALL maps to Trace and DEBUG_IR maps to Debug; an empty string is
// treated like OFF. The comparison is case sensitive.
// Throws std::runtime_error for any other value.
Level logLevelFromString(const std::string &level) {
  struct NamedLevel {
    const char *name;
    Level value;
  };
  static const NamedLevel known_levels[] = {
      {"TRACE", Level::Trace}, {"TRACE_ALL", Level::Trace},
      {"DEBUG", Level::Debug}, {"DEBUG_IR", Level::Debug},
      {"INFO", Level::Info},   {"WARN", Level::Warn},
      {"ERR", Level::Err},     {"OFF", Level::Off},
      {"", Level::Off},
  };

  for (const NamedLevel &candidate : known_levels) {
    if (level == candidate.name) {
      return candidate.value;
    }
  }

  throw std::runtime_error(fmt::format(
      "Unknown POPTORCH_LOG_LEVEL '{}'. Valid values are TRACE_ALL, TRACE, "
      "DEBUG, DEBUG_IR, INFO, WARN, ERR and OFF.",
      level));
}

// Configure the colour used by `sink` for each log level from trace to err.
template <typename Mutex>
void setColours(spdlog::sinks::ansicolor_sink<Mutex> &sink) {
  // See https://en.wikipedia.org/wiki/ANSI_escape_code#Colors
  // Ansi colours make zero sense.
  // Escape sequence defined here because it is not taken from the sink's own
  // colour constants, unlike the other colours below.
  static const std::string bright_black = "\033[90m";

  sink.set_color(spdlog::level::trace, bright_black);
  sink.set_color(spdlog::level::debug, sink.cyan);
  sink.set_color(spdlog::level::info, sink.white);
  sink.set_color(spdlog::level::warn, sink.yellow_bold);
  sink.set_color(spdlog::level::err, sink.red_bold);
}

// Build the logger from the environment.
//
// POPTORCH_LOG_DEST selects the output: "stdout", "stderr" (the default), or
// any other value which is treated as a file name.
// POPTORCH_LOG_LEVEL selects the level (see logLevelFromString) and defaults
// to WARN when it is not set. The value DEBUG_IR additionally sets
// output_popart_ir.
//
// Throws std::runtime_error if the level is not recognised and rethrows
// spdlog::spdlog_ex if the log file cannot be opened.
LoggingContext::LoggingContext() {
  auto *poptorch_log_dest = std::getenv("POPTORCH_LOG_DEST");
  auto *poptorch_log_level = std::getenv("POPTORCH_LOG_LEVEL");

  // Get logging output from the POPTORCH_LOG_DEST environment variable.
  // The valid options are "stdout", "stderr", or if it is neither
  // of those it is treated as a filename. The default is stderr.
  const std::string log_dest =
      poptorch_log_dest != nullptr ? poptorch_log_dest : "stderr";
  const std::string log_level =
      poptorch_log_level != nullptr ? poptorch_log_level : "WARN";

  // Get logging level from OS ENV. The default level is WARN.
  Level default_level = logLevelFromString(log_level);

  if (log_dest == "stdout") {
    // The sink must actually be allocated: a default-constructed shared_ptr
    // is null and dereferencing it in setColours() is undefined behaviour.
    auto sink = std::make_shared<spdlog::sinks::ansicolor_stdout_sink_mt>();
    setColours(*sink);
    logger = std::make_shared<spdlog::logger>("graphcore", sink);
  } else if (log_dest == "stderr") {
    auto sink = std::make_shared<spdlog::sinks::ansicolor_stderr_sink_mt>();
    setColours(*sink);
    logger = std::make_shared<spdlog::logger>("graphcore", sink);
  } else {
    try {
      logger = spdlog::basic_logger_mt("graphcore", log_dest, true);
    } catch (const spdlog::spdlog_ex &e) {
      // Report the problem on stderr as well, since the logger itself could
      // not be created.
      std::cerr << "Error opening log file: " << e.what() << std::endl;
      throw;
    }
  }

  logger->set_pattern("%^[%T.%e] [poptorch:cpp] [%l] %v%$");
  logger->set_level(translate(default_level));
  output_popart_ir = log_level == "DEBUG_IR";
}

} // namespace

// True if POPTORCH_LOG_LEVEL was DEBUG_IR when the logging context was
// created, or if trace level logging is currently enabled.
bool outputPopartIR() {
  return context().output_popart_ir || shouldLog(Level::Trace);
}

// Current repeat limit stored in the logging context.
std::uint64_t repeatLimit() { return context().repeat_limit; }

// Overwrite the repeat limit stored in the logging context.
void setRepeatLimit(std::uint64_t limit) { context().repeat_limit = limit; }

// The functions below forward to the spdlog logger owned by the logging
// context, translating the level where needed.
void log(Level l, const char *msg) { context().logger->log(translate(l), msg); }

bool shouldLog(Level l) { return context().logger->should_log(translate(l)); }

void setLogLevel(Level l) { context().logger->set_level(translate(l)); }

void flush() { context().logger->flush(); }

} // namespace logging
} // namespace poptorch


================================================
FILE: poptorch_logging/source/Tracepoint.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include "poptorch_logging/Tracepoint.hpp"

#include <pvti/pvti.hpp>

#include "poptorch_logging/Error.hpp"

namespace poptorch {

namespace logging {

namespace detail {

// Implementation of poptorch::logging::Tracepoint: a pvti tracepoint on the
// shared "poptorch" channel combined with a LogContext carrying the same
// label, so both cover the lifetime of the object.
class TracepointImpl : public pvti::Tracepoint {
public:
  explicit TracepointImpl(const std::string &label_)
      : pvti::Tracepoint(&TracepointImpl::channel, label_), ctx(label_) {}
  ~TracepointImpl() = default;
  // Trace channel shared by every tracepoint created by this library.
  static pvti::TraceChannel channel;
  // Log context entry labelled like the tracepoint.
  LogContext ctx;
};

pvti::TraceChannel TracepointImpl::channel = {"poptorch"};
} // namespace detail

// Create the implementation object; `label` is copied into a std::string.
Tracepoint::Tracepoint(const char *label)
    : _impl(std::make_unique<detail::TracepointImpl>(std::string(label))) {}

// Forward to pvti on the "poptorch" channel. Unlike the constructor, no
// LogContext is created here.
void Tracepoint::begin(const char *label) {
  pvti::Tracepoint::begin(&detail::TracepointImpl::channel, label);
}

// Forward to pvti on the "poptorch" channel.
void Tracepoint::end(const char *label) {
  pvti::Tracepoint::end(&detail::TracepointImpl::channel, label);
}

// Defined here, where TracepointImpl is a complete type, so that the
// unique_ptr member can be destroyed.
Tracepoint::~Tracepoint() = default;

} // namespace logging
} // namespace poptorch


================================================
FILE: poptorch_third_party_licenses.txt
================================================
The PopTorch package includes the following third party software:

pybind11
--------
Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors
   may be used to endorse or promote products derived from this software
   without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Please also refer to the file
https://github.com/pybind/pybind11/blob/master/.github/CONTRIBUTING.md, which
clarifies licensing of external contributions to this project including patches,
pull requests, etc.


spdlog
--------
The MIT License (MIT)

Copyright (c) 2016 Gabi Melman.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

-- NOTE: Third party dependency used by this software --
This software depends on the fmt lib (MIT License),
and users must comply to its license: https://github.com/fmtlib/fmt/blob/master/LICENSE.rst


================================================
FILE: pyproject.toml
================================================
[build-system]
requires = [
  "python_version>=3.7",
  "setuptools>=42",
  "wheel",
  "pybind11>=2.8.0",
  "@TORCH_DEPENDENCY@",
]
build-backend = "setuptools.build_meta"

[tool.pytest.ini_options]
# Required to suppress a warning from the package `ruamel` using a deprecated pkg_resources function.
filterwarnings = [
    "ignore::DeprecationWarning:pkg_resources.*",
    # Deprecation warnings from pillow in torchvision.
    "ignore:.*Pillow.*:DeprecationWarning:torchvision",
]


================================================
FILE: python/CMakeLists.txt
================================================
include(GNUInstallDirs)
file(GLOB python_files "${CMAKE_CURRENT_SOURCE_DIR}/*.py")

# __init__.py needs to be edited by set_version.py so don't copy it over.
list(REMOVE_ITEM python_files "${CMAKE_CURRENT_SOURCE_DIR}/__init__.py")

run_poptorch_install_command("python3 ${PROJECT_SOURCE_DIR}/scripts/set_version.py --torch-version ${TORCH_VERSION} ${CMAKE_CURRENT_BINARY_DIR}/__init__.py" "${PROJECT_SOURCE_DIR}" "Generate __init__.py")
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/__init__.py DESTINATION "${INSTALL_PYDIR}")
install(FILES ${python_files}  py.typed DESTINATION "${INSTALL_PYDIR}")

# Compile the Pybind11 module using setup.py (called by generate_python_package.py)
run_poptorch_install_command(
  "python3 ${PROJECT_SOURCE_DIR}/scripts/generate_python_package.py install --include-dir ${CMAKE_INSTALL_PREFIX}/include --lib-dir ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR} --output-dir ${CMAKE_INSTALL_PREFIX} --python-dir ${INSTALL_PYDIR}" "${PROJECT_SOURCE_DIR}" "poptorch_core.so module compilation")


================================================
FILE: python/__init__.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import abc
import atexit
import copy
import copyreg
import functools
import importlib
import os
from typing import Any, Callable, Dict, Iterator, Optional, Union, Type, Sequence, Iterable
import pickle
import pkg_resources
import torch

# These are needed before the assert
# pylint: disable=wrong-import-order
from . import _logging
from ._logging import logger
# pylint: enable=wrong-import-order

# Refuse to run against an unexpected torch build. "@TORCH_VERSION@" is a
# placeholder, presumably replaced with the real version when this file is
# generated at install time.
assert torch.__version__.startswith("@TORCH_VERSION@"), (
    "This version"
    " of PopTorch only works with torch==@TORCH_VERSION@ but the version "
    f"installed is {torch.__version__}")

# Forking the parent process after the RDMA driver has been initialised makes
# the driver hang on POD systems. This typically happens when a PyTorch
# Dataloader spawns its workers, so safe fork is enabled explicitly unless the
# user already set the variable.
os.environ.setdefault("RDMAV_FORK_SAFE", "1")

# Import the compiled extension module. A failure is re-raised with a hint
# about the most likely cause; the original ImportError is chained so the
# underlying reason stays visible.
try:
    import poptorch.poptorch_core as poptorch_core  # type: ignore
except ImportError as e:
    raise ImportError("Unable to import PopTorch, this can be caused by "
                      "attempting to import PopTorch without an active Poplar "
                      "SDK.\n  The SDK can be enabled by running: "
                      "`source /path/to/poplar-sdk/enable`") from e

# pylint: disable=wrong-import-position
from poptorch.poptorch_core import Error, RecoverableError, UnrecoverableError, importPoptorchMetadataFromFile
from . import _dataloader
from . import _impl
from . import _poptorch_data
from . import _utils
from .enums import *
from .ops import *
from .options import *
from ._impl import isRunningOnIpu, createPoptorchError
from ._utils import accessAttributes, getIpuTensorId
from ._poplar_executor import PoplarExecutor, registerPreCompileHook, registerPostCompileHook, _OverwriteContextManager
from ._printing import *
from . import optim
from . import profiling
# pylint: enable=wrong-import-position

# "@VERSION@" and "@SNAPSHOT@" are placeholders, presumably substituted when
# this file is generated at install time.
__version__ = "@VERSION@-@SNAPSHOT@"

# Use package discovery to pass the true filesystem path of the installed python
# package to C++. The path could later be used to pre-compile custom codelets
# on demand.
poptorch_core.setCustomCodeletsPath(
    pkg_resources.resource_filename("poptorch", ""))


@atexit.register
def poptorchAtExit() -> None:
    """Call ``poptorch_core.poptorchAtExit`` when the interpreter exits.

    Registered with :py:mod:`atexit` at import time.
    """
    poptorch_core.poptorchAtExit()


def load(filename: str,
         edit_opts_fn: Optional[Callable[['poptorch.Options'], None]] = None
         ) -> 'poptorch.PoplarExecutor':
    """Restore a PopTorch model from a file previously created using
    :py:meth:`~poptorch.PoplarExecutor.compileAndExport`

    :param filename: Path of the file to load the model from.
    :param edit_opts_fn: Function to edit the options before the model
        is restored. For example to attach to a specific IPU device.
    :returns: A training or inference executor (depending on how the model
        was exported) with the executable from ``filename`` loaded.

    >>> model = poptorch.inferenceModel(model)
    >>> model.compileAndExport("my_model.poptorch")
    ...
    >>> model = poptorch.load("my_model.poptorch")
    >>> model(my_input)
    """
    metadata = importPoptorchMetadataFromFile(filename)

    try:
        restored = _poptorch_data.parse(metadata, __version__)
    except AssertionError as e:
        raise AssertionError("Invalid file %s: %s" % (filename, e)) from e

    assert restored.model and restored.training is not None, (
        f"{filename} is a valid PopTorch file but was created"
        " with 'export_model=False' which means you need to re-create"
        " the PopTorch model using poptorch.inferenceModel or "
        "poptorch.trainingModel then call "
        f"poptorch_model.loadExecutable(\"{filename}\").")

    # Give the caller a chance to adjust the options before they are used.
    if edit_opts_fn:
        edit_opts_fn(restored.options)

    if restored.optimizer_state is not None:
        assert restored.optimizer is not None
        restored.optimizer.load_state_dict(restored.optimizer_state)

    # The model may look wrapped without being in _impl._wrapper_types because
    # it has been loaded in a new session: unwrap it manually in that case.
    model_cls = restored.model.__class__
    wrapped_marker = ("poptorch._poplar_executor."
                      "PoplarExecutor.__init__.<locals>.PoptorchModel'>")
    if wrapped_marker in str(model_cls):
        restored.model.__class__ = model_cls.__bases__[0]

    if restored.training:
        executor = trainingModel(restored.model, restored.options,
                                 restored.optimizer)
    else:
        executor = inferenceModel(restored.model, restored.options)
    executor.loadExecutable(filename)

    # Only restore the random state which was actually saved.
    for attribute in ("random_seed", "rng_state"):
        saved = getattr(restored, attribute)
        if saved is not None:
            setattr(executor, attribute, saved)
    return executor


class _SubDataset:
    """View of the part of a dataset used by one process in a distributed
    execution.

    The dataset is split into serial blocks of tensors: all the tensors used
    by process 0, followed by all the tensors used by process 1, and so on.

    [p0, p0, p0, ..., p1, p1, p1, ..., p2,p2, p2]

    If shuffling is used, then the indices in the parent (entire) dataset are
    randomised and ``swap_range`` will be called every time a new iterator
    is created in order to make sure all the tensors get used.
    """

    def __init__(self, dataset, opts, step, drop_last):
        """
        :param dataset: The entire dataset.
        :param opts: Options providing ``Distributed.numProcesses``,
            ``Distributed.processId`` and optionally ``random_seed``.
        :param step: When ``drop_last`` is set, the number of elements used
            by each process is a multiple of this value.
        :param drop_last: If True every process gets the same number of
            elements and the remainder is left over, otherwise all the
            elements are distributed as evenly as possible.
        """
        total = len(dataset)
        num_procs = opts.Distributed.numProcesses
        proc_id = opts.Distributed.processId
        # Note: all the processes must have the same number of batches
        # or it will hang.
        if drop_last:
            share = step * (total // (step * num_procs))
            self._offset = proc_id * share
            self._length = min(share, total - self._offset)
            self._leftovers = total % share
        else:
            # The user explicitly asked to keep the left over elements:
            # spread them evenly across the first processes and let the user
            # take care of padding the tensors.
            base, remainder = divmod(total, num_procs)
            shares = [base + (remainder > proc) for proc in range(num_procs)]
            self._offset = sum(shares[:proc_id])
            self._length = shares[proc_id]
            self._leftovers = 0

        self._base_offset = self._offset
        self._dataset = dataset
        if opts.exists('random_seed'):
            self._seed = opts.random_seed
        else:
            self._seed = None
        self._shuffling_generator_state = None
        self._shuffled_global_indices = None

    def shuffle_global_indices(self):
        """Shuffles the indices across the entire dataset."""
        rng = torch.Generator()
        if self._shuffling_generator_state is not None:
            # Carry on from where the previous shuffle stopped.
            rng.set_state(self._shuffling_generator_state)
        else:
            assert self._seed is not None, (
                "Seed must be set when shuffling so that all "
                "instances end up with the same shuffled global indices.")
            rng.manual_seed(self._seed)
        permutation = torch.randperm(len(self._dataset), generator=rng)
        # Use shared memory so that the workers' indices
        # also get shuffled.
        if self._shuffled_global_indices is not None:
            self._shuffled_global_indices.copy_(permutation)
        else:
            self._shuffled_global_indices = permutation.share_memory_()
        self._shuffling_generator_state = rng.get_state()

    def swap_range(self):
        """If there are leftovers in the randomly sampled dataset make sure
        they get included in the next iteration.

        For example if we've got: T = N * B + L
        T = total number of tensors
        N = number of full batches in T
        B = batch size
        L = Number of left over tensors

        First the dataset will return the tensors in [0, T-L]
        after ``swap_range`` was called the dataset will return tensors in
        [L, T]
        """
        # Toggle between the base offset and the offset shifted by the
        # number of left over elements.
        currently_at_base = self._offset == self._base_offset
        if currently_at_base:
            self._offset = self._base_offset + self._leftovers
        else:
            self._offset = self._base_offset

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        position = index + self._offset
        if self._shuffled_global_indices is None:
            return self._dataset[position]
        return self._dataset[self._shuffled_global_indices[position]]


def _batch_sampler_len(
        batch_sampler: Union[torch.utils.data.
                             Sampler[Sequence], Iterable[Sequence]]):
    """Return the number of batches produced by ``batch_sampler`` or ``None``
    if it cannot be determined.

    :param batch_sampler: A sampler or any iterable yielding batches of
        indices.
    :returns: ``len(batch_sampler)`` if the object has a usable ``__len__``,
        ``None`` otherwise.
    """
    if not hasattr(batch_sampler, "__len__"):
        return None
    try:
        return len(batch_sampler)
    except (NotImplementedError, TypeError):
        # NotImplementedError: ``__len__`` is declared but not implemented.
        # TypeError: ``__len__`` returned a non-integer value such as
        # ``NotImplemented``; ``len()`` rejects it before any return value
        # could be inspected.
        return None


class DataLoader(torch.utils.data.DataLoader):
    """ Thin wrapper around the traditional `torch.utils.data.DataLoader` to
    abstract away some of the batch sizes calculations.

    If this data loader is used in a distributed execution environment, it will
    ensure that each process uses a different subset of the dataset, providing
    you first call ``options.randomSeed(N)`` with an integer N which is the same
    across all hosts.
    """

    def __init__(
            self,
            options: 'poptorch.Options',
            dataset: 'torch.utils.data.Dataset',
            batch_size: Optional[int] = 1,
            shuffle: Optional[bool] = None,
            num_workers: int = 0,
            drop_last: bool = True,
            persistent_workers: Optional[bool] = None,
            auto_distributed_partitioning: bool = True,
            mode: 'poptorch.DataLoaderMode' = DataLoaderMode.Sync,
            async_options: Optional[Dict[str, Any]] = None,
            rebatched_worker_size: Optional[int] = None,
            batch_sampler: Optional[Union[torch.utils.data.Sampler[Sequence],
                                          Iterable[Sequence]]] = None,
            **kwargs):
        """
        :param options: Options that will be used to compile
            and run the model.
        :param dataset: The dataset to get the data from.
        :param batch_size: This is the batch size in the conventional sense
            of being the size that runs through an operation in the model at
            any given time.
        :param shuffle: Whether or not the dataset should be shuffled.
        :param num_workers: Number of worker processes to use to read the
            data.
        :param drop_last: If True and the number of elements in the
            dataset is not a multiple of the combined batch size then the
            incomplete batch at the end will be dropped.
        :param persistent_workers: Re-use workers between
            iterations if True.
        :param auto_distributed_partitioning: If True, partitions the
            dataset for distributed execution automatically. Otherwise, it is
            assumed that partitioning has been handled manually.
        :param mode: If `DataLoaderMode.Async`, uses an
            :py:class:`~poptorch.AsynchronousDataAccessor` to access the
            dataset. If `DataLoaderMode.Sync`, accesses the dataset
            synchronously.
        :param async_options: Options to pass to
            :py:class:`~poptorch.AsynchronousDataAccessor`.
        :param rebatched_worker_size: When using AsyncRebatched: batch
            size of the tensors loaded by the workers.
            Default to the combined batch size.
            If specified the ``rebatched_worker_size`` must be less than
            or equal to the combined batch size.
        :param batch_sampler: Defines the strategy to draw samples from the
            dataset. Returns a batch of indices at a time. Mutually exclusive
            with `batch_size`, `shuffle`.
        :param kwargs: Other options to pass to PyTorch's ``DataLoader``
            constructor.
        """
        # Set _accessor to None first in case something goes wrong below and
        # terminate() is called (for example from __del__) on a partially
        # initialised instance.
        self._accessor = None

        self._is_user_batch_sampler_set = batch_sampler is not None

        if self._is_user_batch_sampler_set:
            if batch_size != 1 or shuffle:
                raise createPoptorchError(
                    '`batch_sampler` option is mutually '
                    'exclusive with batch_size, shuffle.')
            if options.Distributed.numProcesses > 1 and \
                    auto_distributed_partitioning:
                raise createPoptorchError(
                    '`batch_sampler` option is mutually '
                    'exclusive with auto_distributed_partitioning=True.')
            if hasattr(batch_sampler, "batch_size"):
                batch_size = batch_sampler.batch_size
            # The parent constructor receives drop_last=None: the user's
            # choice is kept here and applied when the batches are combined
            # in __iter__.
            self.batch_sampler_drop_last = drop_last
            drop_last = None
        else:
            if shuffle is None:
                shuffle = False

        assert isinstance(options, Options)
        options._freeze()  # pylint: disable=protected-access
        # Always keep a reference to the options (including when
        # batch_size is None) so that the ``options`` property works.
        self._options = options
        if persistent_workers is None:
            persistent_workers = num_workers > 0

        self._combined_batch_size: Optional[int]
        self._num_batches_to_combine: Optional[int]

        if batch_size is None:
            self._combined_batch_size = None
            self._num_batches_to_combine = None
        else:
            # Number of conventional batches consumed by a single execution
            # of the model.
            input_group_count = options.replication_factor // \
                                options.input_group_size
            self._num_batches_to_combine = options.device_iterations * \
                input_group_count * \
                options.Training.gradient_accumulation

            self._combined_batch_size = batch_size * \
                self._num_batches_to_combine

        # Iterable datasets need to be handled differently: they don't have
        # __getitem__ and __len__
        self._is_iterable = isinstance(dataset,
                                       torch.utils.data.IterableDataset)
        self._shuffle_map_style_data_in_distributed_env = False

        if self._is_iterable:
            if auto_distributed_partitioning:
                assert options.Distributed.numProcesses == 1, (
                    "auto_distributed_partitioning not supported for"
                    " IterableDataset")
            if num_workers > 1 and "worker_init_fn" not in kwargs:
                logger.warning(
                    "IterableDataset used with num_workers="
                    "%d but no worker_init_fn specified: as a result"
                    " the DataLoader will return %d times each element"
                    " in the dataset (See torch.utils.data.IterableDataset's"
                    " documentation for more information)", num_workers,
                    num_workers)

        else:
            num_elts = len(dataset)
            # Warn the user if the last tensor is going to be incomplete.
            if not drop_last:
                if self._is_user_batch_sampler_set:
                    batch_sampler_len = _batch_sampler_len(batch_sampler)

                    if batch_sampler_len is not None:
                        num_incomplete_batches = batch_sampler_len % \
                            self._num_batches_to_combine

                        if num_incomplete_batches != 0:
                            logger.warning(
                                "The number of batches generated by the batch"
                                " sampler (%d) is not divisible by the number"
                                " of batches elements processed per step (%d)"
                                " and drop_last=False. The last tensor will"
                                " have a batch size of %d. To avoid having to "
                                " handle this special case switch to "
                                " drop_last=True. Batch size = %d,"
                                " combined batch size = %d .",
                                batch_sampler_len,
                                self._num_batches_to_combine,
                                num_incomplete_batches * batch_size,
                                batch_size, self._combined_batch_size)
                    else:
                        logger.warning(
                            "The `batch_sampler` __len__ method is not"
                            " implemented and drop_last=False. The last tensor"
                            " may be incomplete - batch size < %d. To avoid"
                            " having to handle this special case switch to"
                            " drop_last=True.", self._num_batches_to_combine)
                elif self._combined_batch_size is not None and \
                    num_elts % (self._combined_batch_size *
                                options.Distributed.numProcesses) != 0:
                    logger.warning(
                        "The number of elements in the dataset "
                        "(%d) is not divisible by the number of"
                        " elements processed per step (%d)"
                        " and drop_last=False. The last tensor will have "
                        "a batch size of %d. To avoid having to handle "
                        "this special case switch to drop_last=True", num_elts,
                        self._combined_batch_size *
                        options.Distributed.numProcesses,
                        num_elts % (self._combined_batch_size *
                                    options.Distributed.numProcesses))

            if options.Distributed.numProcesses > 1:
                if auto_distributed_partitioning:
                    assert not shuffle or options.exists("random_seed"), (
                        "When using auto_distributed_partitioning you must set "
                        "poptorch.Options.randomSeed() to ensure that tensors "
                        "are in the same order in all processes.")
                    assert self._combined_batch_size is not None, (
                        "batch_size=None not allowed when using "
                        "auto_distributed_partitioning.")

                    dataset = _SubDataset(dataset, options,
                                          self._combined_batch_size, drop_last)
                    if shuffle:
                        # In a distributed environment we handle the shuffling
                        # ourselves (take a look at _SubDataset and __iter__)
                        # so no need for parent class to shuffle within each of
                        # the subsets again.
                        self._shuffle_map_style_data_in_distributed_env = True
                        shuffle = False
        if not self._is_iterable:
            dataset = profiling.Channel("dataset").instrument(
                dataset, "__getitem__")

        rebatched_size = None
        dataset_batch_size = 1 if self._is_user_batch_sampler_set \
                                else self._combined_batch_size

        # ``drop_last`` was reset to None above if a batch sampler was
        # provided: recover the value requested by the user.
        if self._is_user_batch_sampler_set:
            real_drop_last = self.batch_sampler_drop_last
        else:
            real_drop_last = drop_last
        cbs_is_gt_one = self._combined_batch_size is not None and \
            self._combined_batch_size > 1
        # In Async mode leftovers can only be returned if we rebatch.
        async_mode_with_remainder = mode == DataLoaderMode.Async and \
            not real_drop_last and cbs_is_gt_one
        if mode == DataLoaderMode.AsyncRebatched or async_mode_with_remainder:
            mode = DataLoaderMode.Async
            rebatched_size = self._combined_batch_size
            # When we rebatch: always let the worker process handle the
            # leftovers instead of the Dataloader.
            self.rebatched_drop_last = drop_last
            drop_last = False
            if rebatched_worker_size is not None:
                assert rebatched_worker_size <= self._combined_batch_size, (
                    f"The rebatched_worker_size ({rebatched_worker_size})"
                    " must be <= to the combined batch size ("
                    f"{self._combined_batch_size})")
                dataset_batch_size = rebatched_worker_size

        super().__init__(dataset,
                         batch_size=dataset_batch_size,
                         shuffle=shuffle,
                         batch_sampler=batch_sampler,
                         num_workers=num_workers,
                         drop_last=drop_last,
                         persistent_workers=persistent_workers,
                         **kwargs)

        if mode == DataLoaderMode.Async:
            async_options = async_options or {}
            assert "rebatched_size" not in async_options, (
                "You cannot "
                "use DataLoaderMode.AsyncRebatched and manually specify"
                " the rebatched_size in async_options")
            self._accessor = AsynchronousDataAccessor(
                self, **async_options, rebatched_size=rebatched_size)

    def __len__(self) -> int:
        """Number of tensors returned when iterating over this loader."""
        # If we're rebatching in the AsynchronousDataAccessor we need to
        # adjust the dataset's length.
        if self._accessor is not None and self._accessor.rebatched_size:
            num_elts = len(self.dataset)
            dataset_len = num_elts // self._accessor.rebatched_size
            if not self.rebatched_drop_last and \
                    num_elts % self._accessor.rebatched_size:
                # Round up
                dataset_len += 1
        else:
            dataset_len = super().__len__()
        return dataset_len

    @property
    def _profiling(self):
        return profiling.Channel("poptorch.DataLoader")

    @property
    def combinedBatchSize(self) -> Optional[int]:
        """Total number of elements consumed from the dataset for a single
        execution of the model."""
        return self._combined_batch_size

    @property
    def options(self) -> 'poptorch.Options':
        """A reference to the options that were used to initialise this
           instance.
        """
        return self._options

    def terminate(self) -> None:
        """If `mode==DataLoaderMode.Async`, kills the worker process in the
        underlying :py:class:`~poptorch.AsynchronousDataAccessor` manually,
        otherwise has no effect.
        """
        if self._accessor is not None:
            self._accessor.terminate()

    def __del__(self) -> None:
        self.terminate()

    def __iter__(self) -> "torch.utils.data.dataloader._BaseDataLoaderIter":
        if self._shuffle_map_style_data_in_distributed_env:
            # Shuffling was taken away from the parent class in __init__:
            # reshuffle the global indices and alternate the range so that
            # leftovers are used in the next iteration.
            self.dataset.shuffle_global_indices()
            self.dataset.swap_range()
        if self._accessor is not None:
            return self._accessor.__iter__()

        if self._is_user_batch_sampler_set and \
            self._num_batches_to_combine != 1:
            # The parent class yields one sampler batch at a time: combine
            # them to get the tensor needed for one execution of the model.
            return _utils.combined_batch_generator(
                super().__iter__(), self._num_batches_to_combine,
                self.batch_sampler_drop_last)

        return super().__iter__()


class AsynchronousDataAccessor:
    """A data loader which launches the data loading process in a separate
    worker to allow for the data to be preprocessed asynchronously on CPU to
    minimise CPU/IPU transfer time.

    This works by loading the data into a ring buffer of shared memory.
    When the IPU needs another batch it uses the data ready in
    the ring buffer. The memory is shared so will be used in-place and
    won't be freed until the next batch is requested. Behind the scenes
    the worker will be filling the unready elements of the ring
    buffer.

    .. note:: When using a ``torch.utils.data.Dataset`` with ``rebatched_size``
        the accessor will default to ``drop_last=True``, to change that
        behaviour wrap the dataset into a
        ``poptorch.DataLoader(..., drop_last=False)``.
    """

    def __init__(
            self,
            dataset: Union['torch.utils.data.Dataset', DataLoader],
            buffer_size: int = 3,
            miss_sleep_time_in_ms: float = 0.1,
            load_indefinitely: bool = True,
            early_preload: bool = False,
            sharing_strategy: 'poptorch.SharingStrategy' = SharingStrategy.
            ForkServer,
            rebatched_size: Optional[int] = None):
        """
        :param dataset: The dataset to pull data from, this can be any Python
            iterable.
        :param buffer_size: The size of the ring buffer.
        :param miss_sleep_time_in_ms: When the buffer is full how long should
            we sleep the worker before checking again.
        :param load_indefinitely: If True when we hit the end of the dataset
            we will just loop round again.
        :param early_preload: If True, start loading data in the ring buffer
            as soon as the worker is created.
            If False, wait for an iterator to be created before loading data.
        :param sharing_strategy:
            Method to use to pass the dataset object when the child process
            is created.

            * `SharedMemory` is fast but might be quite limited in size.
            * `FileSystem` will serialise the dataset to file and reload it
              which will be slower.
            * `Fork` new processes: no data sharing required but might cause
              problems if worker processes use threading.
            * `ForkServer` is similar to `Fork` but uses a server process to
              fork child processes. It is safe to use even if worker processes
              use threading.

        :param rebatched_size: If not None: return N batched tensors from
            the dataset per iteration. (The passed dataset must have a
            batch_size of 1).

        .. note :: If dataset is an iterable-type ``poptorch.DataLoader``
            configured with ``drop_last=False`` then ``rebatched_size``
            must be used.
        """
        # Set _worker to None  in case something goes wrong and terminate is called
        self._worker = None

        # Ensure the DataLoader doesn't already have an AsynchronousDataAccessor
        if isinstance(dataset, DataLoader) and dataset._accessor is not None:
            raise createPoptorchError(
                "The DataLoader already uses an "
                "AsynchronousDataAccessor internally. Either use "
                "the existing one or set mode='poptorch.DataLoaderMode.Sync'"
                " in the DataLoader.")

        if isinstance(dataset, DataLoader) and \
           not dataset._is_user_batch_sampler_set and \
           not dataset.drop_last and \
           rebatched_size is None:
            # Otherwise we'll end up with one left over tensor per worker
            # to return to the main process and we don't currently
            # support that.
            assert dataset.combinedBatchSize is None or \
                   dataset.combinedBatchSize == 1, (
                       "The 'drop_last=False' option from the DataLoader only "
                       "works if 'rebatched_size' is specified too.")
        if rebatched_size is not None:
            assert rebatched_size > 1, ("rebatched_size"
                                        " must be None or greater than 1")

        self._dataset = dataset
        # To avoid hangs when the application exits: implicitly call terminate().
        atexit.register(self.terminate)
        self.rebatched_size = rebatched_size
        self._worker = _dataloader.AsynchronousWorker(
            buffer_size, miss_sleep_time_in_ms, dataset, load_indefinitely,
            early_preload, sharing_strategy, rebatched_size)

    def terminate(self) -> None:
        """
        An override function to kill the worker process manually.
        """
        if self._worker is not None:
            self._worker.terminate()
            self._worker = None

    def __del__(self) -> None:
        self.terminate()

    def __len__(self) -> int:
        """Number of elements returned per iteration over the dataset.

        If ``rebatched_size`` is set the length of the dataset is converted
        to a number of rebatched tensors (incomplete ones are not counted).
        """
        dataset_len = len(self._dataset)
        # If this AsynchronousDataAccessor is embedded in a DataLoader then the dataset
        # length has already been adjusted.
        if self.rebatched_size and getattr(self._dataset, "_accessor",
                                           None) is not self:
            # The dataset can be any Python iterable: if it has no
            # ``batch_size`` attribute (i.e. it is not a DataLoader) then
            # each item it returns counts as a single element.
            items_per_element = getattr(self._dataset, "batch_size", 1)
            num_elts = dataset_len * items_per_element
            dataset_len = num_elts // self.rebatched_size
        return dataset_len

    def __iter__(self) -> 'poptorch.AsynchronousDataAccessor':
        assert self._worker is not None
        self._worker.resetIterator()
        return self

    def __next__(self) -> Any:
        # We return shared memory to the user so we can't tell the worker to
        # refill it until the next item is requested.
        assert self._worker is not None
        self._worker.releaseElement()
        while not self._worker.endOfFile():
            data = self._worker.acquireElementIfAvailable()
            if data is not None:
                return data
            # Nothing available yet: make sure the worker is still healthy
            # before polling again.
            self._worker.assertNoError()
        # EOF event
        raise StopIteration


def trainingModel(model: Union['torch.nn.Module', 'poptorch.PoplarExecutor'],
                  options: Optional['poptorch.Options'] = None,
                  optimizer: Optional['torch.optim.Optimizer'] = None
                  ) -> 'poptorch.PoplarExecutor':
    """Wrap a PyTorch model so that it can be trained on IPU hardware.

    .. note:: PopTorch makes a shallow copy of the model and wraps the original
            model to facilitate weight synchronisation. Changes to the
            parameters in the returned training model affect the original
            model and vice versa. However, primitive variable types are not
            synced. For example calling ``model.train()`` on the original
            model, which changes the ``training`` bool of the model instance,
            will not alter the model returned by this function. You may need
            to call ``model.train()`` on your model before you call this
            function for correct behaviour.

    .. note:: To restore a model use
        :py:meth:`~poptorch.PoplarExecutor.destroy`.
        You will need to do this first if you need to call this function again
        on the same instance.

    :param model: The PyTorch model to wrap. If a
        :py:class:`~poptorch.PoplarExecutor` is passed, the user model it
        wraps is used instead.
    :param options: The IPU specific options
    :param optimizer: The optimizers to apply during training.

        Supported PyTorch optimizers: ``optim.SGD``, ``optim.Adam``,
        ``optim.AdamW``, ``optim.RMSprop``.

        Supported PopTorch optimizers: :py:class:`~poptorch.optim.SGD`,
        :py:class:`~poptorch.optim.Adam`,
        :py:class:`~poptorch.optim.AdamW`,
        :py:class:`~poptorch.optim.RMSprop`.
        :py:class:`~poptorch.optim.LAMB`.

    :returns: The :py:class:`~poptorch.PoplarExecutor` wrapper to use in place
        of ``model``.
    :raises RuntimeError: If the model has already been wrapped for training.
    """
    # Unpack the user's model if we were handed an executor.
    user_model = model
    if isinstance(user_model, PoplarExecutor):
        user_model = user_model._user_model  # pylint: disable=protected-access

    # A model can only be wrapped once at a time.
    if _impl.isWrapped(user_model):
        raise RuntimeError("Model has already been wrapped in "
                           "'poptorch.trainingModel'. Call model.destroy() "
                           "on the model to unwrap before wrapping again.")

    # The executor works on a shallow copy in case the model needs to be
    # wrapped; the original is passed separately as ``user_model``.
    shallow_copy = copy.copy(user_model)

    return PoplarExecutor(model=shallow_copy,
                          options=options,
                          training=True,
                          optimizer=optimizer,
                          user_model=user_model,
                          poptorch_version=__version__)


def inferenceModel(model: Union['torch.nn.Module', 'poptorch.PoplarExecutor'],
                   options: Optional['poptorch.Options'] = None
                   ) -> 'poptorch.PoplarExecutor':
    """Wrap a PyTorch model so that it can run inference on IPU hardware.

    .. note:: PopTorch makes a shallow copy of the model. Changes to the
        parameters in the returned inference model affect the original model
        and vice versa. However, primitive variable types are not synced: for
        example calling ``model.eval()`` on the original model will not alter
        the model returned by this function. You may need to call
        ``model.eval()`` on your model before you call this function for correct
        behaviour.

    :param model: The PyTorch model to wrap. If a
        :py:class:`~poptorch.PoplarExecutor` is passed, the user model it
        wraps is used instead.
    :param options: The IPU specific options
    :returns: The :py:class:`~poptorch.PoplarExecutor` wrapper to use in place
        of ``model``.
    """
    # Unpack the user's model if we were handed an executor.
    user_model = model
    if isinstance(user_model, PoplarExecutor):
        user_model = user_model._user_model  # pylint: disable=protected-access

    shallow_copy = copy.copy(user_model)
    return PoplarExecutor(model=shallow_copy,
                          options=options,
                          training=False,
                          poptorch_version=__version__)


def ipuHardwareIsAvailable(num_ipus: int = 1) -> bool:
    """Check whether IPU hardware with `num_ipus` is present in the system.

    Note: This function doesn't check if the IPU is free or already being used.

    :param num_ipus: The number of IPUs required.
    :returns: True if physical IPUs are available, False otherwise.
    """
    # A hardware version of 0 means no matching hardware was found.
    detected_version = poptorch_core.ipuHardwareVersion(num_ipus)
    return detected_version != 0


def ipuHardwareVersion() -> int:
    """Return the version of the IPU hardware available in the system.

    An assertion fails if no hardware is available.

    :returns: The IPU hardware version or -1 if unknown.
    """
    hw_version = poptorch_core.ipuHardwareVersion()
    # 0 is the value reported when there is no IPU hardware.
    assert hw_version != 0, "No IPU hardware available on this system"
    return hw_version


def setLogLevel(level: Union[str, int]) -> None:
    """Changes the volume of messages printed in the console (stdout)

    The value is forwarded unchanged to ``_logging.setLogLevel``.

    :param level: The new log level, given as a string or as an integer:

        * TRACE: Print all messages.
        * DEBUG: Print debug messages and above.
        * INFO: Print info messages and above.
        * WARN: Print warnings and errors.
        * ERR:  Print errors only.
        * OFF:  Print nothing.
    """
    # NOTE(review): the mapping between integer values and the level names
    # above is defined in ``_logging`` and is not visible here - confirm.
    _logging.setLogLevel(level)


# Hack so that print() works for static graphs: the device, shape, etc. can
# still be printed, but "<unavailable>" is printed instead of trying to
# retrieve the content of a tensor living on the IPU.
# Keep a handle on PyTorch's implementation: it is still used for all the
# tensors which are not on the IPU.
_real_tensor_str = torch._tensor_str._tensor_str  # pylint: disable=protected-access


def _tensor_str(self, indent):
    """Replacement for ``torch._tensor_str._tensor_str`` which never reads
    the content of IPU tensors."""
    on_ipu = self.device.type == "ipu"
    return "<unavailable>" if on_ipu else _real_tensor_str(self, indent)


# Install the replacement in PyTorch.
torch._tensor_str._tensor_str = _tensor_str  # pylint: disable=protected-access


class ICustomArgParser(abc.ABC):
    """Interface to create custom argument parsers to extract tensors and
    rebuild custom object types.

    Subclasses must implement both :py:meth:`yieldTensors` and
    :py:meth:`reconstruct`, and both must visit the tensors in the same
    deterministic order.
    """

    @abc.abstractmethod
    def yieldTensors(self, struct) -> Iterable['torch.Tensor']:
        """Yield every single torch.Tensor contained in your
        object in a deterministic order.

        :param struct: The custom object to extract the tensors from.

        For example:
        >>> self.a = { "t0": torch.Tensor(), "t1": torch.Tensor()}
        ...
        >>> def yieldTensors(self, struct):
        >>>    for k in sorted(struct.a.keys()):
        >>>        yield struct.a[k]
        """

    @abc.abstractmethod
    def reconstruct(self, original_structure, tensor_iterator) -> Any:
        """Create a new structure based on original_structure but
        using tensors from the provided iterator in the same deterministic
        order as in yieldTensors().

        :param original_structure: The object to use as a template.
        :param tensor_iterator: Iterator over the tensors to use in the new
            structure.
        :returns: The newly created structure.

        For example:
        >>> self.a = { "t0": torch.Tensor(), "t1": torch.Tensor()}
        ...
        >>> def reconstruct(self, original_struct, tensor_iterator):
        >>>    out = type(original_struct)()
        >>>    for k in sorted(original_struct.a.keys()):
        >>>        out.a[k] = next(tensor_iterator)
        >>>    return out

        .. important:: Only IPU tensors should be dequeued from the tensor
                       iterator (not CPU tensors or other object types).
        """


def registerCustomArgParser(arg_data_type: Type,
                            arg_parser: Optional[ICustomArgParser]):
    """Register, replace or remove the argument parser of a custom type.

    :param arg_data_type: The custom argument type handled by the parser.
    :param arg_parser: The parser to use for ``arg_data_type``; it replaces
        any parser previously registered for this type. If None, the parser
        currently registered for this type (if there is one) is deleted.
    """
    registry = _utils.custom_arg_parsers

    if arg_parser is None:
        # Removal request: nothing to do if no parser was registered.
        if arg_data_type in registry:
            del registry[arg_data_type]
        return

    if not isinstance(arg_parser, ICustomArgParser):
        raise createPoptorchError(
            "arg_parser must inherit from ICustomArgParser")
    registry[arg_data_type] = arg_parser


def registerGeometricCustomArgParsers():
    """Register the custom argument parsers of ``poptorch_geometric``.

    Does nothing if ``poptorch_geometric`` (or its ``types`` module) cannot
    be found or imported.
    """
    # Try registering pyg's custom arg parsers.
    # If anything goes wrong with import of pyg, then silently ignore it,
    # so that poptorch importers that don't need pyg do not needlessly fail.
    try:
        if importlib.util.find_spec("poptorch_geometric") is None:
            return

        # Use the regular import machinery rather than the deprecated
        # ``Loader.load_module()``: a module which has already been imported
        # is reused instead of being executed a second time.
        # A missing module raises ModuleNotFoundError, which is an
        # ImportError and therefore ignored like any other import failure.
        types = importlib.import_module("poptorch_geometric.types")
        types.registerCustomArgParsers()
    except ImportError:
        pass


registerGeometricCustomArgParsers()

from ._poplar_executor import PoplarExecutor  # pylint: disable=reimported, wrong-import-position


================================================
FILE: python/_args_parser.py
================================================
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
import copy
import inspect
from typing import Any, Dict
import torch

# Do not import any poptorch.* here: it will break the poptorch module
from . import _impl
from ._logging import logger
from . import _utils


class ArgsParser:
    """Bind the arguments of a call to the parameters of a model's signature.

    The signature is read once, at construction, from ``model.forward`` (for
    a ``torch.nn.Module`` or an ``_impl.OptimizerWrapper``) or from the
    callable itself. Calling the parser with ``args`` / ``kwargs`` then
    returns an :class:`ArgsParser.Args` in which every parameter is named and
    parameters that were not provided are filled in from the signature's
    default values.
    """

    class Args:
        """Positional and named arguments of a single call."""

        def __init__(self):
            # Positional argument values and, at the same indices, the names
            # they are reported under in error messages.
            self._args = []
            self._arg_names = []
            # Arguments which can only be passed by name.
            self._kwargs = {}
            # Not used by this class other than being copied by clone().
            self.first_none = None

        @property
        def args(self):
            return self._args

        @property
        def arg_names(self):
            return self._arg_names

        @property
        def kwargs(self):
            return self._kwargs

        def appendArg(self, arg, name):
            """Append a positional argument and the name it is known by."""
            self._args.append(arg)
            self._arg_names.append(name)

        def setNamedArg(self, name, arg):
            """Set (or overwrite) the named argument ``name``."""
            self._kwargs[name] = arg

        def popArg(self):
            """Remove the last positional argument and its name."""
            self._args.pop()
            self._arg_names.pop()

        def clone(self):
            """Return a copy of this object.

            The containers are shallow copied: the argument values
            themselves are shared with the original.
            """
            # pylint: disable=protected-access
            clone = ArgsParser.Args()
            clone._args = copy.copy(self._args)
            clone._arg_names = copy.copy(self._arg_names)
            clone._kwargs = copy.copy(self._kwargs)
            clone.first_none = self.first_none
            return clone

        def _forEach(self, data, fn):
            """Rebuild ``data`` with ``fn`` applied to each flattened tensor.

            The flattening and reconstruction are delegated to ``_utils``.
            """
            tensors = _utils.flattenTensorStructure(data)
            return _utils.reconstructTensorStructure(
                data, [fn(tensor) for tensor in tensors])

        def validateInputs(self, inputs):
            """Check that ``inputs`` match the arguments stored in ``self``.

            ``self`` is treated as the reference (the inputs the model was
            compiled with) and ``inputs`` as the values provided at runtime.

            :param inputs: The ``Args`` to compare against this one.
            :raises: A poptorch error describing the first mismatch found
                (argument count, type, length, dictionary keys, tensor dtype,
                tensor shape or value).
            """
            end = (
                "\nThis error occurred because the inputs passed at runtime"
                " don't match the inputs used to compile the model.\n"
                "To recompile the model for the new inputs create a new "
                "inferenceModel / trainingModel wrapper or call destroy() on "
                "the current one and try again.")
            if len(inputs.args) != len(self.args):
                raise _impl.createPoptorchError(
                    "Number of positional arguments mismatch: expected "
                    f"{len(self.args)} arguments but got "
                    f"{len(inputs.args)}.{end}")

            def validate(name, compiled, runtime, are_named_args=False):
                # Recursively compare a compiled value with its runtime
                # counterpart. "name" is only used in the error messages.
                ctype = type(compiled)
                itype = type(runtime)
                if ctype != itype:
                    raise _impl.createPoptorchError(
                        f"Type mismatch for {name}: expected "
                        f"{ctype} but got {itype}.{end}")
                if isinstance(compiled, tuple):
                    clen = len(compiled)
                    ilen = len(runtime)
                    if clen != ilen:
                        raise _impl.createPoptorchError(
                            f"Length mismatch for {name}: "
                            f"expected {clen} elements but got {ilen}.{end}")
                    for i, c in enumerate(compiled):
                        validate(name + f"[{i}]", c, runtime[i])
                elif isinstance(compiled, dict):
                    expected = set(compiled.keys())
                    provided = set(runtime.keys())
                    if expected != provided:
                        # Keys are converted to strings so that non-string
                        # keys can be sorted and joined in the message.
                        extra = provided - expected
                        details = []
                        if extra:
                            details.append(
                                "Unexpected arguments: " +
                                ", ".join(sorted(map(str, extra))))
                        missing = expected - provided
                        if missing:
                            details.append(
                                "Missing arguments: " +
                                ", ".join(sorted(map(str, missing))))
                        raise _impl.createPoptorchError(
                            f"Keys mismatch for {name}: "
                            f"{'. '.join(details)}.{end}")
                    for k, v in compiled.items():
                        # For the top level kwargs the key is the argument
                        # name, otherwise report it as an index into "name".
                        if are_named_args:
                            n = k
                        else:
                            n = f"{name}[{k}]"
                        validate(n, v, runtime[k])

                elif isinstance(compiled, torch.Tensor):
                    if compiled.dtype != runtime.dtype:
                        raise _impl.createPoptorchError(
                            "Data type "
                            f"mismatch for {name}: expected {compiled.dtype} "
                            f"but got {runtime.dtype}.{end}")
                    if compiled.shape != runtime.shape:
                        raise _impl.createPoptorchError(
                            "Shape "
                            f"mismatch for {name}: expected {compiled.shape} "
                            f"but got {runtime.shape}.{end}")
                else:
                    # If we've got a custom parser then we'll be able to extract
                    # the tensors and validate them as a list.
                    compiled_tensors = _utils.flattenTensorStructure(compiled)
                    if compiled_tensors:
                        input_tensors = _utils.flattenTensorStructure(runtime)
                        validate(name, tuple(compiled_tensors),
                                 tuple(input_tensors))
                    elif compiled != runtime:
                        # Other types are compiled in the graph (scalars, etc) and
                        # therefore should be an exact match to the value used to
                        # compile the model.
                        raise _impl.createPoptorchError(
                            f"Value mismatch for {name}: "
                            f"expected {compiled} but got {runtime}.{end}")

            for i, arg in enumerate(self.args):
                validate(self.arg_names[i], arg, inputs.args[i])

            validate("named arguments",
                     self.kwargs,
                     inputs.kwargs,
                     are_named_args=True)

        def forEachTensorMatchedAtLeastOnce(self, condition, doOnTrue=None):
            """Apply ``doOnTrue`` to every tensor for which ``condition`` holds.

            :param condition: Predicate called on each tensor.
            :param doOnTrue: Optional function whose result replaces each
                tensor matching ``condition``. Other tensors are unchanged.
            :returns: True if ``condition`` was true for at least one tensor.
            """
            # A one element list so that the nested function can update it.
            matches = [False]

            def fn(t):
                if condition(t):
                    matches[0] = True
                    if doOnTrue is not None:
                        return doOnTrue(t)
                return t

            self.forEach(fn)
            return matches[0]

        def forEach(self, fn):
            """Replace each tensor of the args and kwargs by ``fn(tensor)``."""
            self._args = self._forEach(self._args, fn)
            self._kwargs = self._forEach(self._kwargs, fn)

        def asPackedFlatTuple(self, canonical_args=None):
            """Return the tensors of the args followed by those of the kwargs.

            :param canonical_args: Optional ``Args`` whose ``args`` and
                ``kwargs`` are forwarded as the second argument of
                ``_utils.flattenTensorStructure``.
            """
            # Remove all the non torch.tensor types and flatten
            # any data structure.
            cargs = None if canonical_args is None else canonical_args.args
            ckwargs = None if canonical_args is None else canonical_args.kwargs
            return tuple(
                _utils.flattenTensorStructure(self._args, cargs) +
                _utils.flattenTensorStructure(self._kwargs, ckwargs))

    def __init__(self, model: Any):
        """Extract the parameter names, kinds and defaults of ``model``.

        :param model: An ``_impl.OptimizerWrapper``, a ``torch.nn.Module``
            or any other callable.
        :raises TypeError: If ``model`` is none of the above.
        """
        # Combine args and kwargs:
        if isinstance(model, _impl.OptimizerWrapper):
            sig = inspect.signature(model.model.forward)
        elif isinstance(model, torch.nn.Module):
            sig = inspect.signature(model.forward)
        elif callable(model):
            try:
                sig = inspect.signature(model)
            except ValueError:
                # ValueError: no signature found for builtin ...
                # If the callable is a Cython function then its signature
                # might not be available (E.g torch.nn.functional.logsigmoid)
                sig = None
        else:
            raise TypeError("Expected a torch.nn.Module or a callable")
        if sig is None:
            # If we couldn't extract the function's signature: be flexible
            # and default to "*args, **kwargs"
            self._varnames = ["args", "kwargs"]
            self._var_kinds = [
                inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD
            ]
            self._defaults = {}
            self._has_variadic_arguments = True
        else:
            self._var_kinds = [p.kind for p in sig.parameters.values()]
            self._has_variadic_arguments = any(kind in [
                inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD
            ] for kind in self._var_kinds)
            self._varnames = list(sig.parameters.keys())
            # Parameter.empty is a sentinel: compare by identity so that a
            # default value's own __ne__ (e.g. an array-like default) is
            # never invoked.
            self._defaults = {
                name: p.default
                for name, p in sig.parameters.items()
                if p.default is not inspect.Parameter.empty
            }

        self._warned_not_contiguous_input = False

    def __call__(self,
                 args: Any,
                 kwargs: Dict[str, Any],
                 fast_path: bool = False) -> Args:
        """Bind ``args`` and ``kwargs`` to the parameters of the signature.

        Parameters are processed in declaration order. Values for parameters
        declared before a variadic positional parameter are stored as
        positional arguments (even when passed by name or taken from the
        defaults); values for parameters declared after it, and the ones
        captured by a variadic keyword parameter, are stored as named
        arguments. Non-contiguous tensors are replaced by contiguous ones
        and a warning is logged the first time this happens.

        :param args: The positional arguments of the call.
        :param kwargs: The named arguments of the call.
        :param fast_path: If True, skip the check warning about dictionary
            inputs.
        :returns: The bound arguments.
        :raises AssertionError: If too many arguments are provided, if a
            named argument doesn't match any parameter, or if a parameter
            is passed both by position and by name.
        :raises: A poptorch error if a parameter without a default value is
            missing.
        """
        in_tensors = ArgsParser.Args()
        assert self._has_variadic_arguments or len(args) + len(kwargs) <= len(
            self._varnames), ("Too many arguments provided: expected %s (%d) "
                              "but got %d") % (self._varnames,
                                               len(self._varnames),
                                               len(args) + len(kwargs))
        # Make sure all the arguments provided are allowed.
        if not self._has_variadic_arguments:
            for k in kwargs:
                assert k in self._varnames, (
                    f"{k} is not a valid parameter. "
                    f"Allowed values are {self._varnames}")

        variadic_pos_set = False
        for i, name in enumerate(self._varnames):
            is_variadic_pos = self._var_kinds[
                i] == inspect.Parameter.VAR_POSITIONAL
            is_variadic_keyword = self._var_kinds[
                i] == inspect.Parameter.VAR_KEYWORD

            if is_variadic_keyword:
                # A variadic keyword argument will consume all the remaining
                # kwargs
                used_names = self._varnames[:i]
                for k, v in kwargs.items():
                    if k not in used_names:
                        in_tensors.setNamedArg(k, v)
            elif i < len(args) or is_variadic_pos:
                # If it's a variadic parameter: consume all the remaining args
                # otherwise consume only one.
                if is_variadic_pos:
                    variadic_pos_set = True
                    a = args[i:]
                    # Clear args: all the arguments have been consumed
                    args = []
                else:
                    a = [args[i]]
                for idx, arg in enumerate(a):
                    if is_variadic_pos:
                        arg_name = f"*{name}[{idx}]"
                    else:
                        arg_name = name

                    # Non fast path for compilation, fast path for executing.
                    if not fast_path:
                        self._dictCheck(arg)

                    in_tensors.appendArg(arg, arg_name)

                assert name not in kwargs, ("Parameter %s was passed more "
                                            "than once") % name
            elif name in kwargs:
                # Non fast path for compilation, fast path for executing.
                if not fast_path:
                    self._dictCheck(kwargs[name])

                # Everything after a variadic positional argument must be named
                if variadic_pos_set:
                    in_tensors.setNamedArg(name, kwargs[name])
                else:
                    in_tensors.appendArg(kwargs[name], name)
            else:
                if name not in self._defaults:
                    raise _impl.createPoptorchError("Mandatory parameter "
                                                    f"{name} missing")
                value = self._defaults[name]
                # Everything after a variadic positional argument must be named
                if variadic_pos_set:
                    in_tensors.setNamedArg(name, value)
                else:
                    in_tensors.appendArg(value, name)

        if in_tensors.forEachTensorMatchedAtLeastOnce(
                condition=lambda t: not t.is_contiguous(),
                doOnTrue=lambda t: t.contiguous()):
            if not self._warned_not_contiguous_input:
                logger.warning("At least one input tensor is not contiguous: "
                               "non-contiguous tensors will be converted.")
                self._warned_not_contiguous_input = True

        return in_tensors

    def _dictCheck(self, data):
        """Log a warning if ``data`` is, or contains, a dictionary.

        Tuples and lists are searched recursively; at most one warning is
        logged per call.
        """
        work = [data]
        while work:
            d = work.pop()
            if isinstance(d, (tuple, list)):
                work.extend(d)
            elif isinstance(d, dict):
                logger.warning("Dicts as inputs only have partial support, "
                               "they can be accessed using literal keys, but "
                               "full Python functionality is not enabled. "
                               "Consider changing dict inputs to tuple.")
                return


================================================
FILE: python/_dataloader.py
================================================
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import io
import signal
import sys
import os
import tempfile
import enum
import math
import pickle
import time
import torch
import torch.multiprocessing as multiprocessing

# Do not import any poptorch.* here: it will break the poptorch module
from . import enums
from ._logging import logger
from . import _impl
from ._utils import custom_arg_parsers, getCustomParser, reconstructTensorStructure


class AsynchronousWorker:
    """Interface for the host to create and manage a separate worker process to fetch elements from a dataset."""

    def __init__(self, buffer_size, miss_sleep_time_in_ms, dataset,
                 load_indefinitely, early_preload, sharing_strategy,
                 rebatched_size):
        """Create the worker process and start it.

        All the parameters are forwarded unchanged to
        ``_AsynchronousWorkerProcess``.
        """
        self._process = _AsynchronousWorkerProcess(
            buffer_size, miss_sleep_time_in_ms, dataset, load_indefinitely,
            early_preload, sharing_strategy, rebatched_size)
        # True once an element has been acquired from the current iterator.
        self._was_used = False
        # True once the worker has been told to start iterating.
        self._worker_started = False

        # Keep end of file events in a special buffer shared between worker and device. This is due to the worker resetting automatically.
        (self._command_pipe, self._is_single_tensor, self._dict_keys,
         self._data_type_obj, self._eof,
         self._data_buffers) = self._process.start()

    def terminate(self):
        """Ask the worker to shut down (if still alive) and wait for it."""
        if self._process.isAlive():
            self._requestShutdown()

        self._process.join()

    def resetIterator(self):
        """Make the worker start iterating over the dataset from the start.

        Does nothing if the worker was already started and no element has
        been acquired since.
        """
        if self._worker_started and not self._was_used:
            # The current iterator hasn't been used: nothing to do.
            return

        # Reset if:
        # - The EOF was reached and the worker is waiting to know if it
        #   should create a new iterator (load_indefinitely=False)
        # - We're partway through an iteration and we want to restart.
        #
        # Note: there is a race condition where the worker reaches EOF
        # after endOfFile() returned False.
        # The consequence is that reset will be called when it wasn't
        # actually needed. (i.e it won't break anything)

        if self._was_used and (not self.endOfFile() or
                               (self.endOfFile()
                                and not self._process.load_indefinitely)):
            # Request reset:
            self._command_pipe.send(_HostCommand.ResetIterator)
            self.releaseElement()
            # Wait for the worker to acknowledge
            self._eof.waitForReset()
            self._data_buffers.reset()

        self._eof.clearFlag()
        # Let the worker know it can start loading
        self._command_pipe.send(_HostCommand.StartIterating)
        self._was_used = False
        self._worker_started = True

    def dataIsAvailable(self):
        """Return whether the data buffers report an available element."""
        return self._data_buffers.isAvailable()

    def endOfFile(self):
        """Return whether the current buffer index is flagged as the EOF."""
        return self._eof.isEofIndex(self._data_buffers.currentIndex())

    def acquireElementIfAvailable(self):
        """Lock and return the next element, or None if there isn't one.

        The previously acquired element must have been released with
        ``releaseElement()`` first.

        :returns: None if no data is available or the end of the dataset
            was reached. Otherwise the element in the form produced by the
            dataset: a single tensor, a dictionary, a custom object or a
            list of tensors.
        """
        assert not self._data_buffers.hasLock(), (
            "The current element "
            "must be released by calling releaseElement() before trying to "
            "acquire a new one")

        # Important: eof must be checked **after** dataIsAvailable.
        #
        # The worker does:
        # 1. setEOFflag()
        # 2. if load_indefinitely -> start prefetching the next iteration.
        # 3. mark data as available.
        #
        # So in the consumer / reader we need to check the flags in reverse
        # order otherwise there is a risk that eof will be False, then by
        # the time data is checked both eof and data are now True but
        # we'll miss eof and iterate over the ring buffer an extra time.
        if not self.dataIsAvailable() or self.endOfFile():
            return None
        left_over = self._eof.leftOver(self._data_buffers.currentIndex())
        # Pull and lock the ready buffer.
        data = self._data_buffers.lock()
        self._was_used = True

        if left_over > 0:
            # Only keep the first left_over entries along dimension 0.
            data = [d.narrow(0, 0, left_over) for d in data]
            # Update the EOF flag to the real index and clear the
            # left over value.
            self._eof.setFlag(self._data_buffers.currentIndex())

        # The worker process always sends us a tuple of tensors, however
        # the user data can actually be either:
        # - A list
        # - A single tensor
        # - A dictionary string -> Tensor
        # - An object of a type with a registered custom parser

        # If it's a single tensor: return the first element of the list.
        if self._is_single_tensor:
            return data[0]
        if self._dict_keys:
            # If it's a dictionary: associate the data to the keys here.
            return dict(zip(self._dict_keys, data))
        # Compare against None explicitly: the worker sends None when the
        # data isn't a custom type, and the truthiness of an arbitrary user
        # object (custom __bool__ / __len__) must not affect the decision.
        if self._data_type_obj is not None:
            # If it's a custom object type: reconstruct it using the ArgParser
            return reconstructTensorStructure(self._data_type_obj, data)

        # Else return the list as is.
        return data

    def assertNoError(self):
        """Assert that the worker, if no longer alive, exited with code 0."""
        if not self._process.isAlive():
            assert self._process.exitCode() == 0, \
                "An error occurred in the data fetcher"

    def releaseElement(self):
        """Release the currently acquired element, if any."""
        # Set the previous iteration to false so it can be pulled in now
        # avoiding any data races.
        self._data_buffers.unlockIfLocked()

    def _requestShutdown(self):
        """Send the shutdown command, ignoring an already broken pipe."""
        # Send the exit signal if the worker is still alive.
        try:
            self._command_pipe.send(_HostCommand.Shutdown)
        except BrokenPipeError:
            pass


class _AsynchronousWorkerProcess:
    """Worker process fetching elements from a given dataset"""

    def __init__(self, buffer_size, miss_sleep_time_in_ms, dataset,
                 load_indefinitely, early_preload, sharing_strategy,
                 rebatched_size):
        """Store the worker settings. No process is created at this point."""
        # The process handle stays None until the worker is started.
        self._process = None

        # What to load and how to share it with the worker.
        self._dataset = dataset
        self._sharing_strategy = sharing_strategy

        # Buffering / polling settings.
        self._buffer_size = buffer_size
        self._miss_sleep_time_in_ms = miss_sleep_time_in_ms

        # Iteration behaviour (load_indefinitely is a public attribute).
        self.load_indefinitely = load_indefinitely
        self._early_preload = early_preload

        # Rebatching state.
        self._rebatched_size = rebatched_size
        self._next_batch_idx = 0

    def isAlive(self):
        """Return True as long as the worker process has no exit code."""
        exit_code = self._process.exitcode
        return exit_code is None

    def exitCode(self):
        """Return the exit code reported by the worker process handle."""
        worker = self._process
        return worker.exitcode

    def join(self):
        """Wait for the worker process to exit, escalating if it doesn't.

        Waits up to 10 seconds, then sends SIGINT and waits up to another
        10 seconds, and finally terminates the process and waits for it.
        """
        worker = self._process

        worker.join(timeout=10)
        if not self.isAlive():
            return

        # If the asynchronous worker process is blocked waiting for the dataset
        # to process the next batch it will not be able to respond to host
        # command handler's shutdown_now command. We try stopping it by sending
        # a SIGINT signal first and choose SIGTERM as the last resort.
        os.kill(worker.pid, signal.SIGINT)
        worker.join(timeout=10)
        if not self.isAlive():
            return

        worker.terminate()
        worker.join()

    def start(self):
        """Start the worker process and return what ``_start()`` returns.

        With the ``FileSystem`` sharing strategy the dataset is first pickled
        to a file in a temporary directory and ``self._dataset`` temporarily
        holds the path of that file while the worker is being started.
        """
        if self._sharing_strategy != enums.SharingStrategy.FileSystem:
            return self._start()

        # The dataset might not fit in shared memory: so use the file system instead.
        original_dataset = self._dataset
        with tempfile.TemporaryDirectory() as tmp_dir:
            dataset_path = os.path.join(tmp_dir, "dataset.pkl")
            logger.debug("Serialising dataset to file: %s", dataset_path)
            # Serialise the dataset to file and replace the dataset by the filename.
            with open(dataset_path, "wb") as stream:
                pickle.dump(self._dataset, stream, protocol=4)
                self._dataset = dataset_path
            try:
                return self._start()
            finally:
                # Always put the real dataset back, even if the start failed.
                self._dataset = original_dataset

    def _start(self):
        """Spawn the worker process and receive its shared buffers.

        :returns: A tuple ``(command pipe, is_single_tensor, dict_keys,
            data_type_obj, end of file flag, data buffers)`` where the
            command pipe is the write end used to send commands to the
            worker and the other values are built from what the worker sent
            over the data pipe.
        :raises AssertionError: If the worker was already started.
        :raises: A poptorch error if the data pipe reaches EOF before all
            the expected values have been received.
        """
        assert self._process is None, "Worker already started"
        # We use a small pipe to get the initial data. The latency of
        # deserialising the python data is too high to be used for the
        # actual fetch so we just use this to return the initial buffers
        # in shared memory which will be used for the actual read/write
        # in the hot loop.
        if self._sharing_strategy == enums.SharingStrategy.Fork:
            ctx = multiprocessing.get_context('fork')
        elif self._sharing_strategy == enums.SharingStrategy.ForkServer:
            ctx = multiprocessing.get_context('forkserver')
        else:
            # Any other sharing strategy uses the 'spawn' start method.
            ctx = multiprocessing.get_context('spawn')
        read_data_pipe, write_data_pipe = ctx.Pipe(duplex=False)

        # If the worker exits before the parent process is done
        # setting up the _data_buffers then the pipe will get freed
        # and bad things will happen.
        read_command_pipe, write_command_pipe = ctx.Pipe(duplex=False)

        # Fetch the data on a separate process.
        logger.debug("AsynchronousDataAccessor parent process: %d",
                     os.getpid())

        self._process = ctx.Process(target=self._mainLoop,
                                    args=(write_data_pipe, read_command_pipe))
        self._process.start()
        # Close this process's copies of the pipe ends handed to the worker.
        write_data_pipe.close()
        read_command_pipe.close()

        try:
            # The values must be received in the order the worker sends them.
            indices_mem = read_data_pipe.recv()
            data_len = read_data_pipe.recv()
            is_single_tensor = read_data_pipe.recv()
            dict_keys = read_data_pipe.recv()
            data_type_obj = read_data_pipe.recv()
            eof_mem = read_data_pipe.recv()
            buffers = _DataRingBufferReader(self._buffer_size, data_len,
                                            indices_mem)

            for data_idx in range(0, data_len):
                # Get the buffer from the host.
                buffer = read_data_pipe.recv()
                buffers.setBuffer(buffer, data_idx)

            # We're all set: let the worker know.
            write_command_pipe.send(_HostCommand.SetupComplete)
            return (write_command_pipe, is_single_tensor, dict_keys,
                    data_type_obj, _EndOfFileFlag(eof_mem), buffers)
        except EOFError:
            pass
        # Exit the except block before raising a cleaner exception otherwise the previous one will not be cleared.
        raise _impl.createPoptorchError(
            "AsynchronousDataAccessor worker thread failed to start "
            "(Check above for details)")

    def _mainLoop(self, conn, command_pipe):  # pylint: disable=inconsistent-return-statements
        """Entry point of the asynchronous worker process.

        Runs :meth:`_mainLoopNoInterrupt` and returns its result. SIGINT
        signals appear as KeyboardInterrupts and need to be handled here as
        the ``atexit`` terminate hook is not guaranteed to be called before
        the signal is propagated to the worker processes: when one is
        raised, it is swallowed and ``None`` is returned.

        See Also:
            :meth:`_mainLoopNoInterrupt` for the implementation of worker event
            loop.
        """
        try:
            outcome = self._mainLoopNoInterrupt(conn, command_pipe)
        except KeyboardInterrupt:
            # Core interpreter libraries may already be unloaded
            # so don't do anything. More detail of caveats in the
            # pytorch note on [ Data Loader Multiprocessing Shutdown Logic ]:
            # https://github.com/pytorch/pytorch/blob/
            # aa7da7b09c4a3f972ede5fd8ad0cbc8c13498a00/
            # torch/utils/data/dataloader.py#L570
            return None
        return outcome

    def _mainLoopNoInterrupt(self, conn, command_pipe):  # pylint: disable=too-many-statements
        # Make sure this process's output gets printed (In case of error)
        sys.stdout = io.TextIOWrapper(open(sys.stdout.fileno(), 'wb', 0),
                                      write_through=True)
        sys.stderr = io.TextIOWrapper(open(sys.stderr.fileno(), 'wb', 0),
                                      write_through=True)

        # We're in a new process: we need to re-initialise the logger
        from ._logging import logger  # pylint: disable=import-outside-toplevel
        logger.debug("AsynchronousDataAccessor worker process: %d",
                     os.getpid())
        # If the dataset is a string then it's a path to file containing
        # the dataset
        if isinstance(self._dataset, str):
            with open(self._dataset, "rb") as f:
                self._dataset = pickle.load(f)
        dataset_iterator = iter(self._dataset)
        rebatched_drop_last = getattr(self._dataset, "rebatched_drop_last",
                                      True)

        data = None
        try:
            data = next(dataset_iterator)
        except StopIteration:
            pass
        if data is None:
            raise _impl.createPoptorchError("The Dataset is empty")

        # We support either a single tensor or a flat 1D iterable of tensors.
        is_single_tensor = False
        dict_keys = []
        data_type_obj = None
        if isinstance(data, torch.Tensor):
            is_single_tensor = True
            data = (data, )
        elif isinstance(data, dict):
            # If the data is a dictionary the keys must
            # be the same for each instance returned by the dataloader so save
            # the list of keys here.
            dict_keys = list(data.keys())
            data = tuple(data[k] for k in dict_keys)
        elif type(data) in custom_arg_parsers.keys():
            # If the Dataset stores objects of a custom type, we do need to use
            # ArgsParser to reconstruct the object from the list of
            # torch.Tensors. To do so the ArgsParser uses an original object
            # (as a template) to properly setup the Dataloader output fields.
            # We assume, that all the batches of objects generated by the
            # Dataloader will have the same sets of fields. Thanks to this,
            # we can prepare an empty copy of the object (keeping all its
            # attributes) and pass it to the _AsynchronousWorkerProcess caller,
            # so that the data type can get reconstructed.
            parser = getCustomParser(data)
            data_iter = iter(type(t)() for t in parser.yieldTensors(data))
            data_type_obj = parser.reconstruct(data, data_iter)

            # Tell the host how many tensors we will be sending...
            data_length = sum(1 for i in data.__dict__.values()
                              if isinstance(i, torch.Tensor))
            # ...before the custom type data gets replaced with a generator.
            data = parser.yieldTensors(data)

        # Tell the host how many tensors we will be sending.
        if data_type_obj is None:
            data_length = len(data)

        buffers = _DataRingBufferWriter(self._buffer_size, data_length)
        # We communicate with the host via an array of sentinel values to say
        # if the data is ready as this has much better latency than queue or
        # lock approaches.
        conn.send(buffers.indices_mem)
        conn.send(data_length)
        conn.send(is_single_tensor)
        conn.send(dict_keys)
        conn.send(data_type_obj)

        eof = _EndOfFileFlag(eof_mem=None)
        conn.send(eof.eof_mem)

        # Send the tensors to the host.
        for index, tensor in enumerate(data):
            assert isinstance(
                tensor,
                torch.Tensor), (f"Tensor at index {index} is not a torch "
                                f"tensor ({type(tensor)})."
                                " AsynchronousDataAccessor expects data to "
                                "be organised as a flat 1D container of "
                                "tensors.")

            # Shared with parent process.
            tensor_size = [*tensor.size()]
            if self._rebatched_size:
                self._next_batch_idx = tensor_size[0]
                # Reshape with repeat if expand is not working in batch dimension
                if tensor_size[0] != self._rebatched_size:
                    repeat_count = math.ceil(self._rebatched_size /
                                             tensor_size[0])
                    # Repeat then shrink to the right size
                    tensor = tensor.repeat(
                        repeat_count,
                        *[1] * (len(tensor_size) - 1))[:self._rebatched_size]
                    tensor_size[0] = self._rebatched_size
            memory = tensor.expand(
                self._buffer_size,
                *tensor_size).clone().contiguous().share_memory_()

            buffers.setBuffer(memory, index)
            # Send it to the host.
            conn.send(memory)

        # We've loaded the first element as part of the spin up process.
        if self._rebatched_size is None or \
                self._next_batch_idx == self._rebatched_size:
            self._next_batch_idx = 0
            # Tell the host this data is ready.
            buffers.markWriteComplete()

        host_handler = _HostCommandHandler(command_pipe)

        if self._early_preload:
            state = _WorkerState.Prefetching
        else:
            state = _WorkerState.Stopped

        rebatch_leftover = []
        while not host_handler.shutdown_now:
            # Check for messages from the parent process:
            host_handler.checkMessages()
            if state == _WorkerState.Stopped:
                if host_handler.waitUntilStartIteration():
                    state = _WorkerState.Loading
                # else reset or shutdown received: fallthrough
            elif state == _WorkerState.Prefetching and \
                    host_handler.startIteratingPending():
                # The host sent a request to start loading so transition from prefetching
                # to loading.
                state = _WorkerState.Loading
            if host_handler.shutdown_now:
                continue
            if host_handler.resetIteratorPending():
                logger.debug("AsynchronousDataAccessor worker: reset command "
                             "received. Creating a new iterator")
                buffers.reset()
                dataset_iterator = iter(self._dataset)
                self._next_batch_idx = 0
                rebatch_leftover = []

                # Let the host know everything has been reset
                eof.setResetFlag()

                # Wait for the host to ask for the new iteration to start
                if not host_handler.waitUntilStartIteration():
                    continue  # received a shutdown command

                logger.debug("AsynchronousDataAccessor worker: the iterator "
                             "has been reset")
                state = _WorkerState.Loading

            # We're now guaranteed to be either loading or prefetching
            eof_reached = False
            # Handle the left overs if any before asking for more data.
            if rebatch_leftover:
                data = rebatch_leftover
                rebatch_leftover = []
            else:
                try:
                    # Retrieve data from the dataset
                    data = next(dataset_iterator)
                    if isinstance(data, torch.Tensor):
                        data = (data, )
                    elif isinstance(data, dict):
                        # If the data is a dictionary: we expect the keys to
                        # be strings and always the same, and the values to
                        # all be tensors. As a result we only need to pass
                        # the tensors as a tuple to the main process and
                        # re-assemble the dictionary there.
                        assert len(data) == len(dict_keys)
                        data = tuple(data[k] for k in dict_keys)
                    elif type(data) in custom_arg_parsers.keys():
                        parser = getCustomParser(data)
                        data = parser.yieldTensors(data)

                except StopIteration:
                    logger.debug(
                        "AsynchronousDataAccessor worker: end of dataset"
                        " reached")
                    eof_reached = True

            # Wait for a writing slot to become available
            while not buffers.isAvailable(
            ) and not host_handler.priorityCommandWaiting():
                # (Briefly) sleep the thread if neither is True.
                if self._miss_sleep_time_in_ms > 0.0:
                    time.sleep(self._miss_sleep_time_in_ms / 1000.0)
                host_handler.checkMessages()
            if host_handler.priorityCommandWaiting():
                continue

            if eof_reached:
                # Note: it's important to have a writing slot before signalling
                # the end of the dataset or we might encounter the case where
                # the whole ring buffer is ready to read:
                # [ True, True, True]
                # At that point the read and write indices point at the same
                # index so if we set the EOF as the current write index then
                # the consumer will discard the whole ring buffer instead of
                # consuming the ready to read elements first.
                # Having a writing slot available ensures the read and write
                # indices never match (Even though the slot might not be used).

                # If we reach the EOF before the host asked us to start loading,
                # wait here to avoid potentially overwriting a pending
                # EOF event.
                if state == _WorkerState.Prefetching:
                    if not host_handler.waitUntilStartIteration():
                        continue  # reset or shutdown

                if self._rebatched_size and not rebatched_drop_last \
                    and self._next_batch_idx != 0:
                    eof.setFlag(buffers.currentIndex(), self._next_batch_idx)
                    # We're in the middle of a rebatch so the buffer
                    # should already be available from previous
                    # batch indices.
                    assert buffers.isAvailable()
                    buffers.markWriteComplete()
                else:
                    eof.setFlag(buffers.currentIndex(), 0)

                # If we are not to load indefinitely we wait for the host
                # to explicitly ask for a new iterator to be created.
                if not self.load_indefinitely:
                    logger.debug(
                        "AsynchronousDataAccessor worker: end of dataset"
                        " reached signalled to host: waiting for command from"
                        " host")
                    state = _WorkerState.Stopped
                    continue  # Go back to the wait for reset

                logger.debug("AsynchronousDataAccessor worker: end of dataset "
                             "reached. Creating a new iterator")
                state = _WorkerState.Prefetching

                # We reset and keep the worker thread prefetching.
                dataset_iterator = iter(self._dataset)
                self._next_batch_idx = 0

                logger.debug(
                    "AsynchronousDataAccessor worker: new iterator ready")
                continue

            # We've got a writing slot
            if self._rebatched_size:
                assert not rebatch_leftover, (
                    "Rebatch data should be empty and"
                    " ready to be used if needed")
                for index, tensor in enumerate(data):
                    # Note _index_copy_ doesn't work for FP16, it causes
                    # the following error:
                    # RuntimeError: _th_index_copy_ not supported on CPUType
                    # for Half"
                    #
                    # That's why we instead use a regular copy_
                    in_size = len(tensor)
                    out_size = self._rebatched_size - self._next_batch_idx
                    copy_size = min(in_size, out_size)
                    if in_size > out_size:
                        rebatch_leftover.append(tensor[copy_size:])

                    buffers.current[index][self._next_batch_idx:self.
                                           _next_batch_idx + copy_size].copy_(
                                               tensor[:copy_size])

                self._next_batch_idx += copy_size
            else:
                # Copy the tensor into the preallocated shared memory.
                for index, tensor in enumerate(data):
                    buffers.current[index].copy_(tensor)

            # If we're not rebatching: always notify the host an element is ready.
            # Otherwise only notify the host if the full batch is ready.
            if self._rebatched_size is None or \
                    self._next_batch_idx == self._rebatched_size:
                self._next_batch_idx = 0
                # Tell the host this data is ready.
                buffers.markWriteComplete()

        logger.debug(
            "AsynchronousDataAccessor worker: ready to exit: checking parent"
            " is ready")
        # In the unlikely event the worker is done reading the dataset
        # before the parent is done setting the buffers up: wait here.
        host_handler.waitUntilSetupComplete()
        logger.debug("AsynchronousDataAccessor worker: clean exit")


class _HostCommand(enum.IntEnum):
    """Commands the host (parent) process sends to the worker process over
    the command pipe."""
    # The host is done with its setup: the worker waits for this command
    # before exiting.
    SetupComplete = 0
    # The worker must leave its main loop as soon as possible.
    Shutdown = 1
    # The worker must discard its current iterator and create a new one.
    ResetIterator = 2
    # The worker can start (or resume) loading data for the host.
    StartIterating = 3


class _WorkerState(enum.IntEnum):
    """States of the worker's main loop."""
    # Idle: blocked until the host asks the worker to start iterating.
    Stopped = 0
    # Loading data ahead of the host's start iterating request.
    Prefetching = 1
    # Loading data after the host asked the worker to start iterating.
    Loading = 2


class _HostCommandHandler:
    def __init__(self, command_pipe):
        self.pipe = command_pipe
        self.setup_complete = False
        self.shutdown_now = False
        self._reset_iterator = False
        self._start_iterating = False

    def checkMessages(self, blocking=False, ignore_setup_complete=True):
        """
        ignore_setup_complete: setup complete is usually just noise. (We only
        care about the setup being complete if we're trying trying to shutdown
        the worker process), so when asked to wait for a message, if the first
        one we receive is setup complete, usually we'll want to wait some more
        for the one we actually are interested in.
        """
        # Check for messages from the parent process:
        if self.pipe.poll() or blocking:
            cmd = self.pipe.recv()  # remove the data
            assert isinstance(cmd, _HostCommand)
            if cmd == _HostCommand.SetupComplete:
                logger.debug("SetupComplete command received")
                assert not self.setup_complete, ("More than one SetupComplete "
                                                 "event received")
                self.setup_complete = True
                if ignore_setup_complete:
                    self.checkMessages(blocking)
            elif cmd == _HostCommand.Shutdown:
                logger.debug("Shutdown command received")
                self.shutdown_now = True
            elif cmd == _HostCommand.ResetIterator:
                logger.debug("ResetIterator command received")
                self._reset_iterator = True
            elif cmd == _HostCommand.StartIterating:
                logger.debug("StartIterating command received")
                self._start_iterating = True
            else:
                raise _impl.createPoptorchError(
                    f"Unknown command received {cmd}")

    def priorityCommandWaiting(self):
        return self.shutdown_now or self._reset_iterator

    def waitUntilSetupComplete(self):
        if not self.setup_complete:
            self.checkMessages(blocking=True, ignore_setup_complete=False)
        # Shutdown has been requested: there is no other valid command the host
        # can send at that point
        assert self.setup_complete

    def startIteratingPending(self):
        """Note: returns state and reset the value to False"""
        if self._start_iterating:
            self._start_iterating = False
            return True
        return False

    def resetIteratorPending(self):
        """Note: returns state and reset the value to False"""
        if self._reset_iterator:
            self._reset_iterator = False
            return True
        return False

    def waitUntilStartIteration(self):
        """Wait until a start iteration message is received.

        Return True if we successfully received a start iteration message.
        False if it was a reset or shutdown command.
        """
        if self.priorityCommandWaiting():
            return False
        if not self._start_iterating:
            self.checkMessages(blocking=True)

        return self.startIteratingPending()


class _EndOfFileFlag:
    """
    Share a small 2 values buffer with host to signal EOF and where in ring
    buffer the event occurred.

    First value:
    -1 means no event and the worker will keep loading until EOF is
    reached or the buffer is full.

    -2 means iterator reset complete. (Will be cleared by the worker once it's
    received the start iterating command from the host)

    Any other value: wait for an iterator to be created to start
    loading more data.

    Second value: when rebatching + drop_last=False:
    Indicate the batch size of the left over tensor

    0: No left over
    0 < N < rebatch_size: left-over batch size

    """

    def __init__(self, eof_mem=None):
        if eof_mem is None:
            eof_mem = torch.tensor([-1, 0], dtype=torch.int).share_memory_()
        self.eof_mem = eof_mem

    def setResetFlag(self):
        """Called by the worker once the iterator has been reset"""
        self.eof_mem[0] = -2

    def waitForReset(self):
        while self.eof_mem[0] != -2:
            pass

    def isEofIndex(self, index):
        return self.eof_mem[0] == index and self.eof_mem[1] == 0

    def leftOver(self, index):
        """Batch size of the tensor at the end of file index.

        0 either means it's not the end of the dataset yet or
        there is no left over batches, the end of file index is empty.
        (It will contain the first element from the next iteration if a new
        iterator is created).

        N means the element at the end of file index has a reduced batch of N.
        (The first element from the next iteration if a new iterator is
        created will be located at the next index).
        """
        if self.eof_mem[0] == index:
            return self.eof_mem[1]
        return 0

    def clearFlag(self):
        self.eof_mem[1] = 0
        self.eof_mem[0] = -1

    def setFlag(self, buffer_idx, last_batch_size=0):
        """If ``last_batch_size`` is 0 then ``buffer_idx`` is the index of the
        first buffer after the end of file.

        Otherwise the buffer at ``buffer_idx`` will contain a tensor of reduced
        batch size ``last_batch_size`` elements. (Only used when drop_last=False
        and rebatched_size > 0).
        """
        # Important: eof_tensor[1] must be set before eof_tensor[0]
        # to avoid race conditions with the consumer.
        self.eof_mem[1] = last_batch_size
        self.eof_mem[0] = buffer_idx


class _RingBufferIndex:
    """The index ring buffer is a ``buffer_size`` list of booleans keeping track
    which elements from the data ring buffers is ready to be written or ready to
    be read.

    * True: ready to write
    * False: ready to read.

    It is allocated using shared memory as it is shared between the worker
    process (producer) and the main process (consumer).

    The memory for the ring buffer will be allocated by the producer (Worker
    process) and initialised to all False (i.e all ready to be written).
    """

    def __init__(self, buffer_size, indices_mem=None):
        if indices_mem is not None:
            self.buffers = indices_mem
            assert len(indices_mem) == buffer_size
        else:
            self.buffers = torch.tensor([False] * buffer_size,
                                        dtype=torch.bool).share_memory_()
        self.buffer_size = buffer_size
        self._index = 0

    def increment(self):
        self._index += 1
        if self._index >= self.buffer_size:
            self._index = 0

    def reset(self):
        self.buffers.fill_(False)
        self._index = 0

    def set(self, value):
        self.buffers[self._index] = value

    def value(self):
        return self.buffers[self._index]

    def __call__(self):
        return self._index


class _IDataRingBuffer:
    """Common part of the ring buffer reader and writer: the data buffers
    and the ``_RingBufferIndex`` tracking the current element."""

    def __init__(self, buffer_size, data_len, indices_mem=None):
        self._index = _RingBufferIndex(buffer_size, indices_mem)
        assert buffer_size == self._index.buffer_size
        # Buffers are handed to setBuffer() as buffers[D][B][tensor] where:
        # D = number of tensors in one tuple from the dataset
        # B = number of buffers in ring buffer.
        #
        # but we're going to iterate over B so they are stored as:
        # buffers[B][D][tensor]
        self._data = [[None] * data_len for _ in range(buffer_size)]

    def setBuffer(self, buffer, data_idx):
        """Add a new buffer to the ring
        expecting the tensor to be of the shape
        buffer[B][tensor] but we store tensors as:
        buffers[B][D] so we need to shuffle the data.
        """
        assert len(buffer) == self._index.buffer_size
        assert data_idx < len(self._data[0])
        for slot_idx, slot in enumerate(self._data):
            slot[data_idx] = buffer[slot_idx]

    @property
    def current(self):
        """Return the current buffer"""
        position = self._index()
        return self._data[position]

    @property
    def indices_mem(self):
        """Return the shared memory buffer used
        to store the indices"""
        return self._index.buffers

    def currentIndex(self):
        """Return the index of the current buffer."""
        return self._index()

    def reset(self):
        """Reset the state of the ring buffer
        (All the buffers become available to write again)"""
        self._index.reset()


class _DataRingBufferWriter(_IDataRingBuffer):
    """Producer side of the ring buffer. The writer's logic goes as follow:

        - Wait for the current slot to become available for writing
        - Fill the buffer
        - Mark the buffer as ready to be read and move to the next one.

        >>> while True:
        ...     while not buffers.isAvailable():
        ...         time.sleep()
        ...     buffers.current.copy(data)
        ...     buffers.markWriteComplete()
    """

    def markWriteComplete(self):
        """Flag the current buffer as ready to be read, then advance to
        the next buffer."""
        index = self._index
        index.set(True)
        index.increment()

    def isAvailable(self):
        """Return True if the current index is available for writing,
        or False if it contains a tensor which hasn't been read by the
        consumer process yet."""
        waiting_to_be_read = bool(self._index.value())
        return not waiting_to_be_read


class _DataRingBufferReader(_IDataRingBuffer):
    """Consumer side of the ring buffer. The reader's logic goes as follow:

        - Wait for the current slot to become ready to read.
        - Mark the buffer as locked for reading and move to next buffer.
        - Read the locked buffer
        - Release the locked buffer

        Note: the consumer can check if the current buffer is available
        while the previous one is still locked however it cannot lock
        more than one buffer at any given time.
    """

    def __init__(self, buffer_size, data_len, indices_mem=None):
        # Index of the buffer currently locked for reading (None if none).
        self._locked = None
        super().__init__(buffer_size, data_len, indices_mem)

    def isAvailable(self):
        """Return True if the current buffer is ready to be read."""
        ready = self._index.value()
        return bool(ready)

    def hasLock(self):
        """Return True if the ring buffer currently has a buffer
        locked for reading."""
        return self._locked is not None

    def lock(self):
        """Lock the current buffer for reading, move to the next buffer and
        return the locked buffer."""
        assert self._locked is None
        locked_idx = self.currentIndex()
        self._locked = locked_idx
        locked_data = self.current
        self._index.increment()
        return locked_data

    def unlockIfLocked(self):
        """Release the locked buffer (if any): it becomes available for
        writing again."""
        locked_idx = self._locked
        if locked_idx is not None:
            self._index.buffers[locked_idx] = False
        self._locked = None


================================================
FILE: python/_impl.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

from contextlib import contextmanager
import copy
import copyreg
import fcntl
import hashlib
import itertools
import os
from functools import partial, wraps
import weakref
import torch

# Do not import any poptorch.* here: it will break the poptorch module
from ._logging import logger
from . import poptorch_core
from ._utils import isOnIpu, getIpuTensorId

# A flag to tell the user if the current target is IPU. This is to allow
# divergent IPU/CPU codepaths within one model.
# Read by isRunningOnIpu() and written by setIpuContext().
_is_ipu_context = False

# A flag to tell if the dispatch mechanism is used to obtain
# a graph.
# Read by isDispatchTracing() and written by setDispatchTracing().
_dispatch_tracing = False

# Some modules will still work even if the buffer address changes during tracing
# Modules of these types are skipped when collecting buffers in
# forEachParameterAndBuffer() and getBufferAndParameterAddresses().
# NOTE(review): each class looks listed twice (torch.nn.X and
# torch.nn.modules.batchnorm.X are presumably the same object) - confirm.
BUFFERS_CAN_CHANGE = (
    torch.nn.BatchNorm1d,
    torch.nn.modules.batchnorm.BatchNorm1d,
    torch.nn.BatchNorm2d,
    torch.nn.modules.batchnorm.BatchNorm2d,
    torch.nn.BatchNorm3d,
    torch.nn.modules.batchnorm.BatchNorm3d,
)


class NameScopeHook:
    """ Create a name scope for each operator present in the module.
        The operator name scope will be based on the names appearing in the
        named_modules function from torch.nn.Module.

        A forward pre-hook pushes the scope and a forward hook pops it for
        every named sub-module (the root module, whose name is empty, is
        skipped).
    """

    def __init__(self, module: 'torch.nn.Module'):
        self.hooks = []
        for name, m in module.named_modules():
            if not name:
                # Root module: no scope to create
                continue
            enter_scope = partial(self._enter_fn, name=name)
            self.hooks.append(m.register_forward_pre_hook(enter_scope))
            self.hooks.append(m.register_forward_hook(self._exit_fn))

    def _enter_fn(self, module, input, name):  # pylint: disable=unused-argument
        # Only the last component of the dotted module name is used.
        leaf_name = name.rsplit(".", 1)[-1]
        torch.ops.poptorch.push_name_scope(leaf_name)

    def _exit_fn(self, module, input, output):  # pylint: disable=unused-argument
        torch.ops.poptorch.pop_name_scope()

    def remove(self):
        """ Remove all existing hooks related to creating a name scope for
            operators.
        """
        for handle in self.hooks:
            handle.remove()


def createPoptorchError(msg):
    """Build (but do not raise) a ``poptorch_core.Error`` for an error
    detected in the python code.

    The error's ``type`` is set to "poptorch_py_error", its ``message`` to
    ``msg`` and its ``location`` to an empty string.
    """
    error_type = "poptorch_py_error"
    error = poptorch_core.Error(f"'{error_type}': {msg}")
    error.type, error.message, error.location = error_type, msg, ""
    return error


def isRunningOnIpu() -> bool:
    """ This function returns `True` when executing on IPU and `False` when
    executing the model outside IPU scope. This allows for separate
    code-paths to be marked in the model simply by using:

    >>> if poptorch.isRunningOnIpu():
    >>>      # IPU path
    >>> else:
    >>>     # CPU path

        Note this will only apply to code during execution. During model
        creation it will always return `False`.

        :returns: True if running on IPU, otherwise False.
    """
    # Reading a module level variable doesn't require a ``global`` statement.
    return _is_ipu_context


def setIpuContext(val: bool) -> None:
    """Set the module level flag returned by ``isRunningOnIpu()``."""
    global _is_ipu_context
    _is_ipu_context = val


def isDispatchTracing() -> bool:
    """ This function returns `True` when executing within the IPUScope.
    The flag is set when entering the scope and turned off when exiting.
    """
    # Reading a module level variable doesn't require a ``global`` statement.
    return _dispatch_tracing


def setDispatchTracing(val: bool) -> None:
    """Set the module level flag returned by ``isDispatchTracing()``."""
    global _dispatch_tracing
    _dispatch_tracing = val


def internal_cast(tensor, dtype):
    """Cast ``tensor`` using the ``poptorch.internal_cast`` op.

    :param tensor: the tensor to cast.
    :param dtype: torch.float / torch.float32 (cast to "FLOAT") or
        torch.half / torch.float16 (cast to "FLOAT16").
    :raises ValueError: if ``dtype`` is any other type.
    """
    supported = (
        ("FLOAT", (torch.float, torch.float32)),
        ("FLOAT16", (torch.half, torch.float16)),
    )
    for target, accepted in supported:
        if dtype in accepted:
            return torch.ops.poptorch.internal_cast(tensor, target)

    raise ValueError(
        'Invalid poptorch.cast target type. Expecting torch.float or torch.half'
    )


def applyOptimizer(optimizer):
    """Call the ``poptorch.optimizer_group`` op once per parameter group of
    ``optimizer``, passing the group's index and its "params" entry."""
    for index, group in enumerate(optimizer.param_groups):
        torch.ops.poptorch.optimizer_group(index, group["params"])


# To understand which variable groups the user wants to apply the
# optimizer to we need to mark them via a wrapper. We do this because
# when we reference the variables in the context of the operation we
# get the corresponding IR value for "free" as part of the trace.
# Otherwise we would need a system to map the variable in the optimizer
# to the variable in the model to the variable in the IR.
class OptimizerWrapper(torch.nn.Module):
    """Module running ``model`` and then marking the parameter groups of
    ``optimizer`` by calling ``applyOptimizer()``."""

    def __init__(self, model, optimizer):
        super().__init__()
        self.model = model
        self.optimizer = optimizer

    def forward(self, *args, **kwargs):
        """Run the wrapped model, apply the optimizer and return the model's
        output unchanged."""
        model_output = self.model(*args, **kwargs)
        applyOptimizer(self.optimizer)
        return model_output


def destroyDispatcherOnExit(func):
    """Function decorator to always destroy the dispatcher at
    the end of the wrapped function.

    ``poptorch_core.destroyDispatcher()`` is called whether ``func`` returns
    normally or raises; exceptions raised by ``func`` are propagated.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        finally:
            poptorch_core.destroyDispatcher()

    return wrapper


@contextmanager
def distributedCacheLock(model, opts):
    """In a distributed environment we only want the model to be compiled once.

    If there is only one process or if the cache is not enabled:
        no need for a lock, early return (the context yields False).
    Otherwise:
        The first process to reach the lock takes it and compiles the model.
            The model will be added to the PopART cache.
        After the first process releases the lock the other ones will grab it
            one at the time and compile the model too (Except that they will
            now all hit the cache).
        The last process to grab / release the lock will delete the file.
        (Each process append a character to the file, so the position in
        the file when acquiring the lock indicates how many processes have
        already successfully compiled the model).

    The value yielded by the context is True only for the first process to
    acquire the lock.

    The lock file is created in the cache folder and named after the MD5 of
    ``repr(model)``.
    """
    lock_path = None
    if opts.Distributed.numProcesses > 1:
        cache_dir = opts._popart.options.get("cachePath", "")  # pylint: disable=protected-access
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
            assert os.access(cache_dir,
                             os.W_OK), (f"Cache folder {cache_dir}"
                                        " is not writable")
            digest = hashlib.md5(repr(model).encode("utf-8")).hexdigest()
            lock_path = os.path.join(cache_dir, f"{digest}.lock")
        else:
            logger.warning(
                "Use poptorch.Options.enableExecutableCaching() to avoid "
                "compiling the model once per process")

    # Not distributed mode or the cache is not enabled: do nothing.
    if not lock_path:
        yield False
        return

    is_last_process = False
    try:
        with open(lock_path, "a+") as lock_file:
            try:
                fcntl.flock(lock_file, fcntl.LOCK_EX)
                # Add a character to the file: the resulting position is the
                # number of processes which have taken the lock so far.
                lock_file.write("0")
                position = lock_file.tell()
                logger.debug(
                    "Executable cache file locked by process %s (pos %d/%d)",
                    opts.Distributed.processId, position,
                    opts.Distributed.numProcesses)
                is_last_process = position == opts.Distributed.numProcesses
                # Only the first process should compile
                yield position == 1
            finally:
                logger.debug("Process %s released the cache lock",
                             opts.Distributed.processId)
                fcntl.flock(lock_file, fcntl.LOCK_UN)
    finally:
        if is_last_process:
            os.remove(lock_path)


# A helper class that compares using pointer semantics rather than value
# semantics (i.e. comparing using `is` rather than eq). This is needed because
# Tensor comparison in torch returns a Tensor rather than an boolean
class WeakPtr(weakref.ref):
    """Weak reference comparing its referent by identity (``is``) rather
    than by value (``==``).

    Two live references are equal if and only if they point at the same
    object. A dead reference is only equal to itself. Comparing with
    anything which is not a weak reference returns ``NotImplemented``
    (so ``==`` evaluates to False instead of raising).
    """
    # Defining __eq__ resets __hash__ to None: restore the parent's.
    __hash__ = weakref.ref.__hash__

    def __eq__(self, other):
        if not isinstance(other, weakref.ref):
            return NotImplemented
        s = self()
        if s is None:
            # The referent is gone: fall back to comparing the references.
            return self is other
        return s is other()

    def __ne__(self, other):
        # Defined explicitly so that ``!=`` is always the negation of the
        # identity based __eq__ above, whatever the parent class does.
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return NotImplemented
        return not equal


# Our own dictionary with weak keys that compares keys using pointer semantics
# rather than value semantics (i.e. comparing using `is` rather than `eq`). We
# use this rather than a weakref.WeakKeyDictionary because that uses equality on
# values to compare items.
#
# Note: that we do not provide functionality for iterating over the dictionary
# since there will be issues if the cleanup function is called while iterating
class WeakKeyPtrDict:
    """Dictionary holding its keys weakly and comparing them by identity.

    Keys are stored in ``self.data`` as ``WeakPtr`` objects. An entry is
    removed by a weak reference callback once its key is garbage collected.
    """

    def __init__(self, dict=None):
        self.data = {}

        # The callback only holds a weak reference to the dictionary so it
        # doesn't keep it alive.
        owner_ref = weakref.ref(self)

        def cleanup(dead_key):
            owner = owner_ref()
            if owner is not None:
                del owner.data[dead_key]

        self._cleanup = cleanup

        self.update(dict)

    def __setitem__(self, key, value):
        weak_key = WeakPtr(key, self._cleanup)
        self.data[weak_key] = value

    def __delitem__(self, key):
        del self.data[WeakPtr(key)]

    def __getitem__(self, key):
        return self.data[WeakPtr(key)]

    def get(self, key, default=None):
        """Return the value stored for ``key`` or ``default`` if there is
        none."""
        return self.data.get(WeakPtr(key), default)

    def __contains__(self, key):
        return WeakPtr(key) in self.data

    def update(self, dict=None):
        """Add all the items from the mapping ``dict`` (if not None)."""
        if dict is None:
            return
        for key, value in dict.items():
            self[key] = value


# The pickle handlers are called in two cases: when an object is copied
# (i.e copy.copy(obj)) or when an object is pickled / serialised.
# In both cases the object is first dumped using pickleUnwrapModel and then
# in the copy case _restoreWrapperIfNecessary() is called immediately after
# to create the new object.
#
# The _wrapper_registry keeps track of the mapping between user model, parameter,
# buffer types and their corresponding wrapper.

# When an object is copied we want to preserve the Wrapper type: the PopTorch
# wrapper doesn't contain any attribute so it's just a question of updating
# the __class__ attribute.
#
# When an object is loaded from file: the wrapper type doesn't exist any more
# therefore we keep the object unwrapped. (It will be wrapped again when passed
# to poptorch.trainingModel anyway)
# Maps an (unwrapped) object to the wrapper type to restore on it. The keys
# are held weakly and compared by identity (see WeakKeyPtrDict).
_wrapper_registry = WeakKeyPtrDict()
# List of all the wrapper types used by PopTorch.
# (Filled by registerWrapperType()).
_wrapper_types = []


def _restoreWrapperIfNecessary(obj):
    """Give ``obj`` back the wrapper type recorded for it in
    ``_wrapper_registry`` (if any) and return ``obj``."""
    registered_type = _wrapper_registry.get(obj)
    if registered_type is not None:
        obj.__class__ = registered_type
    return obj


def _unwrapIfWrappedAndRegister(obj):
    """If ``obj`` is wrapped: replace its class by the first base of the
    wrapper type and record the wrapper type in ``_wrapper_registry`` so it
    can be restored later."""
    if not isWrapped(obj):
        return
    wrapper_type = obj.__class__
    obj.__class__ = wrapper_type.__bases__[0]
    _wrapper_registry[obj] = wrapper_type


def _pickleUnwrapObject(obj):
    """Reduction function registered with ``copyreg`` for the wrapper types.

    Make an unwrapped shallow copy of ``obj``, record the wrapper type of the
    copy in ``_wrapper_registry`` and return the
    ``(callable, args)`` pair used to re-create the object:
    ``_restoreWrapperIfNecessary(copy)``.

    ``obj`` itself always gets its wrapper type back, even if the copy fails.

    :raises poptorch_core.Error: if the type of ``obj`` is not a registered
        wrapper type.
    """
    wrapperType = obj.__class__
    if wrapperType not in _wrapper_types:
        raise createPoptorchError("Internal Error")

    # We need to unwrap obj before copying it because this is the function
    # registered for doing copies
    obj.__class__ = wrapperType.__bases__[0]
    try:
        other = copy.copy(obj)
    finally:
        # Never leave obj unwrapped, even if the copy raised.
        obj.__class__ = wrapperType
    _wrapper_registry[other] = wrapperType
    return _restoreWrapperIfNecessary, (other, )


def registerWrapperType(wrapper_type):
    """Add ``wrapper_type`` to the list of PopTorch wrapper types and
    register ``_pickleUnwrapObject`` as its reduction function (used when
    copying or pickling instances of that type).

    A given type must only be registered once.
    """
    already_registered = wrapper_type in _wrapper_types
    assert not already_registered
    _wrapper_types.append(wrapper_type)
    copyreg.pickle(wrapper_type, _pickleUnwrapObject)


def isWrapped(obj):
    """Return True if ``obj`` is an instance of one of the registered
    wrapper types."""
    known_wrappers = tuple(_wrapper_types)
    return isinstance(obj, known_wrappers)


def unwrapIfWrapped(obj):
    """Unwrap the model if it is wrapped, without unwrapping parameters and
       buffers.

       The object is modified in place (its class becomes the first base of
       the wrapper type) and returned.
    """
    if not isWrapped(obj):
        return obj
    wrapper_type = obj.__class__
    obj.__class__ = wrapper_type.__bases__[0]
    return obj


def traceMethod(label):
    """Method decorator running the decorated method inside
    ``self._profiling.tracepoint(label)``."""

    def decorator(func):
        @wraps(func)
        def traced(self, *args, **kwargs):
            tracepoint = self._profiling.tracepoint(label)  # pylint: disable=protected-access
            with tracepoint:
                return func(self, *args, **kwargs)

        return traced

    return decorator


def forEachParameterAndBuffer(model, fn):
    """Call ``fn(name, tensor)`` for every buffer and then every parameter
    of ``model``.

    The buffers directly owned by modules whose type is in
    ``BUFFERS_CAN_CHANGE`` are skipped.
    """
    for module_name, module in model.named_modules():
        if not isinstance(module, BUFFERS_CAN_CHANGE):
            own_buffers = module.named_buffers(prefix=module_name,
                                               recurse=False)
            for buffer_name, buffer in own_buffers:
                fn(buffer_name, buffer)

    for param_name, param in model.named_parameters():
        fn(param_name, param)


def unwrapModelIfNecessary(model: torch.nn.Module):
    """Unwrap the model, including parameter and buffer annotations and the
       model as a whole."""
    # Buffers first, then parameters, then the model itself.
    for buff in model.buffers():
        _unwrapIfWrappedAndRegister(buff)
    for param in model.parameters():
        _unwrapIfWrappedAndRegister(param)

    _unwrapIfWrappedAndRegister(model)


def rewrapModelIfNecessary(model: torch.nn.Module):
    """Restore the wrapper types recorded for the buffers and parameters of
       the model (the PoptorchParameter and PoptorchBuffer annotations) and
       for the model itself."""
    # Buffers first, then parameters, then the model itself.
    for buff in model.buffers():
        _restoreWrapperIfNecessary(buff)
    for param in model.parameters():
        _restoreWrapperIfNecessary(param)

    _restoreWrapperIfNecessary(model)


def getBufferAndParameterTensors(model):
    """Return a dictionary mapping the names of the buffers and parameters
    visited by ``forEachParameterAndBuffer()`` to the tensors themselves."""
    tensors = {}
    # The callback receives (name, tensor): store it straight into the dict.
    forEachParameterAndBuffer(model, tensors.__setitem__)
    return tensors


def getBufferAndParameterAddresses(model):
    """Obtain dictionaries of the addresses of every buffer and parameter.

    An "address" is the pair ``(device, id)`` where id is the IPU tensor id
    for tensors on the IPU and the data pointer otherwise.

    The buffers directly owned by modules whose type is in
    ``BUFFERS_CAN_CHANGE`` are skipped.

    :returns: the tuple ``(buffer_addresses, parameter_addresses)``, both
        dictionaries being indexed by tensor name.
    """

    def tensor_info(x):
        identifier = getIpuTensorId(x) if isOnIpu(x) else x.data_ptr()
        return x.device, identifier

    buffer_addresses = {}
    for module_name, module in model.named_modules():
        if not isinstance(module, BUFFERS_CAN_CHANGE):
            own_buffers = module.named_buffers(prefix=module_name,
                                               recurse=False)
            for name, buff in own_buffers:
                buffer_addresses[name] = tensor_info(buff)

    parameter_addresses = {
        name: tensor_info(param)
        for name, param in model.named_parameters()
    }

    return buffer_addresses, parameter_addresses


def errorOnBufferOrParameterAddressChanges(old_addresses, new_addresses):
    """Raise an error if a buffer or a parameter from ``old_addresses`` is
    missing from ``new_addresses`` or has a different address there.

    Both arguments are ``(buffers, parameters)`` pairs of dictionaries
    indexed by tensor name. Entries which only exist in ``new_addresses``
    are ignored.
    """
    # Do the buffers first then parameters
    kinds = ["Buffer", "Parameter"]
    for idx, previous in enumerate(old_addresses):
        for name, address in previous.items():
            if name not in new_addresses[idx]:
                raise createPoptorchError(
                    f"{kinds[idx]} {name} is removed from the model when"
                    " calling the forward method.")

            if address == new_addresses[idx][name]:
                continue

            raise createPoptorchError(
                f"{kinds[idx]} {name} is reassigned within the model"
                " when calling the forward method. This is not supported. "
                f"Consider using self.{name}.copy_(src) to copy data from "
                "a source tensor, where src is the name of the source "
                "tensor.")


================================================
FILE: python/_logging.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import datetime as dt
import logging
import os
import signal
import sys
import subprocess
import traceback
import faulthandler

# Print tracebacks even when Python dies (e.g Segfault)
faulthandler.enable()
# Also dump the tracebacks when SIGTERM is received; chain=True keeps calling
# the handler which was previously installed for that signal.
faulthandler.register(signal.SIGTERM.value, chain=True)  # pylint: disable=no-member

# Create the poptorch logger. Its level is initialised further down in this
# module from the POPTORCH_LOG_LEVEL environment variable ("WARN" if unset).
logger = logging.getLogger("poptorch::python")

# Maps a log level name to the pair (integer passed to poptorch_core,
# level applied to the Python logger).
# Several names share the same values (e.g "TRACE" / "TRACE_ALL").
_LOG_LEVEL_MAPPING = {
    "TRACE": (0, logging.DEBUG),
    "TRACE_ALL": (0, logging.DEBUG),
    "DEBUG": (1, logging.DEBUG),
    "DEBUG_IR": (1, logging.DEBUG),
    "INFO": (2, logging.INFO),
    "WARN": (3, logging.WARN),
    "ERR": (4, logging.ERROR),
    "OFF": (6, logging.CRITICAL)
}

# Level names which are accepted by setLogLevel() but are not listed in the
# "Valid values are ..." part of its error message.
_INTERNAL_ONLY = ("TRACE_ALL", "DEBUG_IR")


def setPopartLogLevel(level):
    """Forward a named log level to ``poptorch_core.setPopartLogLevel``.

    Args:
        level: one of the keys of ``_LOG_LEVEL_MAPPING`` (a string).

    Raises:
        ValueError: if ``level`` is not a string or not a known level name.
    """
    is_known = isinstance(level, str) and level in _LOG_LEVEL_MAPPING
    if not is_known:
        raise ValueError("Level must be one of " +
                         ", ".join(_LOG_LEVEL_MAPPING.keys()))
    # Only import poptorch_core when it's needed
    import poptorch.poptorch_core as poptorch_core  # type: ignore # pylint: disable=wrong-import-position, import-outside-toplevel
    core_level, _ = _LOG_LEVEL_MAPPING[level]
    poptorch_core.setPopartLogLevel(core_level)


def setLogLevel(level, update_cpp=True):
    """Set the PopTorch log level.

    Args:
        level: name of the level (a key of ``_LOG_LEVEL_MAPPING``) or, for
            legacy callers, the integer associated with a level.
        update_cpp: if True the level is also forwarded to
            ``poptorch_core.setLogLevel`` (which requires importing
            ``poptorch.poptorch_core``); if False only the Python logger is
            updated.

    Raises:
        ValueError: if ``level`` is neither a known name nor a known integer.
    """
    if isinstance(level, int):
        # Legacy usage: translate the integer to the first matching name.
        for key, (level_int, _) in _LOG_LEVEL_MAPPING.items():
            if level_int == level:
                # Forward update_cpp so that a legacy caller asking for a
                # Python-only update does not trigger the C++ import.
                setLogLevel(key, update_cpp)
                return

        raise ValueError("Invalid log level integer")

    # Validate the name before touching anything so that an invalid level
    # neither imports poptorch_core nor leaves the loggers half updated.
    try:
        level_int, level_py = _LOG_LEVEL_MAPPING[level]
    except (KeyError, TypeError):
        # TypeError: unhashable values (e.g a list) cannot be looked up.
        public_keys = sorted(k for k in _LOG_LEVEL_MAPPING
                             if k not in _INTERNAL_ONLY)
        valid_values = ", ".join(public_keys[:-1]) + " and " + public_keys[-1]
        raise ValueError("Unknown log level: " + str(level) +
                         ". Valid values are " + valid_values) from None

    # Change it in C++ first
    if update_cpp:
        # Only import poptorch_core when it's needed
        import poptorch.poptorch_core as poptorch_core  # type: ignore # pylint: disable=wrong-import-position, import-outside-toplevel
        poptorch_core.setLogLevel(level_int)

    # Then in python
    logger.setLevel(level_py)


# Initialise the Python logger level when this module is imported: use the
# POPTORCH_LOG_LEVEL environment variable, or "WARN" if it is not set.
# update_cpp=False: poptorch_core is not imported / updated at this point.
setLogLevel(os.environ.get("POPTORCH_LOG_LEVEL", "WARN"), update_cpp=False)


class _PoptorchFormatter(logging.Formatter):
    BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(30, 38)
    RESET_COLOR = "\033[0m"
    BOLD_COLOR_SEQ = "\033[1;%dm"
    COLOR_SEQ = "\033[%dm"
    FORMATS = {
        logging.DEBUG: COLOR_SEQ % CYAN,
        logging.INFO: RESET_COLOR,
        logging.WARNING: BOLD_COLOR_SEQ % YELLOW,
        logging.ERROR: BOLD_COLOR_SEQ % RED,
        logging.CRITICAL: BOLD_COLOR_SEQ % RED,
    }

    def outputToFile(self):
        return not sys.stdout.isatty() or not sys.stderr.isatty()

    def __init__(self):
        fmt = "[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s"
        # Disable the colours when the output is redirected to a file.
        if self.outputToFile():
            super().__init__(fmt)
        else:
            super().__init__("%(color)s" + fmt + self.RESET_COLOR)

    def formatTime(self, record, datefmt=None):
        ct = dt.datetime.fromtimestamp(record.created)
        if datefmt:
            s = ct.strftime(datefmt)
        else:
            t = ct.strftime("%H:%M:%S")
            s = "%s.%03d" % (t, record.msecs)
        return s

    def format(self, record):
        record.color = self.FORMATS[record.levelno]
        record.levelname = record.levelname.lower()
        return super().format(record)


def _excepthook(*args):
    """Replacement for ``sys.excepthook`` reporting uncaught exceptions.

    Args:
        *args: the ``(type, value, traceback)`` triple passed by Python.

    The formatted traceback is sent to the PopTorch logger at CRITICAL level,
    or printed on stderr if one of the logger's handlers looks closed.
    If the exception is a ``subprocess.CalledProcessError`` the captured
    stderr (or stdout if stderr is empty) of the subprocess is appended.
    """
    e = traceback.format_exception(*args)
    extra_info = ""
    # If the exception was raised by a subprocess print its
    # stderr / stdout if available.
    if isinstance(args[1], subprocess.CalledProcessError):
        output = args[1].stderr or args[1].stdout
        # The output is None if it wasn't captured and is already a str if
        # the subprocess was run in text mode: only decode actual bytes and
        # never let a decoding problem hide the original exception.
        if isinstance(output, bytes):
            output = output.decode("utf-8", errors="replace")
        if output:
            extra_info = "\n" + output

    if any("[closed]" in repr(h) for h in logger.handlers):
        # In some cases pytest has already closed the logger so use stderr
        # as a fallback.
        # print() doesn't interpolate its arguments like the logging
        # functions do: the message must be formatted explicitly.
        print("%s\n%s%s" % (e[-1], "".join(e), extra_info), file=sys.stderr)
    else:
        logger.critical("%s\n%s%s", e[-1], "".join(e), extra_info)


# Attach a console handler using the PopTorch formatter to the logger.
# The handler itself lets everything from DEBUG upwards through: the
# effective filtering is done by the level set on the logger.
_console = logging.StreamHandler()
_console.setFormatter(_PoptorchFormatter())
_console.setLevel(logging.DEBUG)
logger.addHandler(_console)
# Report uncaught exceptions through _excepthook (defined above).
sys.excepthook = _excepthook


================================================
FILE: python/_optimizer_attributes.py
================================================
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
import enum
import numbers
import torch

# Do not import any poptorch.* here: it will break the poptorch module
from ._logging import logger
from . import optim, enums


class OptimizerAttrTracker:
    """Remember which optimizer attributes were seen and report new ones.

    While ``record_attributes`` is True every attribute passed to one of the
    ``check*`` methods is recorded as expected. After ``enableChecks()`` has
    been called, attributes which were not recorded are reported once through
    ``self.log``.
    """

    def __init__(self, opts):
        # Unexpected attributes are only reported at debug level if the
        # checks are relaxed in the options, as warnings otherwise.
        relaxed = opts._relax_optimizer_checks
        self.log = logger.debug if relaxed else logger.warning
        self.group_attributes = []
        self.optim_attributes = []
        self.record_attributes = True
        self.printed_msgs = []
        self.type = "Unknown"

    def setType(self, optimizer_type):
        """Store the name of ``optimizer_type`` for use in the messages."""
        assert isinstance(optimizer_type, _OptimizerType), (
            "Unsupported optimizer type. Types supported %s" %
            ', '.join(str(t) for t in _OptimizerType))
        self.type = optimizer_type.name

    def enableChecks(self):
        """Stop recording: from now on unknown attributes are reported."""
        self.record_attributes = False

    def checkDefaultAttributes(self, provided):
        """Check the names found in the optimizer's ``defaults``."""
        self._check(self.group_attributes, provided, "default group variable")

    def checkGroupAttributes(self, provided, group):
        """Check the names found in the parameter group number ``group``."""
        self._check(self.group_attributes, provided,
                    f"group {group} attribute")

    def checkOptimizerAttributes(self, provided):
        """Check the names of the attributes set on the optimizer object."""
        self._check(self.optim_attributes, provided, "optimizer attribute")

    def _check(self, expected, provided, attr_type):
        """Record or report the names of ``provided`` missing in ``expected``.

        ``expected`` is extended in place while recording.
        """
        unknown = [name for name in provided if name not in expected]
        if self.record_attributes:
            expected.extend(unknown)
            return
        if not unknown:
            return
        msg = (f"Ignoring unexpected {attr_type} in {self.type} optimizer:"
               f" {unknown}")
        # Each distinct message is only printed once.
        if msg in self.printed_msgs:
            return
        self.log(msg)
        self.printed_msgs.append(msg)


# pylint: disable=too-many-statements
def convertOptimizerToDict(optimizer, attr_tracker, options, is_compiled):
    """Convert ``optimizer`` to the dictionary sent to the backend.

    Args:
        optimizer: the ``torch.optim`` / ``poptorch.optim`` optimizer.
        attr_tracker: ``OptimizerAttrTracker`` used to record (first call) or
            report (following calls) the attributes found on the optimizer.
        options: the PopTorch options. They may be modified if the optimizer
            uses a float16 ``accum_type`` and ``is_compiled`` is False.
        is_compiled: whether the executable has already been compiled.

    Returns:
        A dictionary containing ``"optimizer_type"``, the optimizer
        attributes, ``"defaults"`` and ``"groups"`` (see the layout described
        in the body of the function).

    Raises:
        ValueError: if an attribute has an unsupported value or if the
            optimizer would require changing the mean reduction strategy of
            an already compiled executable.
    """
    optimizer_type = _toPoptorchOptimizer(optimizer)
    # Validate the type before passing it to the tracker so that an
    # unsupported optimizer is reported from here.
    assert optimizer_type is not None, \
        "Unsupported optimizer type. Types supported %s" % \
        ', '.join(str(t) for t in _OptimizerType)
    attr_tracker.setType(optimizer_type)
    opt_class = _toPoptorchClass(optimizer_type)

    variable_attrs = getattr(optimizer, "variable_attrs", None)

    def assertAmsgradDisabled(params):
        if params["amsgrad"]:
            raise ValueError("Only non-amsgrad "
                             "Adam/AdamW optimizers are supported.")
        return {}

    def isFloat16(dtype, name):
        if dtype not in [torch.float16, torch.float32]:
            raise ValueError(f"{name} must be set to either torch.float16"
                             f" or torch.float32 not {dtype}")
        return dtype == torch.float16

    def assertRMSProp(value, name):
        if optimizer_type not in (_OptimizerType.RMSPROP,
                                  _OptimizerType.RMSPROP_CENTERED):
            raise ValueError(
                f"{name} is only available with RMSProp optimizers.")
        return value

    def ignore(_params):
        return {}

    def isAlwaysConst(_value):
        return True

    def isNeverConst(_value):
        return False

    def isNotNaN(value, name):
        import math  # pylint: disable=import-outside-toplevel
        # NaN never compares equal to anything (not even to itself) so it
        # cannot be detected with "value == float('nan')".
        if isinstance(value, numbers.Real) and math.isnan(value):
            raise ValueError(f"{name} must not be NaN")
        return value

    # Separate attributes which can be set per group (And therefore are stored
    # in `defaults` and `param_groups`) and the ones which are global and just
    # stored as attributes of the optimizer.

    # Register all the attribute readers
    attr_readers = {
        "amsgrad": assertAmsgradDisabled,
        "bias_correction": ignore,
        "centered": ignore,
        "use_combined_accum": ignore
    }
    # Optimizer attributes: global, cannot change over time.
    #     source: opt.name
    #     format: {name: value}
    _AttrReader(attr_readers, "accum_type", _OptimizerGetter(torch.float32),
                isFloat16)
    _AttrReader(attr_readers, "velocity_accum_type",
                _OptimizerGetter(torch.float32), isFloat16)
    _AttrReader(attr_readers, "first_order_momentum_accum_type",
                _OptimizerGetter(torch.float32), isFloat16)
    _AttrReader(attr_readers, "second_order_momentum_accum_type",
                _OptimizerGetter(torch.float32), isFloat16)
    _AttrReader(attr_readers, "use_tf_variant", _OptimizerGetter(False),
                assertRMSProp)
    _AttrReader(attr_readers, "max_grad_norm", _OptimizerGetter(float("Inf")),
                isNotNaN)
    # Optimizer variables: global, can change over time.
    #     source: opt.name
    #     format: {name: (value, is_const)}

    # Set MeanReductionStrategy based on accum_type
    #     float32: Post (default)
    #     float16: Running
    if hasattr(optimizer,
               "accum_type") and optimizer.accum_type == torch.float16:
        # Only Post MeanReductionStrategy is supported for combined_accum variant
        if not getattr(optimizer, "use_combined_accum", False):
            if not is_compiled:
                # If the executable hasn't been compiled yet then it's ok to change
                # the reduction strategy.
                options._unfreeze()  # pylint: disable=protected-access
                options.Training.setMeanAccumulationAndReplicationReductionStrategy(  # pylint: disable=line-too-long
                    enums.MeanReductionStrategy.Running)
                options._freeze()  # pylint: disable=protected-access
            elif options.Training.meanAccumulationAndReplicationReductionStrategy != enums.MeanReductionStrategy.Running:  # pylint: disable=line-too-long
                raise ValueError(
                    "Invalid optimizer: the new optimizer would "
                    "require changing options.Training."
                    "meanAccumulationAndReplicationReductionStrategy to "
                    "poptorch.MeanReductionStrategy.Running but the "
                    "executable is already compiled.")

    # pylint: disable=protected-access
    auto_loss_scaling = options._Popart.options.get(
        "automaticLossScalingSettings.enabled", False)
    if variable_attrs and auto_loss_scaling:
        # Automatic loss scaling requires loss scaling to be variable
        variable_attrs.markAsVariable("loss_scaling")
    _AttrReader(
        attr_readers, "loss_scaling", _OptimizerGetter(1.0),
        _ValueConstPairFormatter(
            variable_attrs, lambda v: v == 1.0 and not auto_loss_scaling))
    # Group variables: per group, can change over time.
    #     source: opt.param_groups[i][name] / opt.defaults[name]
    #     format: {name: (value, is_const)}
    _AttrReader(attr_readers,
                "lr",
                _GroupGetter(),
                _ValueConstPairFormatter(variable_attrs, isNeverConst),
                new_name="learningRate")
    weight_decay_const_value = 0.0
    # In PyTorch AdamW has a different default value from Adam
    if optimizer_type == _OptimizerType.ADAMW:
        weight_decay_const_value = 1e-2

    _AttrReader(
        attr_readers, "weight_decay", _GroupGetter(),
        _ValueConstPairFormatter(variable_attrs,
                                 _IsEqualTo(weight_decay_const_value)))
    _AttrReader(attr_readers, "momentum", _GroupGetter(),
                _ValueConstPairFormatter(variable_attrs, _IsEqualTo(0.0)))
    _AttrReader(attr_readers, "velocity_scaling", _GroupGetter(1.0),
                _ValueConstPairFormatter(variable_attrs, _IsEqualTo(1.0)))
    _AttrReader(attr_readers, "dampening", _GroupGetter(),
                _ValueConstPairFormatter(variable_attrs, _IsEqualTo(0.0)))
    _AttrReader(attr_readers, "eps", _GroupGetter(),
                _ValueConstPairFormatter(variable_attrs, _IsEqualTo(1e-08)))
    _AttrReader(attr_readers, "max_weight_norm", _GroupGetter(),
                _ValueConstPairFormatter(variable_attrs, _IsEqualTo(65500.0)))
    _AttrReader(attr_readers, "alpha", _GroupGetter(),
                _ValueConstPairFormatter(variable_attrs, isAlwaysConst))
    _AttrReader(attr_readers, "nesterov", _GroupGetter(),
                _ValueConstPairFormatter(variable_attrs, isAlwaysConst))
    _BetaReader(attr_readers, variable_attrs)

    # Split the optimizer's attributes in one of the three categories:
    # - Group variables
    # - Optimizer variables
    # - Optimizer attributes
    #
    # The optimizer dictionary we send to the backend is structured like:
    # {
    #   "optimizer_type": type,
    #   "opt_attrs_0": value,
    #   ...
    #   "defaults": {
    #       "group_vars_0": (value, is_const),
    #       ...
    #       "opt_vars_0": (value, is_const),
    #       ...
    #   },
    #   "groups": [
    #       {
    #           "group_vars_0": (value, is_const),
    #           ...
    #       },
    #       ...
    #   ]
    # }
    group_vars = opt_class._group_vars  # pylint: disable=protected-access
    all_attrs = [
        attr for attr in opt_class._child_only if attr not in group_vars  # pylint: disable=protected-access
    ]
    opt_attrs = [
        attr for attr in all_attrs if attr not in opt_class._child_vars  # pylint: disable=protected-access
    ]
    opt_vars = [
        attr for attr in opt_class._child_only  # pylint: disable=protected-access
        if attr in opt_class._child_vars  # pylint: disable=protected-access
    ]

    def getOptimizerAttrNames(opt):
        # Remove attributes belonging to the upstream Optimizer
        exceptions = ["defaults", "state", "param_groups", "variable_attrs"]
        return [k for k in opt.__dict__.keys() if k not in exceptions]

    def getGroupAttrNames(group):
        # Remove attributes belonging to the upstream Optimizer
        exceptions = ["params"]
        return [k for k in group.keys() if k not in exceptions]

    opts = {"optimizer_type": optimizer_type}
    for attr in opt_attrs:
        opts.update(attr_readers[attr](optimizer))
    defaults = {}
    for attr in group_vars:
        defaults.update(attr_readers[attr](optimizer.defaults))
    attr_tracker.checkDefaultAttributes(list(optimizer.defaults.keys()))
    for attr in opt_vars:
        defaults.update(attr_readers[attr](optimizer))
    attr_tracker.checkOptimizerAttributes(getOptimizerAttrNames(optimizer))
    for i, g in enumerate(optimizer.param_groups):
        attr_tracker.checkGroupAttributes(getGroupAttrNames(g), i)

    opts["defaults"] = defaults

    # Create one dictionary per parameter group
    opts["groups"] = []
    for params in optimizer.param_groups:
        group = {}
        for attr in group_vars:
            group.update(attr_readers[attr](params))
        opts["groups"].append(group)

    logger.debug("Python optimizer %s", opts)
    # From now on print a message when encountering unknown attributes
    attr_tracker.enableChecks()
    return opts


class _OptimizerType(enum.IntEnum):
    """Optimizer variants selected by ``_toPoptorchOptimizer``.

    The selected member is stored under ``"optimizer_type"`` in the
    dictionary built by ``convertOptimizerToDict``.
    NOTE(review): the integer values presumably have to match an enum on the
    backend side - confirm before renumbering.
    """
    # SGD with use_combined_accum set
    SGD1 = 0
    # SGD without use_combined_accum
    SGD2 = 1
    ADAM = 2
    ADAMW = 3
    # poptorch AdamW with bias_correction disabled
    ADAMW_NO_BIAS = 4
    RMSPROP = 5
    # RMSprop with the "centered" group attribute set
    RMSPROP_CENTERED = 6
    LAMB = 7
    # LAMB with bias_correction disabled
    LAMB_NO_BIAS = 8


def _toPoptorchClass(optimizer_type):
    """Return the ``poptorch.optim`` class implementing ``optimizer_type``.

    Several ``_OptimizerType`` members map to the same class (for example
    both SGD variants map to ``optim.SGD``).
    """
    assert isinstance(optimizer_type, _OptimizerType)
    # (members, name of the class in the optim module); the class is only
    # looked up once a family matches.
    families = (
        ((_OptimizerType.ADAMW, _OptimizerType.ADAMW_NO_BIAS), "AdamW"),
        ((_OptimizerType.RMSPROP, _OptimizerType.RMSPROP_CENTERED),
         "RMSprop"),
        ((_OptimizerType.LAMB, _OptimizerType.LAMB_NO_BIAS), "LAMB"),
        ((_OptimizerType.SGD1, _OptimizerType.SGD2), "SGD"),
    )
    for members, class_name in families:
        if optimizer_type in members:
            return getattr(optim, class_name)
    # Adam is the only type left.
    assert optimizer_type == _OptimizerType.ADAM, (
        "Unknown optimizer_type %s" % optimizer_type)
    return optim.Adam


# pylint: disable=too-many-return-statements
def _toPoptorchOptimizer(optimizer):
    """Return the ``_OptimizerType`` matching ``optimizer``.

    Returns ``None`` if the optimizer is not an instance of one of the
    supported classes. A warning is logged if the optimizer's class does not
    directly derive from ``torch.optim.Optimizer`` or
    ``poptorch.optim.Optimizer``.
    """
    # If an optimizer has anything other than torch.optim.Optimizer or
    # poptorch.optim.Optimizer as its parent classes, it may be an attempt to
    # implement a custom optimizer through subclassing
    direct_parents = optimizer.__class__.__bases__
    derives_from_base = (torch.optim.Optimizer in direct_parents
                         or optim.Optimizer in direct_parents)
    if not derives_from_base:
        logger.warning(
            "Optimizer `%s` subclassed from classes in poptorch.optim or "
            "torch.optim are unlikely to behave as intended. Poptorch does "
            "not run Python optimizer code directly; but instead uses IPU "
            "native optimisers implemented in PopART.",
            type(optimizer).__name__)

    if isinstance(optimizer, torch.optim.SGD):
        if getattr(optimizer, "use_combined_accum", False):
            return _OptimizerType.SGD1
        return _OptimizerType.SGD2

    if isinstance(optimizer, torch.optim.Adam):
        return _OptimizerType.ADAM

    if isinstance(optimizer, torch.optim.AdamW):
        # Only the poptorch AdamW can disable the bias correction.
        without_bias = (isinstance(optimizer, optim.AdamW)
                        and not getattr(optimizer, "bias_correction", True))
        if without_bias:
            return _OptimizerType.ADAMW_NO_BIAS
        return _OptimizerType.ADAMW

    if isinstance(optimizer, torch.optim.RMSprop):
        groups = optimizer.param_groups
        centered = groups[0]["centered"]
        # The type is global to the optimizer: all the groups must agree.
        for i, group in enumerate(groups):
            assert group["centered"] == centered, (
                "All parameter groups must "
                "have the same value for the 'centered' attribute (Group 0: "
                f"{centered} / Group {i}: {group['centered']})")

        if centered:
            return _OptimizerType.RMSPROP_CENTERED
        return _OptimizerType.RMSPROP

    if isinstance(optimizer, optim.LAMB):
        if getattr(optimizer, "bias_correction", True):
            return _OptimizerType.LAMB
        return _OptimizerType.LAMB_NO_BIAS

    return None


def _toCamelCase(string):
    """Convert a snake case string (PyTorch) to camel case (PopART)"""
    words = string.split("_")
    return words[0] + "".join(w.capitalize() for w in words[1:])


class _GroupGetter:
    """Functor to access a parameter group attribute"""

    def __init__(self, default_value=None):
        self.default_value = default_value

    def __call__(self, group, name):
        assert isinstance(group, dict), (f"{name} must be stored in "
                                         "param_groups")
        value = group.get(name, self.default_value)
        assert value is not None, (f"Mandatory attribute {name} not found "
                                   "in optimizer group")
        return value


class _OptimizerGetter:
    """Functor to access an Optimizer attribute"""

    def __init__(self, default_value=None):
        self.default_value = default_value

    def __call__(self, opt, name):
        assert isinstance(opt, torch.optim.Optimizer), (
            f"{name} must be stored "
            "as an Optimizer attribute (Not in a group)")
        value = getattr(opt, name, self.default_value)
        assert value is not None, (f"Mandatory attribute {name} not found "
                                   "in optimizer attributes")
        return value


def _assertIsNumber(value, name):
    assert isinstance(value, numbers.Number), (f"Expected a number for {name}"
                                               f" but got {value} instead")


class _ValueConstPairFormatter:
    """Functor to format a value into a pair ``(value, is_const)`` where
    "is_const" is a boolean

    If ``variable_attrs`` is provided it will be used to determine the
    attribute's const-ness.

    Otherwise the ``const_evaluator`` function will be called.
    """

    def __init__(self, variable_attrs, const_evaluator, value_validator=None):
        assert variable_attrs is None or isinstance(variable_attrs,
                                                    optim.VariableAttributes)
        if value_validator is None:
            value_validator = _assertIsNumber
        self.value_validator = value_validator
        self.variable_attrs = variable_attrs
        self.const_evaluator = const_evaluator

    def __call__(self, value, name):
        self.value_validator(value, name)
        if self.variable_attrs:
            is_const = self.variable_attrs.isConstant(name)
        else:
            is_const = self.const_evaluator(value)
        return (value, is_const)


class _IsEqualTo:
    """Functor which returns True if the passed value is equal to the reference"""

    def __init__(self, reference):
        # Value every argument passed to __call__ is compared against.
        self.reference = reference

    def __call__(self, value):
        """Return the result of ``value == reference``."""
        return value == self.reference


class _AttrReader:
    def __init__(self, readers, name, getter, formatter=None, new_name=None):
        if new_name is None:
            new_name = _toCamelCase(name)
        if formatter is None:
            formatter = lambda x, _: x

        self.name = name
        self.getter = getter
        self.new_name = new_name
        self.formatter = formatter

        # Register itself
        readers[name] = self

    def __call__(self, params):
        """Get the ``name`` attribute value from ``params`` (An ``optimizer`` or
           ``param_group``)
        - if ``name`` is not part of ``params`` then ``default_value`` will be
          used.
        - If no ``variable_attrs`` list and no const value are provided then
          only ``{name: value}`` will be returned.
        - if a ``variable_attrs`` object is provided then the parameter's
          const-ness will depend on whether or not it's marked as const.
        - if no list is provided but the parameter's value is equal to
          ``is_const_val`` then the parameter will be considered constant
        """
        value = self.getter(params, self.name)
        return {self.new_name: self.formatter(value, self.name)}


class _BetaReader(_AttrReader):
    """Reader for the ``betas`` group attribute.

    The pair stored under ``betas`` is split into two separate entries,
    ``beta1`` and ``beta2``, sharing the same const-ness.
    """

    def __init__(self, attr_readers, variable_attrs):
        def checkPairOfNumbers(value, name):
            assert isinstance(value, tuple) and len(value) == 2, (
                f"Expected a pair for {name}"
                f" but got {value} instead")
            for position in (0, 1):
                _assertIsNumber(value[position], name + "[%d]" % position)

        # Without variable_attrs the betas are always considered constant.
        formatter = _ValueConstPairFormatter(variable_attrs,
                                             lambda _value: True,
                                             checkPairOfNumbers)
        super().__init__(attr_readers, "betas", _GroupGetter(), formatter)

    def __call__(self, params):
        # The parent returns {"betas": ((beta1, beta2), is_const)}
        formatted = super().__call__(params)["betas"]
        assert formatted and isinstance(formatted,
                                        tuple) and len(formatted) == 2
        assert isinstance(formatted[0], tuple) and len(
            formatted[0]) == 2, ("'betas' group attribute must be a pair")
        (first, second), is_const = formatted
        return {"beta1": (first, is_const), "beta2": (second, is_const)}


================================================
FILE: python/_options_config.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import os
import poptorch


def parseAndSetOptions(options, filepath):
    """Apply the settings listed in a configuration file to ``options``.

    Every line of the file which is neither empty nor a comment (starting
    with ``#``) is prefixed with ``options.`` and executed as Python code
    with ``options`` and ``poptorch`` in scope.

    NOTE(review): the content of the file is passed to ``exec``: only load
    configuration files from a trusted source.

    Args:
        options: the options object to modify.
        filepath: path of the configuration file.

    Raises:
        poptorch.ConfigFileError: if a line is not valid Python syntax.
    """
    prefix = "options."
    cmds = []
    with open(filepath) as f:
        filename = os.path.basename(f.name)
        for line in f:
            # Remove whitespace
            stripped = line.strip()
            # Skip empty lines and comments
            if not stripped or stripped.startswith("#"):
                # Keep an empty placeholder so that the line numbers of the
                # generated code (used in the error message below) match the
                # line numbers of the configuration file.
                cmds.append("")
                continue
            cmds.append(prefix + stripped)

    code = "\n".join(cmds)
    try:
        # pylint: disable=exec-used
        exec(code, {}, {"poptorch": poptorch, "options": options})
    except SyntaxError as err:
        err_class = err.__class__.__name__
        detail = err.args[0]
        lineno = err.lineno
        line = err.text
        # pylint: disable=no-member
        raise poptorch.ConfigFileError("{} at line {} of {}: {}\n> {}".format(
            err_class, lineno, filename, detail, line)) from err


================================================
FILE: python/_options_impl.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import abc
import copy
import logging
import datetime
import torch
import tqdm

from ._logging import logger

# Module level alias for the "begin_ipu_block" operator registered under
# torch.ops.poptorch.
_begin_ipu_block = torch.ops.poptorch.begin_ipu_block

# Disable tqdm locks: this might cause some visual artifacts
# in the console but this will prevent dead locks in multiprocessing
# applications.
# https://github.com/tqdm/tqdm/issues/461#issuecomment-334343230
tqdm.tqdm.get_lock().locks = []


class ProgressBar:
    """Callback displaying a tqdm progress bar for the graph compilation.

    The object is called with the current progress and the total: the bar is
    created on the first call and closed once ``progress == total``.
    ``compilation_time`` holds the time elapsed between the creation of the
    bar and the last call (``None`` until the first call).
    """

    def __init__(self):
        self.compilation_time = None
        self._start_time = None
        self._bar = None
        self._last = 0

    def __call__(self, progress: int, total: int):
        if self._bar is None:
            self._start_time = datetime.datetime.now()
            # A new bar starts from zero: forget the progress reached by a
            # previous compilation, otherwise the first update of a reused
            # ProgressBar would be computed against the old total.
            self._last = 0
            # Remove {rate_fmt}{postfix} from the default format
            # as it doesn't really make sense for a compilation process
            #
            # Note: this is *not* a f-string
            bar_format = "{l_bar}{bar}| {n_fmt}/{total_fmt} "
            bar_format += "[{elapsed}<{remaining}]"
            self._bar = tqdm.tqdm(desc="Graph compilation",
                                  total=total,
                                  bar_format=bar_format)
        # tqdm expects increments, not absolute values.
        self._bar.update(progress - self._last)
        self._last = progress
        self.compilation_time = datetime.datetime.now() - self._start_time
        if progress == total:
            self._bar.close()
            self._bar = None


class OptionsDict:
    """Safe dictionary to store options: only keys which have been passed to
    the constructor can later be updated.

    Public options are stored in ``self._values`` and are reachable as
    attributes through ``__getattr__`` / ``__setattr__``. Private attributes
    (names starting with ``_``) are stored in the instance ``__dict__`` and
    must be created before ``_values`` is set.

    Once frozen any attempt to modify the options raises an
    ``AttributeError``.
    """

    def __init__(self, **default_values):
        # Keep a dictionary of warnings messages based on the parameter options:
        # these are printed when the dictionaries are consolidated. The use of a
        # dictionary allows a warning to be removed by the key, e.g. if there is
        # a warning that the default parameter has changed but the parameter is
        # specified explicitly.
        self._warnings = {}

        # Allow warnings to be disabled by adding them to the list
        self._warnings_disabled = set()

        # Option object will be frozen after first use.
        self._is_frozen = False

        # _values must be the last attribute set in the __init__
        # (__setattr__ rejects new private attributes once _values exists)
        self._values = default_values

    def set(self, **kwargs):
        """Update existing options.

        Each option must already exist and the new value must be an instance
        of the type of the current value (both checked with ``assert``).

        Raises:
            AttributeError: if the options are frozen.
        """
        self.checkIsFrozen()
        for option, value in kwargs.items():
            assert self.exists(option), ("Invalid option %s, valid options"
                                         " are %s") % (option,
                                                       self._values.keys())
            assert isinstance(
                value, type(self._values[option])
            ), "Unexpected type %s for option %s. Expected %s" % (
                type(value), option, type(self._values[option]))
            self._values[option] = value

    def createOrSet(self, **kwargs):
        """Update the options which already exist (with the same checks as
        ``set``) and create the other ones.

        Raises:
            AttributeError: if the options are frozen.
        """
        self.checkIsFrozen()
        for option, value in kwargs.items():
            if option in self._values:
                self.set(**{option: value})
            else:
                self._values[option] = value

    def exists(self, option):
        """Return True if ``option`` is one of the stored public options."""
        return option in self._values

    def deleteIfExists(self, option):
        """Remove ``option`` if it exists.

        Note: the frozen state is not checked by this method.
        """
        if self.exists(option):
            del self._values[option]

    def _hasattr(self, option):
        """Replacement for ``hasattr`` which doesn't go through
        ``__getattr__``.

        Private names are looked up in the instance ``__dict__``, the other
        ones in ``_values``.
        """
        if option == "__class__":
            return True
        if option.startswith("_"):
            return option in self.__getstate__().keys()
        return self.exists(option)

    # pylint: disable=protected-access
    def _changeFreezeState(self, new_state):
        """Set the frozen flag of this object and of every object stored as
        one of its attributes which has a ``_is_frozen`` attribute.

        Only the direct attributes are updated here (no recursion).
        """
        self._is_frozen = new_state
        for _, value in self.__dict__.items():
            if isinstance(value, OptionsDict):
                # Use _hasattr: hasattr() would go through __getattr__
                if value._hasattr('_is_frozen'):
                    value._is_frozen = new_state
            else:
                if hasattr(value, '_is_frozen'):
                    value._is_frozen = new_state

    def _freeze(self):
        """Mark the options as frozen."""
        self._changeFreezeState(True)

    def _unfreeze(self):
        """Allow the options to be modified again."""
        self._changeFreezeState(False)

    def checkIsFrozen(self, option=None):
        """Raise an ``AttributeError`` if the options are frozen.

        The ``_is_frozen`` attribute itself can always be modified.
        """
        # Skip check during object initialization.
        if self._hasattr('_is_frozen'):
            if option != '_is_frozen' and self._is_frozen:
                raise AttributeError("Can't modify frozen Options")

    def __deepcopy__(self, memory):
        """Return a deep copy of the options: the copy is never frozen."""
        opts_class = self.__class__
        # Bypass __init__: the attributes are copied one by one below.
        copied_options = opts_class.__new__(opts_class)
        memory[id(self)] = copied_options
        for key, val in self.__dict__.items():
            if key == '_is_frozen':
                val = False
            setattr(copied_options, key, copy.deepcopy(val, memory))
        return copied_options

    def __getstate__(self):
        """Return the instance ``__dict__`` (also used for private attribute
        lookups)."""
        return self.__dict__

    def __setstate__(self, state):
        """Restore the attributes from ``state``."""
        self.__dict__.update(state)

    def __setattr__(self, option, value):
        """Set a private attribute or update a public option.

        Raises:
            AttributeError: if the options are frozen or if a new private
                attribute is created after ``_values``.
        """
        # Private attributes are allowed, but should be set in the __init__ before _values
        # public ones must be declared in default_values.

        self.checkIsFrozen(option)
        if option.startswith("_"):
            # Option cannot be defined after _values definition.
            if self._hasattr('_values') and not self._hasattr(option):
                raise AttributeError(
                    f"Invalid private attribute {option}. "
                    f"Valid attributes: {list(self.__dict__.keys())}")
            super().__setattr__(option, value)
        else:
            self.set(**{option: value})

    def __getattr__(self, option):
        """Return the value of ``option``.

        Python only calls ``__getattr__`` when the regular attribute lookup
        failed.

        Raises:
            AttributeError: if ``option`` is neither a private attribute nor
                a public option.
        """
        if not self._hasattr(option):
            raise AttributeError(f"Invalid attribute {option}.")
        if option.startswith("_"):
            return self.__getstate__()[option]
        return self._values[option]

    def update(self, other):
        """Merge the options into the dictionary ``other`` and return it.

        The stored warnings which are not in ``_warnings_disabled`` are
        logged first. ``other`` must not have any key in common with the
        options (checked with ``assert``).
        """
        for warning in self._warnings.values():
            if warning not in self._warnings_disabled:
                logger.warning(warning)

        assert not set(self._values.keys()).intersection(
            other), "Can't merge dictionaries, they have some keys in common"
        other.update(self._values)
        return other

    def toDict(self):
        """Return the options as a new dictionary (the warnings are logged,
        see ``update``)."""
        return self.update({})

    def __call__(self, option):
        """Return the value of ``option`` (same as attribute access)."""
        return getattr(self, option)

    def __repr__(self):
        """List the public options followed by the private attributes."""
        # Call __repr__ on v so that strings display with quotes.
        return (f"{type(self).__name__}(" +
                ", ".join(f"{k}={v.__repr__()}"
                          for k, v in self._values.items()) + ", " +
                ", ".join(f"{k}={v.__repr__()}"
                          for k, v in self.__dict__.items()
                          if k != "_values") + ")")


# Path fragments of the PopTorch / PyTorch installations.
# NOTE(review): presumably the default list of locations to ignore when
# attributing operations to a line of user code - confirm against the users
# of this list.
default_source_location_excludes = [
    "install/poptorch", "site-packages/torch", "site-packages/poptorch"
]


class IStageManager(abc.ABC):
    """Base class used to start IPU blocks.

    It hands out automatic block ids, asks the subclass (through
    ``getStage()``) which stage a block belongs to and buffers the debug
    messages generated along the way so that they can be printed once.
    """

    def __init__(self):
        self._next_auto_id = 0
        self._current_ipu = None
        # We expect Torch to trace the graph 3 times, so to avoid printing
        # the same messages several times we store all the messages and
        # print the first third of them at the end.
        self._debug_messages = []

    def clearDebug(self):
        """Discard all the buffered debug messages."""
        self._debug_messages = []

    def _debug(self, *args):
        """Buffer a debug message.

        ``args`` are the arguments which will eventually be passed to
        ``logger.debug()``: a format string followed by its values.
        Nothing is stored if the DEBUG level is not enabled.
        """
        if logger.isEnabledFor(logging.DEBUG):
            self._debug_messages.append(args)

    def printDebug(self):
        """Send the buffered debug messages to the logger.

        If the graph appears to have been traced 3 times then only the
        first third of the messages is printed, otherwise they all are.
        """
        messages = self._debug_messages
        n = len(messages)
        # We assume the graph was traced 3 times if:
        # - Number of messages can be divided by 3
        # - The first message is identical to the n/3th and 2n/3th ones.
        is_triple_trace = (n > 0 and n % 3 == 0 and
                           messages[0] == messages[n // 3] ==
                           messages[2 * n // 3])
        # Not sure what happened if it's not a triple trace: in doubt print
        # everything.
        to_print = messages[:n // 3] if is_triple_trace else messages
        for message in to_print:
            # Each message is the tuple of arguments given to _debug():
            # it must be unpacked for the format string to be applied.
            logger.debug(*message)

    def nextAutoId(self):
        """Return the next automatic block id (as a string) and increment
        the counter.
        """
        auto_id = self._next_auto_id
        self._next_auto_id += 1
        return str(auto_id)

    @abc.abstractmethod
    def getStage(self, block_id):
        """Return the stage corresponding to the given block_id.
        """

    def beginStage(self, user_id, ipu_id_from_block):
        """Start the IPU block for the stage associated with ``user_id``.

        Args:
            user_id: id of the block. If it is falsy an automatic id is
                generated using ``nextAutoId()``.
            ipu_id_from_block: IPU requested by the block. Only used if
                the stage doesn't specify an IPU itself. If neither of
                them specifies one, the stage id is used as IPU id.
        """
        user_id = user_id or self.nextAutoId()
        self._current_ipu = ipu_id_from_block
        stage = self.getStage(user_id)
        # If the user specified an ipu_id in the option use that one
        ipu = stage._ipu if stage._ipu is not None else ipu_id_from_block  # pylint: disable=protected-access
        if ipu is None:
            self._debug(
                "No IPU specified for block %s: default to stage_id %d",
                user_id, stage._stage_id)  # pylint: disable=protected-access
            ipu = stage._stage_id  # pylint: disable=protected-access
        self._debug("Starting block id=%s stage=%d phase=%d ipu=%d", user_id,
                    stage._stage_id, stage._phase_id, ipu)  # pylint: disable=protected-access
        _begin_ipu_block(stage._stage_id, stage._phase_id, ipu)  # pylint: disable=protected-access

    def resetAutoId(self):
        """Restart the automatic block ids from 0."""
        self._next_auto_id = 0


================================================
FILE: python/_poplar_executor.py
================================================
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
import collections
import copy
from datetime import timedelta
import functools
import itertools
import os
import pickle
from typing import Callable, Dict, List, Optional
from types import MethodType
import weakref
import warnings
import torch

# Do not import any poptorch.* here: it will break the poptorch module
from . import _impl
from . import _utils
from . import _args_parser
from . import _optimizer_attributes
from . import enums
from . import _printing
from . import optim
from . import profiling
from . import poptorch_core  # type: ignore
from . import _poptorch_data
from ._utils import accessAttributes, flattenTensorStructure, reconstructTensorStructure, isOnIpu
from ._logging import logger
from .options import Options, PipelinedExecution, ShardedExecution
from .optim import Optimizer
NO_EXECUTABLE_ERR = "Model has not been compiled or has been destroyed."


# Hacky way to make sure tensors end up on the IPU rather than the CPU by default.
# Note: this is only needed for backward compatibility with tracing but we will
# eventually stop supporting this approach so make sure a warning is printed.
class _SetDefaultDeviceType:
    """Temporarily patch some ``torch`` functions so that ``device="ipu"``
    is used (and a warning printed) when the caller did not pass a
    ``device`` keyword argument.

    ``replace()`` installs the patches and ``restore()`` puts the original
    functions back. The default arguments validation of
    ``torch.distributions`` is disabled in between.
    """

    def __init__(self):
        # name -> original torch function
        self.overrides = {}
        self.saved_distribution_validate_args = None

    def replace(self):
        """Install the wrappers in the ``torch`` module."""

        def wrap_factory(fn):
            # Always force the device if it's not explicitly specified.
            @functools.wraps(fn)
            def _wrapper(*args, **kwargs):
                if "device" not in kwargs:
                    logger.warning(
                        "No device set in torch.%s(): forcing to IPU",
                        fn.__name__)
                    kwargs["device"] = "ipu"
                return fn(*args, **kwargs)

            return _wrapper

        def wrap_overloaded(fn):
            # Only force the device if none of the arguments is a tensor.
            @functools.wraps(fn)
            def _wrapper(*args, **kwargs):
                has_tensor = any(
                    isinstance(a, torch.Tensor)
                    for a in itertools.chain(args, kwargs.values()))
                if not has_tensor and "device" not in kwargs:
                    logger.warning(
                        "No device set in torch.%s(): forcing to IPU",
                        fn.__name__)
                    kwargs["device"] = "ipu"
                return fn(*args, **kwargs)

            return _wrapper

        # All the ops with FACTORY_PARAMS in <torch>/tools/pyi/gen_pyi.py
        factory_names = ("arange", "empty", "full", "full_like", "linspace",
                         "logspace", "ones", "rand", "randint", "randn",
                         "randperm", "range", "tensor", "zeros", "zeros_like")
        # overloaded ops that take a device for some overloads
        overloaded_names = ["normal"]

        for names, make_wrapper in ((factory_names, wrap_factory),
                                    (overloaded_names, wrap_overloaded)):
            for name in names:
                original = getattr(torch, name)
                self.overrides[name] = original
                setattr(torch, name, make_wrapper(original))

        # Arguments validation forces the tensors to be compared onto the IPU
        # then the result is sent back to the CPU.
        # For example:
        # >>> if self._validate_args:
        # >>>    assert torch.lt(self.low, self.high).all()
        # pylint: disable=protected-access
        distribution = torch.distributions.Distribution
        self.saved_distribution_validate_args = distribution._validate_args
        distribution.set_default_validate_args(False)

    def restore(self):
        """Undo ``replace()``."""
        # Restore the real Torch functions
        for name in self.overrides:
            setattr(torch, name, self.overrides[name])

        torch.distributions.Distribution.set_default_validate_args(
            self.saved_distribution_validate_args)


class _OverwriteContextManager:
    """Context manager applying a set of substitution managers.

    One instance of every registered manager type is created on
    construction. ``replace()`` is called on each of them on entry and
    ``restore()`` is called, in reverse order, on exit.
    """

    _subsitution_manager_types = [_SetDefaultDeviceType]

    def __init__(self):
        managers = []
        for manager_type in \
                _OverwriteContextManager._subsitution_manager_types:
            managers.append(manager_type())
        self.substitution_managers = managers

    def __enter__(self):
        for manager in self.substitution_managers:
            manager.replace()
        return self

    def __exit__(self, exc_type, value, traceback):
        # Undo the substitutions, last installed first.
        for manager in self.substitution_managers[::-1]:
            manager.restore()

    @classmethod
    def registerSubsitutionManager(cls, type):
        """Add ``type`` to the substitution manager types instantiated by
        the context managers created from now on (no-op if it is already
        registered).
        """
        registered = cls._subsitution_manager_types
        if type in registered:
            return
        registered.append(type)


# pylint: disable=too-many-public-methods
class PoplarExecutor:
    """ This class should not be created directly but is a wrapper around
    the model that was passed into `inferenceModel` or `trainingModel`.
    It only has a few methods which can be used to interface with the IPU.
    """

    _precompile_hooks: Dict[int, Callable] = collections.OrderedDict()
    _postcompile_hooks: Dict[int, Callable] = collections.OrderedDict()

    # pylint: disable=too-many-statements
    def __init__(self,
                 model: 'torch.nn.Module',
                 options: Optional['poptorch.Options'],
                 training: bool,
                 poptorch_version: str,
                 optimizer: Optional['torch.optim.Optimizer'] = None,
                 user_model: Optional['torch.nn.Module'] = None):
        """Create the wrapper around ``model``.

        Args:
            model: the module which gets called (it may itself wrap the
                user's model, for example to add a loss).
            options: the options to use. They are frozen and a clone is
                stored. If falsy, default ``Options()`` are used.
            training: True to create a training model, False for inference.
            poptorch_version: stored as ``_poptorch_version``.
            optimizer: only valid for training. If no optimizer is provided
                for a training model, ``optim.SGD`` with ``lr=0.01`` is used.
            user_model: the module which gets transformed (defaults to
                ``model``).

        Raises:
            A PopTorch error (``_impl.createPoptorchError``) if ``options``
            is not a ``poptorch.Options``, or if gradient accumulation is
            not 1 for an inference model.
        """
        if options:
            if not isinstance(options, Options):
                raise _impl.createPoptorchError(
                    "Invalid type: 'options' is of "
                    f"type {type(options)} (Expected poptorch.Options)")
            # Prevent the user from modifying these options.
            options._freeze()
            options = options.clone()
        else:
            options = Options()

        # NB model is the one which gets called, which may have its own wrapping
        # such as to have a loss. user_model is the one which is transformed.

        self._user_model = user_model or model

        if training:
            self._attribute_tracker = \
                    _optimizer_attributes.OptimizerAttrTracker(
                        options)
            if options.defaultOutputMode():
                # In training it makes sense to see only the last result, by
                # default.
                options.outputMode(enums.OutputMode.Final)
            if not optimizer:
                optimizer = optim.SGD(self._user_model.parameters(), lr=0.01)
        else:
            if options.defaultOutputMode():
                # In inference it makes sense to see all the results, by default.
                options.outputMode(enums.OutputMode.All)

            if options.Training.gradient_accumulation != 1:
                err_msg = (
                    "You must set "
                    "poptorch.Options().Training.gradientAccumulation to 1 "
                    "or leave it as its default value (1) when running a "
                    "poptorch.inferenceModel().")

                # Only add the pipelining hint if the execution strategy is
                # a PipelinedExecution but not a ShardedExecution.
                is_pipelined = (isinstance(options._execution_strategy,
                                           PipelinedExecution)
                                and not isinstance(options._execution_strategy,
                                                   ShardedExecution))
                if is_pipelined:
                    err_msg += (" Use poptorch.Options().deviceIterations "
                                "to process a sufficient number of batches "
                                "each run for pipelined execution instead.")

                raise _impl.createPoptorchError(err_msg)

            # Always true at this point: the check above raises otherwise.
            assert options.Training.gradient_accumulation == 1, ()
            assert not optimizer, "Optimizer should be None for inference"
        self._model = model

        self._host_weights_version = 0
        self._poptorch_version = poptorch_version

        self._executable = None
        self._outputs_structure = None
        self._options = options
        # The args parser needs to be initialised before the model gets wrapped
        # otherwise we will not be able to retrieve the real arguments list
        self._args_parser = _args_parser.ArgsParser(model)
        # Inputs used to compile the executable
        self._executable_inputs = None
        self._anchor_memory = {}

        # any anchors with unspecified output mode should receive the output
        # mode used for graph outputs
        # (anchor[1] is the flag checked, anchor[2] receives the output mode
        # and anchor[3] the return period when the mode is EveryN.)
        for _, anchor in options.anchored_tensors.items():
            if anchor[1]:
                anchor[2] = options.output_mode
                if anchor[2] == enums.OutputMode.EveryN:
                    anchor[3] = options.output_return_period

        self._optimizer = optimizer
        self._ipu_optimizer_is_dirty = False
        self._host_rng_state_is_dirty = False
        self._cached_rng_state = None
        if self._options.exists("random_seed"):
            self._cached_rng_state = [self._options.random_seed]

        self._dict_optimizer = {}
        self.per_replica_params = {}
        self._training = training
        self._dirty_host_weights = False
        self._trace = None
        self._is_attached = False
        # NOTE(review): the expression below reads `self.training` (not
        # `self._training`); `training` is not set by this method so it must
        # be resolved elsewhere (property or __getattr__) - confirm it
        # returns the expected value.
        self._profiling = profiling.Channel(
            "poptorch.trainingModel" if self.
            training else "poptorch.inferenceModel")
        self._profiling.instrument(self, "copyWeightsToHost",
                                   "copyWeightsToDevice", "setOptimizer",
                                   "compile", "destroy")

        if optimizer:
            self.setOptimizer(optimizer)
        self._options._freeze()

        if self._training:
            # We don't want the pytorch model to keep the PopTorch one
            # alive so only keep a weak reference.
            parent = weakref.ref(self)

            # Subclass of the user model's class: it is installed as the
            # class of the user model further down.
            class PoptorchModel(type(self._user_model)):
                def copyWeightsToHostIfNeeded(self):
                    """ Return True if the weights on the host were dirty and
                    have been updated.
                    Return False if the weights were already up to date.
                    """
                    p = parent()
                    if p is not None:
                        return p.copyWeightsToHostIfNeeded()
                    return False

                def destroy(self):
                    """Destroy the model: release the IPUs and the executable.
                    """
                    p = parent()
                    if p is not None:
                        p.destroy()

                def __getattribute__(self, name):
                    # The version is owned by the PopTorch wrapper (None if
                    # the wrapper doesn't exist any more).
                    if name == "_host_weights_version":
                        p = parent()
                        if p is None:
                            return None
                        return p._host_weights_version
                    # Make sure the host weights are up to date before
                    # these attributes are accessed.
                    if name in ("_buffers", "_parameters", "forward"):
                        self.copyWeightsToHostIfNeeded()
                    return object.__getattribute__(self, name)

                def __getattr__(self, name):
                    attribute = super().__getattr__(name)
                    if isinstance(attribute, torch.nn.parameter.Parameter):
                        self.copyWeightsToHostIfNeeded()
                    return attribute

                def state_dict(self,
                               *args,
                               destination=None,
                               prefix="",
                               keep_vars=False):
                    """Return a shallow copy of the wrapped model's state dictionary.

                    Note: all the elements in the state dictionary are
                    unwrapped which means the state can be reloaded in an
                    environment where PopTorch is not installed.
                    """
                    out = collections.OrderedDict()
                    out_cache = {}

                    # NOTE(review): destination, prefix and keep_vars are
                    # forwarded positionally to the parent's state_dict();
                    # confirm this matches what the installed
                    # torch.nn.Module.state_dict() expects.
                    for k, v in super().state_dict(*args, destination, prefix,
                                                   keep_vars).items():
                        v_id = id(v)

                        # If the value occurs more than once, avoid multiple
                        # copies.
                        if v_id in out_cache:
                            out[k] = out_cache[v_id]
                        else:
                            # If the object is wrapped then the shallow copy will
                            # call _impl._pickleUnwrapObject and the new object will be in
                            # the wrapped registry.
                            # Unwrap the object if needed.
                            v_copy = _impl.unwrapIfWrapped(copy.copy(v))
                            out[k] = v_copy
                            out_cache[v_id] = v_copy

                    return out

            _utils.assert_signatures_match(PoptorchModel.state_dict,
                                           torch.nn.Module.state_dict)

            # The mere existence of the "__torch_function__" results in a
            # "__getattribute__" call and hence weight copying if required.
            # "check_has_torch_function" and "handle_torch_function_getter"
            # in the PyTorch source code may explain this.
            # Without this, the weights will not be copied in certain
            # situations such as torch.equal(a, b).
            class PoptorchParameter(torch.nn.Parameter):
                def __getattribute__(self, name):
                    p = parent()
                    if p is not None:
                        p.copyWeightsToHostIfNeeded()

                    return object.__getattribute__(self, name)

                @classmethod
                def __torch_function__(cls, func, types, args=(), kwargs=None):
                    if kwargs is None:
                        kwargs = {}
                    return super().__torch_function__(func, types, args,
                                                      kwargs)

            self.PoptorchParameter = PoptorchParameter

            # Same as PoptorchParameter but for the buffers.
            class PoptorchBuffer(torch.Tensor):
                def __getattribute__(self, name):
                    p = parent()
                    if p is not None:
                        p.copyWeightsToHostIfNeeded()

                    return super().__getattribute__(name)

                @classmethod
                def __torch_function__(cls, func, types, args=(), kwargs=None):
                    if kwargs is None:
                        kwargs = {}
                    return super().__torch_function__(func, types, args,
                                                      kwargs)

            self.PoptorchBuffer = PoptorchBuffer
            # Change the class of the user model's parameters and buffers
            # to the wrapper classes defined above.
            self._install_state_hooks()

            # __getattr__ and __getattribute__ are attributes, not methods,
            # unfortunately we cannot just replace them in the model object: we
            # have to create a wrapper class
            # and change the object's class.
            PoptorchModel.__name__ = "Poptorch%s" % type(
                self._user_model).__name__
            self._user_model.__class__ = PoptorchModel

            # Register the wrapper types so that custom functions to
            # copy / serialize wrapped objects are set up.
            _impl.registerWrapperType(PoptorchModel)
            _impl.registerWrapperType(PoptorchParameter)
            _impl.registerWrapperType(PoptorchBuffer)

    def _install_state_hooks(self):
        """Change the class of every parameter of the user model to
        ``self.PoptorchParameter`` and the class of every buffer to
        ``self.PoptorchBuffer``.

        Raises:
            A PopTorch error if the class of a buffer is neither
            ``torch.Tensor`` nor ``self.PoptorchBuffer``.
        """
        for param in self._user_model.parameters():
            param.__class__ = self.PoptorchParameter

        for buf in self._user_model.buffers():
            if buf.__class__ not in (torch.Tensor, self.PoptorchBuffer):
                raise _impl.createPoptorchError(
                    "All buffers must be an instance of torch.Tensor "
                    f"(Got {type(buf)})")
            buf.__class__ = self.PoptorchBuffer

    def _update_optimizer_if_needed(self):
        """Send ``_dict_optimizer`` to the executable if it is marked as
        dirty, then clear the dirty flag.

        Raises:
            A PopTorch error if the model is not compiled.
        """
        if not self.isCompiled():
            raise _impl.createPoptorchError(NO_EXECUTABLE_ERR)
        if not self._ipu_optimizer_is_dirty:
            # Nothing changed since the last upload.
            return
        poptorch_core.updateOptimizers(self._executable, self._dict_optimizer)
        self._ipu_optimizer_is_dirty = False

    def _read_optim_state_dict_if_needed(self):
        """Read the optimizer state back from the executable if the host
        copy is marked as dirty.

        Nothing is done if the optimizer is not a PopTorch ``Optimizer``.

        Raises:
            A PopTorch error if the state needs to be read but the model is
            not compiled.
        """
        if not isinstance(self._optimizer, Optimizer):
            return

        if not self._optimizer.host_state_is_dirty:
            logger.debug("Using cached optimiser state dict")
            return

        if not self.isCompiled():
            raise _impl.createPoptorchError(NO_EXECUTABLE_ERR)
        assert not self._ipu_optimizer_is_dirty, (
            "Both host "
            "and ipu states cannot be dirty at the same time.")

        # We need to return both the internal state dict and torch's
        # state dict so that LR schedulers work
        ipu_state = poptorch_core.readOptimizerState(self._executable)
        torch_state = torch.optim.Optimizer.state_dict(self._optimizer)
        self._optimizer.set_state_dict({**ipu_state, **torch_state})
        # Don't trigger a copy to IPU as we've just synced.
        self._optimizer.ipu_state_is_dirty = False

    def _on_device_attach(self):
        """Method called every time we attach to a device.

        Uploads the weights, the optimizer parameters, the optimizer state
        (if there is one to restore) and the cached RNG state (if any).
        """
        # Upload the weights to the IPU
        self.copyWeightsToDevice()

        if self._optimizer:
            # Upload the optimizer parameters
            self._update_optimizer_if_needed()
            if isinstance(self._optimizer, Optimizer):
                if not self._optimizer.has_state():
                    # No state on the host: the one on the IPU is the
                    # reference.
                    self._optimizer.host_state_is_dirty = True
                    self._optimizer.ipu_state_is_dirty = False
                else:
                    # If the optimiser has state to be written (from a
                    # checkpoint), write it immediately after compilation
                    self._optimizer.ipu_state_is_dirty = True
                    self._write_optim_state_dict_if_needed()

        if self._cached_rng_state is not None:
            self._copyRngStateToDevice()

    def _get_optim_state_dict(self):
        """Return the state dictionary of the (PopTorch) optimizer after
        reading the state back from the device if needed.
        """
        assert isinstance(self._optimizer, Optimizer)
        # Must happen before get_state_dict() so that the host state is
        # up to date.
        self._read_optim_state_dict_if_needed()
        return self._optimizer.get_state_dict()

    def _write_optim_state_dict_if_needed(self):
        """Write the optimizer state to the device if the optimizer is a
        PopTorch ``Optimizer`` whose IPU state is marked as dirty.

        Raises:
            A PopTorch error if the model is not compiled.
        """
        if not self.isCompiled():
            raise _impl.createPoptorchError(NO_EXECUTABLE_ERR)

        needs_write = (isinstance(self._optimizer, Optimizer)
                       and self._optimizer.ipu_state_is_dirty)
        if not needs_write:
            return

        assert not self._optimizer.host_state_is_dirty, (
            "Both host "
            "and ipu states cannot be dirty at the same time.")
        # If the new optimiser already has state (i.e. from a checkpoint),
        # write it to device
        if self._optimizer.has_state():
            # Sync the weights to host first because writeOptimizerState() is
            # going to write both the weights and the optimizer state
            self.copyWeightsToHostIfNeeded()

            state = self._optimizer.state_dict()
            poptorch_core.writeOptimizerState(self._executable, state)
        self._optimizer.ipu_state_is_dirty = False

    def load_state_dict(self,
                        state_dict: Dict[str, 'torch.Tensor'],
                        strict: bool = True):
        """Will call ``load_state_dict()`` on the wrapped model
        and automatically synchronise the weights with the IPU
        (if currently attached to a device).

        Returns:
            ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
                * **missing_keys** is a list of str containing the missing keys
                * **unexpected_keys** is a list of str containing the
                    unexpected keys
        """
        result = self._user_model.load_state_dict(state_dict, strict)
        if not self.isAttachedToDevice():
            return result

        logger.debug("load_state_dict: implicit copyWeightsToDevice()")
        self.copyWeightsToDevice()
        return result

    def __repr__(self):
        """Return the representation of the wrapped user model as generated
        by ``_printing.module_repr()``.
        """
        # We've created our own repr function to provide info on BeginBlock
        return _printing.module_repr(self._user_model)

    def __getattr__(self, attr):
        """Forward the attributes which are not found on the wrapper to the
        wrapped user model.

        Parameters are returned inside a small proxy which adds a
        ``replicaGrouping()`` method (recording the settings in
        ``per_replica_params``) and forwards everything else to the
        parameter.

        Raises:
            AttributeError: if the wrapped model is not available (yet) or
                if it doesn't have an attribute named ``attr``.
        """
        # __getattr__ is only called when the regular lookup failed. If the
        # wrapped model itself is missing (for example on a partially
        # constructed instance, as created when copying / unpickling) then
        # reading `self._user_model` would call __getattr__ again and
        # recurse until a RecursionError is raised: report a regular
        # AttributeError instead.
        try:
            user_model = self.__dict__["_user_model"]
        except KeyError:
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute "
                f"'{attr}'") from None

        model_attr = getattr(user_model, attr)
        if not isinstance(model_attr, torch.nn.Parameter):
            return model_attr

        # We apply this wrapper here rather than adding it to PoptorchParameter
        # for two reasons:
        # 1) We might supply the same model to multiple PopTorch wrappers
        #    (particularly we might supply it to trainingModel() and then
        #     to inferenceModel()), and we need to be able to distinguish
        #     between replicaGrouping() calls on each wrapper.
        # 2) We don't wrap inference parameters in PoptorchParameter normally,
        #    but we might want to use replicaGrouping() with inference models.
        #    If we do start doing PoptorchParameter wraps on inference models,
        #    we'd end up pointlessly copying weights back from the device.
        model = self

        class ReplicaGroupingWrapper:
            def replicaGrouping(
                    self, comm_group_type: enums.CommGroupType, shards: int,
                    variable_retrieval_mode: enums.VariableRetrievalMode):
                model.per_replica_params[attr] = (comm_group_type, shards,
                                                  variable_retrieval_mode)

            def __getattr__(self, name):
                # replicaGrouping is found by the regular lookup so it
                # never reaches this method: everything which does belongs
                # to the parameter.
                return getattr(model_attr, name)

        return ReplicaGroupingWrapper()

    @property
    def model(self) -> 'torch.nn.Module':
        """Access the wrapped Torch model (stored as ``_user_model``)."""
        return self._user_model

    @property
    def options(self) -> 'poptorch.Options':
        """Access to the options used by this wrapper.

        .. seealso:: :py:class:`~poptorch.Options`"""
        return self._options

    def _debugGetPopartIR(self) -> str:
        """Debug helper: return what ``poptorch_core._getPopartIR()``
        reports for the current executable.
        """
        return poptorch_core._getPopartIR(self._executable)  # pylint: disable=protected-access

    def getTensorNames(self) -> List[str]:
        """Returns a list of all tensor names within the computational
        graph. Model must be compiled in advance.
        """
        assert self._executable is not None, ("Model must be compiled "
                                              "before calling getTensorNames")

        # pylint: disable=protected-access
        return list(poptorch_core._getTensorNames(self._executable))

    def getAnchoredTensor(self, short_name: str) -> torch.Tensor:
        """Return the tensor stored in the anchor memory under
        ``short_name`` (which must be present).
        """
        anchors = self._anchor_memory
        assert short_name in anchors, ("No tensor with name " + short_name +
                                       " found.")
        return anchors[short_name]

    def copyWeightsToHostIfNeeded(self) -> bool:
        """ Return True if the weights on the host were dirty and
        have been updated.
        Return False if the weights were already up to date.
        """
        if not self._dirty_host_weights:
            return False

        logger.debug("Implicit copyWeightsToHost()")
        self.copyWeightsToHost()
        return True

    # Copy weights from the device into the memory of the model given on wrapper creation.
    def copyWeightsToHost(self) -> None:
        """ Updates the parameters used in `model` with the weights stored on device.
        (The weights in ``model.parameters()``)

        The named buffers of the model are updated too and the host weights
        version is incremented.
        """
        if not self.isCompiled():
            raise _impl.createPoptorchError(NO_EXECUTABLE_ERR)

        # Don't trigger another copyToHost by accessing `named_parameters`
        self._dirty_host_weights = False

        # Parameters first, then the buffers.
        weights = dict(self._model.named_parameters())
        weights.update(self._model.named_buffers())

        names = tuple(weights)
        tensors = tuple(weights.values())
        poptorch_core.copyWeightsToHost_impl(self._executable, names, tensors)
        self._host_weights_version += 1

    # Write from host memory to IPU memory. This is done automatically on
    # compilation so should be rarely used.
    def copyWeightsToDevice(self) -> None:
        """Copies the weights from ``model.parameters()`` to the IPU device.
        Implicitly called on first call.

        The named buffers of the model are copied too.
        """
        if not self.isCompiled():
            raise _impl.createPoptorchError(NO_EXECUTABLE_ERR)

        # Don't trigger a copyToHost by accessing `named_parameters`
        self._dirty_host_weights = False

        # Trigger a IPU sync -> host if needed for
        # the optimizer state.
        if self._optimizer:
            self._optimizer.state_dict()

        # Parameters first, then the buffers.
        weights = dict(self._model.named_parameters())
        weights.update(self._model.named_buffers())

        names = tuple(weights)
        tensors = tuple(weights.values())
        poptorch_core.copyWeightsToDevice_impl(self._executable, names,
                                               tensors)

    def copyNamedBuffersToDevice(self) -> None:
        """Copies the buffers listed in the ``updatableNamedBuffers`` option
        from ``model.named_buffers()`` to the IPU device.
        """
        if not self.isCompiled():
            raise _impl.createPoptorchError(NO_EXECUTABLE_ERR)

        # pylint: disable=protected-access
        popart_options = self._options._Popart.options
        if 'updatableNamedBuffers' not in popart_options:
            raise _impl.createPoptorchError(
                "No named buffers marked as updatable via "
                "updatableNamedBuffers option")

        # Don't trigger a copyToHost by accessing `named_parameters`
        self._dirty_host_weights = False

        # Trigger a IPU sync -> host if needed for
        # the optimizer state.
        if self._optimizer:
            self._optimizer.state_dict()

        host_buffers = dict(self._model.named_buffers())
        # pylint: disable=protected-access
        names = self._options._Popart.options['updatableNamedBuffers']
        tensors = tuple(host_buffers[name] for name in names)

        poptorch_core.copyNamedBuffersToDevice_impl(self._executable,
                                                    tuple(names), tensors)

    def setOptimizer(self, optimizer: 'torch.optim.Optimizer'):
        """Sets the optimiser for a training model. Will overwrite the
        previous one. Supported optimisers: ``optim.SGD``, ``optim.Adam``,
        ``optim.AdamW``, ``optim.RMSProp``, ``optim.LAMB``.

        If the wrapper is attached to a device, the optimizer parameters
        (and state, if needed) are uploaded immediately; otherwise they are
        only marked as dirty.
        """
        # Optimiser state functions require a compiled executable
        if self.isCompiled() and optimizer != self._optimizer:
            # If we're setting a new optimiser, make sure the internal state of the old
            # optimiser has been read back so it's not lost, and then detach the old
            # optimiser so that its subsequent state_dict/load_state_dict calls don't
            # trigger optimiser state read/writes anymore
            if self._optimizer and isinstance(self._optimizer, Optimizer):
                self._read_optim_state_dict_if_needed()
                self._optimizer.state_dict = \
                self._optimizer.get_state_dict
            # We only want to update the state on the IPU if it's a brand new optimizer
            # (Not if the params of the existing one have changed).
            if isinstance(optimizer, Optimizer):
                optimizer.ipu_state_is_dirty = True

        # If it's a PopTorch optimizer: instrument the state_dict() method
        # to implicitly transfer the state back to the host.
        if isinstance(optimizer, Optimizer):
            optimizer.state_dict = MethodType(
                PoplarExecutor._get_optim_state_dict, self)

        self._optimizer = optimizer
        dict_optimizer = _optimizer_attributes.convertOptimizerToDict(
            optimizer, self._attribute_tracker, self._options,
            self.isCompiled())

        # Only mark the IPU optimizer as dirty if the converted attributes
        # actually changed.
        if dict_optimizer != self._dict_optimizer:
            self._dict_optimizer = dict_optimizer
            self._ipu_optimizer_is_dirty = True

        # If we need and can update the optimizer now: do it.
        if self.isAttachedToDevice():
            self._update_optimizer_if_needed()
            self._write_optim_state_dict_if_needed()

    def _get_module_and_name(self, n):
        """
        Split the nested attribute path `n` into `(module, name)` where
        `module` is the object (relative to `self._model`) which holds the
        attribute called `name`.

        The returned pair can be passed directly to `getattr` / `setattr`
        using the argument splat `*a` operator, i.e.:

        ```
        getattr(*self._get_module_and_name("some_module.layer_one.weight"))
        ```

        gets the attribute `self._model.some_module.layer_one.weight`.
        """
        root = self._model
        owner_path, separator, leaf = n.rpartition(".")
        if separator != ".":
            # No dot in the path: the attribute belongs to the model itself.
            return root, n
        return root.get_submodule(owner_path), leaf

    @_impl.destroyDispatcherOnExit
    def _compileWithDispatch(self, in_tensors, executable_filename=None):
        """Trace the wrapped model through the dispatcher and build an
        executable from the captured graph.

        The model's parameters and buffers are moved to the ``ipu:0`` device
        for the duration of the trace, the model is run once on the inputs,
        and the original CPU parameters and buffers are re-installed in the
        ``finally`` block whether or not tracing succeeded.

        :param in_tensors: Parsed model inputs; its ``args`` and ``kwargs``
            attributes are flattened, moved to the IPU device and passed to
            the model.
        :param executable_filename: If not ``None``, the captured graph is
            handed to ``processDispatchAndImportExecutable`` together with
            this filename instead of being compiled with
            ``compileWithManualTracing``.
        :returns: The executable object returned by ``poptorch_core``.
        :raises: A PopTorch error if a per-replica parameter is a scalar or
            if any input tensor has ``requires_grad`` set.
        """
        with _OverwriteContextManager():
            module_namescope = None
            if self.options._module_namescope_enabled:  # pylint: disable=protected-access
                module_namescope = _impl.NameScopeHook(self._model)

            tensor_args = flattenTensorStructure(
                (in_tensors.args, in_tensors.kwargs))
            mlir_compiler_options = poptorch_core.CompilerOptions()
            mlir_compiler_options.source_location_excludes = self._options._source_location_excludes  # pylint: disable=line-too-long, protected-access

            # Forwarded to endDispatch() so the backend knows whether the
            # trace completed.
            dispatch_failed = False
            try:  # pylint: disable=too-many-nested-blocks
                # Create the graph. Future captured calls will be written into this
                # graph behind the scenes.
                poptorch_core.createGraph(
                    poptorch_core.TracingMode(
                        poptorch_core.TracingMode.PopART), tensor_args,
                    mlir_compiler_options)

                # Move the model parameters to the ipu and take a copy to load the
                # originals back once this has finished
                cpu_params = dict(self._model.named_parameters())
                cpu_buffers = dict(self._model.named_buffers())
                cpu_state = self._model.state_dict(keep_vars=True)
                # We need to remove the PoptorchBuffer and PoptorchParam annotations
                # before compiling the model. In addition, we must unwrap the whole
                # model to prevent IPU to CPU copies when accessing the state_dict.
                _impl.unwrapModelIfNecessary(self._model)
                if self.per_replica_params is not None:
                    for name, param in cpu_params.items():
                        if name in self.per_replica_params:
                            if param.shape == torch.Size([]):
                                raise _impl.createPoptorchError(
                                    "Scalars cannot be passed as per-replica "
                                    "weight tensor values")
                            # Trace with the first slice along dimension 0
                            # only; the full tensor is passed to
                            # setPerReplica() further down.
                            param_tensor = param.narrow(0, 0, 1).squeeze(dim=0)
                            setattr(
                                *self._get_module_and_name(name),
                                torch.nn.Parameter(
                                    param_tensor,
                                    requires_grad=param.requires_grad))
                d = torch.device("ipu:0")
                poptorch_core.startParametersMove()
                self._model.to(d)
                poptorch_core.endParametersMove()

                # If there were any parameters and buffers (tensors), which were
                # aliases on the CPU (shared the same Python ID), these will have
                # become separate IPU tensors during the copy to IPU
                #
                # Find all such tensors, and then
                # 1. Keep a map from them to the earliest cpu tensor in the
                #    cpu_state dict.
                # 2. Replace IPU tensors which are not but should be aliases with
                #    that matching the earliest.
                # NB the "original" name is based on order of addition of the
                # tensors/modules and may not be a name of the parameter which
                # replaced another, e.g. the case of "weight tying", but the
                # name of the "replaced". However, no names will be lost but the
                # aliases simply harmonised to be matching tensors on CPU and IPU.
                state = self._model.state_dict(keep_vars=True)
                tensors = collections.defaultdict(list)
                for name, tensor in cpu_state.items():
                    tensors[id(tensor)].append(name)
                # A map of parameters and buffers (tensors) on the CPU which share
                # the same python id, to the earliest tensor.
                cpu_aliases = {}

                aliases = [v for v in tensors.values() if len(v) > 1]
                for a in aliases:
                    # NB original matches that in model.named_x() as both this and
                    # model.state_dict() loop over the same OrderedDicts in the
                    # same order and the named versions return only the first
                    # instances
                    original = a[0]

                    for other in a[1:]:
                        setattr(*self._get_module_and_name(other),
                                state[original])
                        cpu_aliases[other] = original

                # Map named unique parameters and buffers on the IPU.
                params = dict(self._model.named_parameters())

                poptorch_core.mapParamsToNames(tuple(params.keys()),
                                               tuple(params.values()))

                buffers = dict(self._model.named_buffers())

                poptorch_core.mapParamsToNames(tuple(buffers.keys()),
                                               tuple(buffers.values()))

                # Snapshot taken before the model runs; compared against
                # new_addresses in the finally block.
                old_addresses = _impl.getBufferAndParameterAddresses(
                    self._model)

                if self.per_replica_params is not None:
                    for name, param in cpu_params.items():
                        if name in self.per_replica_params:
                            poptorch_core.setPerReplica(
                                name, param, *self.per_replica_params[name])

                poptorch_core.startDispatch()
                _impl.setDispatchTracing(True)
                _impl.setIpuContext(True)

                for _, hook in PoplarExecutor._precompile_hooks.items():
                    hook()

                self._options._execution_strategy.onStartTracing()  # pylint: disable=protected-access

                # The optimizer was created using the CPU model, therefore it points
                # at CPU tensors.  We need to remap those to IPU tensors.
                # We just moved '_model' to the IPU, therefore we need to join the
                # two maps and then remap the parameters from the optimizer.
                # From:
                #
                # cpu_tensors[name] = cpu_data_ptr
                # ipu_tensors[name] = ipu_tensor
                #
                # we build:
                #
                # cpu_to_ipu[cpu_data_ptr] = ipu_tensor
                #
                # And then remap all the tensors from group["params"]
                if self._training:
                    cpu_tensors = {
                        **cpu_buffers,
                        **cpu_params,
                    }
                    ipu_tensors = _impl.getBufferAndParameterTensors(
                        self._model)
                    cpu_to_ipu = {
                        cpu_tensors[n].data_ptr(): ipu
                        for n, ipu in ipu_tensors.items()
                    }
                    for index, group in enumerate(
                            self._optimizer.param_groups):
                        torch.ops.poptorch.optimizer_group(
                            index, [
                                cpu_to_ipu[cpu.data_ptr()]
                                for cpu in group["params"]
                            ])

                for idx, t in enumerate(tensor_args):
                    if t.requires_grad:
                        raise _impl.createPoptorchError(
                            "An input tensor to an IPU model can not have "
                            f"requires_grad set to True, however input {idx} "
                            f"does: {t}\nYou can set requires_grad=True from "
                            "within the model as an alternative, and return "
                            "gradients as outputs to your model, if required.")

                d = torch.device("ipu:0")
                # Move all the inputs to the IPU
                tensor_args = [t.to(d) for t in tensor_args]
                # Re-inject moved tensors in args and kwargs:
                args, kwargs = reconstructTensorStructure(
                    (in_tensors.args, in_tensors.kwargs), tensor_args)

                result = self._model(*args, **kwargs)
                if result is not None:
                    # Remember the output structure so that __call__ can
                    # rebuild it from the flat list of outputs.
                    self._outputs_structure = result
                    output = flattenTensorStructure(result)

                    for x in output:
                        if not isOnIpu(x):
                            warnings.warn(
                                "Output expected to be on the IPU but is on %s"
                                % x.device.type)

                    # Outputs on the IPU are narrowed: long -> int and
                    # double -> float.
                    output = [
                        out.int()
                        if out.dtype == torch.long and isOnIpu(out) else out
                        for out in output
                    ]
                    output = [
                        out.float()
                        if out.dtype == torch.double and isOnIpu(out) else out
                        for out in output
                    ]
                    poptorch_core.startOutputsMove()
                    output = [out.cpu() for out in output]
                    poptorch_core.endOutputsMove()

                poptorch_core.finalizeGraph()
            except:
                dispatch_failed = True
                raise
            finally:
                self._options._execution_strategy.onEndTracing()  # pylint: disable=protected-access

                for _, hook in PoplarExecutor._postcompile_hooks.items():
                    hook()

                _impl.setIpuContext(False)
                _impl.setDispatchTracing(False)
                # Turn off the dispatcher.
                poptorch_core.endDispatch(dispatch_failed)

                # NOTE(review): cpu_params, cpu_buffers, cpu_aliases and
                # old_addresses are only bound inside the try block above; if
                # an earlier statement raised (for example createGraph), the
                # code below would raise NameError while handling it.

                # Reload the cpu model state
                # Get the buffer and parameter addresses after the model has ran
                # but before resetting the model back to the cpu
                new_addresses = _impl.getBufferAndParameterAddresses(
                    self._model)

                def _set_param(k, v):
                    setattr(*self._get_module_and_name(k), cpu_params[v])

                for k in cpu_params:
                    cpu_params[k].__class__ = torch.nn.Parameter
                    _set_param(k, k)

                # Restore aliased parameters/buffers which will not be represented
                # in cpu_params or cpu_buffers
                for k, v in cpu_aliases.items():
                    _set_param(k, v)

                for k in cpu_buffers:
                    setattr(*self._get_module_and_name(k), cpu_buffers[k])

                # Re-install the Poptorch annotations for buffers and parameters
                _impl.rewrapModelIfNecessary(self._model)

                # Check that the buffer and parameter addresses haven't been changed
                # in the model
                # Note: this is done after resetting the model back to the cpu so
                # that errors thrown by this don't stop the model being in a valid
                # state
                _impl.errorOnBufferOrParameterAddressChanges(
                    old_addresses, new_addresses)

                if module_namescope is not None:
                    module_namescope.remove()

            # We only reach this point if dispatch didn't fail
            if executable_filename is not None:
                # Import the executable for the captured graph from
                # executable_filename.
                executable = poptorch_core.processDispatchAndImportExecutable(
                    self._options.toDict(), accessAttributes, self._training,
                    self._dict_optimizer,
                    list(self._options.anchored_tensors.values()),
                    executable_filename)
            else:
                # Compile the captured graph using PopART.
                executable = poptorch_core.compileWithManualTracing(
                    self._options.toDict(), accessAttributes, self._training,
                    self._dict_optimizer,
                    list(self._options.anchored_tensors.values()))

        return executable

    @_impl.traceMethod("modelCompilation")
    def _compile(self, in_tensors):
        """Trace and compile the model, then load it onto the device.

        Compilation is kept separate from device initialisation so that, on
        a POD, a single process can compile (and thereby populate the
        executable cache) while ``loadEngineAndConnectStreams()`` still runs
        in every process at the same time, as the processes need to talk to
        each other during initialisation.

        The steps are:

        1. Pre-process the inputs with ``_preprocessGraph()``.
        2. Inside ``_impl.distributedCacheLock``, compile only if the lock
           reports that this process should do so.
        3. Compile in every process which still has no executable.
        4. Unless the connection type is ``ConnectionType.Never``, load the
           engine and connect the streams.
        5. Fire ``_on_device_attach()`` if the device ended up attached.

        :param in_tensors: The parsed inputs used to trace the model.
        """
        trace_view = self._preprocessGraph(in_tensors)

        # Note: in single process execution or if the cache is disabled
        # the lock always yields False.
        with _impl.distributedCacheLock(self._model,
                                        self._options) as is_first_process:
            if is_first_process:
                # Only the first process compiles while holding the lock.
                self._executable = self._compileWithDispatch(trace_view)

        # In distributed execution only the first process has an executable
        # at this point: every other process now compiles too. No sync point
        # is needed because they are all expected to hit the cache, so this
        # should be quick. If the cache is disabled, compilation is expected
        # to take roughly the same time in all processes.
        if not self.isCompiled():
            self._executable = self._compileWithDispatch(trace_view)

        # Note: if multiple processes run on the same host, it's recommended
        # to enable executable caching to avoid out of memory issues due
        # to concurrent compilation processes.
        device_is_offline = (
            self._options.connection_type == enums.ConnectionType.Never)
        if not device_is_offline:
            # Must happen in all the processes.
            poptorch_core.loadEngineAndConnectStreams(self._executable)

        self._is_attached = self.isAttachedToDevice()

        # The attach (if any) happened in the C++ backend, either during
        # compileWithDispatch (connection type Always) or during
        # loadEngineAndConnectStreams (OnDemand), and not through
        # PoplarExecutor.attachToDevice(): trigger the callback manually.
        if self._is_attached:
            self._on_device_attach()

    @_impl.traceMethod("graphPreprocessing")
    def _preprocessGraph(self, in_tensors):
        """Record the executable inputs and build the view used for tracing.

        A clone of ``in_tensors`` is stored in ``self._executable_inputs``.
        A second clone is passed through ``self._narrow_tensor`` and then
        through a helper which detaches tensors requiring gradients.

        :param in_tensors: The parsed model inputs.
        :returns: The processed clone to trace the model with.
        """

        def remove_requires_grad(tensor):
            # Leave non-tensors and tensors without gradients untouched.
            if isinstance(tensor, torch.Tensor) and tensor.requires_grad:
                tensor = tensor.detach()
                logger.warning(
                    "Input tensor has requires_grad=True set. "
                    "This tensor will be detached because backward pass via "
                    "inputs is not supported.")
            return tensor

        self._executable_inputs = in_tensors.clone()
        trace_view = in_tensors.clone()
        for transform in (self._narrow_tensor, remove_requires_grad):
            trace_view.forEach(transform)
        return trace_view

    def compile(self, *args, **kwargs) -> None:
        """Trace and compile the wrapped model unless an executable already
        exists.

        Takes the same arguments as the wrapped PyTorch `model.__call__`.
        If an executable has already been created, the call is ignored and
        a warning is logged.

        Note: The executable created by this method can only be executed,
        it cannot be exported to file.
        To precompile and save to file use
        :py:meth:`~poptorch.PoplarExecutor.compileAndExport`
        """
        # The arguments are parsed even if the call ends up being ignored.
        in_tensors = self._args_parser(args, kwargs, False)
        if self._executable is None:
            self._compile(in_tensors)
            return
        logger.warning(
            "Call to compile() ignored: the executable is already compiled")

    @_impl.traceMethod("loadExecutable")
    def loadExecutable(self, filename: str) -> None:
        """Load an executable previously generated using
        :py:meth:`~poptorch.PoplarExecutor.compileAndExport`

        :param filename: Path of the file to load the executable from.
        :raises: A PopTorch error if the metadata stored in the file fails
            to parse.
        """
        metadata_blob = poptorch_core.importPoptorchMetadataFromFile(filename)

        try:
            metadata = _poptorch_data.parse(metadata_blob,
                                            self._poptorch_version)
        except AssertionError as e:
            # Report parsing failures as PopTorch errors naming the file.
            message = "Invalid file %s: %s" % (filename, e)
            raise _impl.createPoptorchError(message)

        # Re-trace the model using the inputs recorded in the file, then
        # import the executable from that same file.
        trace_view = self._preprocessGraph(metadata.executable_inputs)
        self._executable = self._compileWithDispatch(
            trace_view, executable_filename=filename)

        self._is_attached = self.isAttachedToDevice()
        if self._is_attached:
            self._on_device_attach()

    def save(self,
             filename: str,
             export_model: bool = True,
             save_rng_state: bool = True):
        """Save the compiled model to file.

        :param filename: Where to save the compiled executable. Missing
            parent directories are created. If `filename` is an existing
            directory, the model is saved as ``model.poptorch`` inside it.
        :param export_model: If `True` the Torch model will be saved in
            the file alongside the executable. :py:func:`~poptorch.load` can
            be used to restore both the original Torch model, the PopTorch
            model and the executable.
            If `False` then only the executable will be exported and it will
            be the user's responsibility to call
            :py:func:`~poptorch.inferenceModel` or
            :py:func:`~poptorch.trainingModel` to re-create the PopTorch model
            before calling :py:meth:`~poptorch.PoplarExecutor.loadExecutable`
            to restore the executable.
        :param save_rng_state: If `True` the random number generator's state
            and seed will be saved in the file alongside the executable.
        :raises: A PopTorch error if the model has not been compiled yet.
        :raises AssertionError: If the parent of `filename` exists but is
            not a directory.
        """
        if not self.isCompiled():
            raise _impl.createPoptorchError(NO_EXECUTABLE_ERR)
        dst_dir = os.path.dirname(filename)
        if dst_dir:
            if os.path.exists(dst_dir):
                # The message must be an f-string, otherwise the offending
                # path is never shown to the user.
                assert os.path.isdir(dst_dir), (
                    f"Destination folder {dst_dir} is not a directory")
            else:
                # exist_ok: do not fail if the directory appears between the
                # existence check above and this call.
                os.makedirs(dst_dir, exist_ok=True)
        if os.path.isdir(filename):
            dirname = filename
            filename = os.path.join(dirname, "model.poptorch")
            logger.warning("save(): %s is a directory, saving model to %s",
                           dirname, filename)

        data = _poptorch_data.PoptorchData(self._poptorch_version,
                                           self._executable_inputs,
                                           self._options)
        if export_model:
            data.training = self._training
            data.model = self.model
            data.optimizer = self._optimizer

        if save_rng_state:
            data.rng_state = self.rng_state

        serialized_data = pickle.dumps(data, protocol=4)

        # The executable is written first, then the metadata is appended to
        # the same file.
        with self._profiling.tracepoint("saveExecutableToFile"):
            poptorch_core.saveExecutableToFile(self._executable, filename)
            poptorch_core.appendPoptorchMetadataToFile(serialized_data,
                                                       filename)

    @property
    def rng_state(self) -> List[int]:
        """The random number generator's seed followed by its state, for
        the compiled model.

        The value is fetched from the executable only when the cached copy
        on the host is marked as out of date.

        :raises: A PopTorch error if the model has not been compiled yet.
        """
        if not self.isCompiled():
            raise _impl.createPoptorchError(NO_EXECUTABLE_ERR)
        if not self._host_rng_state_is_dirty:
            # The cached copy is still valid.
            return self._cached_rng_state
        self._host_rng_state_is_dirty = False
        seed = poptorch_core.getRandomSeed(self._executable)
        state = poptorch_core.getRngState(self._executable)
        # The seed is stored as the first element, followed by the state.
        self._cached_rng_state = [seed] + state
        return self._cached_rng_state

    @rng_state.setter
    def rng_state(self, state: List[int]):
        """Set the random number generator's seed & state for the compiled
        model.

        A copy of ``state`` is cached on the host; it is also written to
        the device straight away if the device is currently attached.

        :raises: A PopTorch error if the model has not been compiled yet.
        """
        if not self.isCompiled():
            raise _impl.createPoptorchError(NO_EXECUTABLE_ERR)
        # The host copy becomes the reference value.
        self._host_rng_state_is_dirty = False
        self._cached_rng_state = state.copy()
        if not self.isAttachedToDevice():
            return
        self._copyRngStateToDevice()

    def _copyRngStateToDevice(self):
        """Send the cached random number generator seed and state to the
        executable."""
        cached = self._cached_rng_state
        # The first element is the seed, the remainder is the state.
        seed = cached[0]
        state = cached[1:]
        poptorch_core.setRngState(self._executable, seed, state)

    @_impl.traceMethod("compileAndExport")
    def compileAndExport(self,
                         filename: str,
                         *args: List['torch.Tensor'],
                         export_model: bool = True,
                         save_rng_state: bool = True,
                         **kwargs: Dict[str, 'torch.Tensor']):
        """Precompile an executable and save it to file.

        ``args`` and ``kwargs`` are the same arguments as the wrapped PyTorch
        ``model.__call__``

        :param filename: Where to save the compiled executable.
        :param export_model: If `True` the Torch model will be saved in
            the file alongside the executable. :py:func:`~poptorch.load` can
            be used to restore both the original Torch model, the PopTorch
            model and the executable.
            If `False` then only the executable will be exported and it will
            be the user's responsibility to call
            :py:func:`~poptorch.inferenceModel` or
            :py:func:`~poptorch.trainingModel` to re-create the PopTorch model
            before calling :py:meth:`~poptorch.PoplarExecutor.loadExecutable`
            to restore the executable.
        :param save_rng_state: If `True` (the default) the random number
            generator's state and seed will be saved in the file alongside
            the executable. Forwarded to
            :py:meth:`~poptorch.PoplarExecutor.save`.
        """
        self.compile(*args, **kwargs)
        # Both options are forwarded by keyword so they cannot be mixed up.
        self.save(filename,
                  export_model=export_model,
                  save_rng_state=save_rng_state)

    def cycleCount(self) -> int:
        """ Returns number of cycles which the IPU ran.

            You must run the model on IPU hardware before calling this method.

            :returns: number of cycles on the IPU for the last model run. If
              you are using replicas, the returned value represents the
              number of cycles for the first replica only.
            :raises: A PopTorch error if cycle count logging is disabled or
              if the model has not been compiled yet."""

        # pylint: disable=protected-access
        cycle_counter_enabled = self._options._Popart.options[
            'instrumentWithHardwareCycleCounter']
        if not cycle_counter_enabled:
            raise _impl.createPoptorchError(
                "Cycle count logging is disabled. Please set option "
                "logCycleCount to True to enable.")

        if not self.isCompiled():
            raise _impl.createPoptorchError(
                "Please run the model at least once before obtaining "
                "cycle count.")

        return poptorch_core.cycleCount(self._executable)

    def compilationTime(self) -> timedelta:
        """ Returns the total model compilation time.

            :returns: An object of type datetime.timedelta representing
                the compilation time, as recorded by the progress bar.
            :raises: A PopTorch error if the model has not been compiled or
                if the showCompilationProgressBar option is not set.

            .. note:: You must compile the model before calling this method
                also showCompilationProgressBar option must be set to True.
            """
        # pylint: disable=protected-access

        if not self.isCompiled():
            raise _impl.createPoptorchError(
                "Please compile the model before obtaining "
                "compilation time.")

        options = self._options
        if not options._show_compilation_progress_bar:
            raise _impl.createPoptorchError(
                "Please set showCompilationProgressBar option "
                "to obtain compilation time.")

        return options._progress_bar.compilation_time

    def __call__(self, *args: List['torch.Tensor'],
                 **kwargs: Dict[str, 'torch.Tensor']):
        """
        Takes the same arguments as the wrapped PyTorch `model.__call__`.

        .. note:: The first time the :py:class:`~poptorch.PoplarExecutor`
            wrapper is called, the wrapped model will be traced and compiled.

        :returns: If the traced model returned something other than `None`,
            the outputs rebuilt into the structure recorded during tracing.
            Otherwise `None` when there are no outputs, the single output
            when there is exactly one, or the list of outputs.
        :raises AssertionError: If the connection type is
            ``ConnectionType.Never``.
        """
        assert self._options.connection_type != enums.ConnectionType.Never, (
            "Trying to run a model on an offline device "
            "(ConnectionType.Never): use model.compile(inputs) instead of"
            " model(inputs)")

        # If it is compiled we take the fast path, if not we convert lists to tuples.
        in_tensors = self._args_parser(args, kwargs, self.isCompiled())

        if not self.isCompiled():
            self._compile(in_tensors)

        if not self._is_attached:
            self.attachToDevice()
        if not self._training:
            # If this is an inference model: check if the same model is not being
            # trained on a different IPU.
            # If it is: make sure the weights are updated.

            # The hook is looked up dynamically: the user model only has it
            # if something installed it.
            copyWeightsToHostIfNeeded = getattr(self._user_model,
                                                "copyWeightsToHostIfNeeded",
                                                None)
            if callable(copyWeightsToHostIfNeeded):
                copyWeightsToHostIfNeeded()
                if self._host_weights_version != \
                        self._user_model._host_weights_version:
                    # Weights have now been updated on the Host: copy them to
                    # the second IPU.
                    logger.debug("Implicit copyWeightsToDevice()")
                    self.copyWeightsToDevice()
                    self._host_weights_version = \
                            self._user_model._host_weights_version

        # Check the inputs against the ones the executable was built with
        # before flattening them for execution.
        self._executable_inputs.validateInputs(in_tensors)
        in_tensors_flat = in_tensors.asPackedFlatTuple(self._executable_inputs)

        # Update the optimizer state on the IPU if needed.
        self._write_optim_state_dict_if_needed()
        # Execute the poplar executable with the full size (batch * device iterations)
        with self._profiling.tracepoint("modelExecution"):
            output = poptorch_core.execute(self._executable, in_tensors_flat)

        # Any anchored tensors will be returned at the end of the list
        # Pop them out and populate the anchor memory
        long_names = list(self._options.anchored_tensors.values())
        # Popping takes from the end of the outputs, hence the reversed order.
        for long_name in reversed(long_names):
            tensor = output.pop()
            keys = [
                key for key, value in self._options.anchored_tensors.items()
                if value == long_name
            ]
            for key in keys:
                self._anchor_memory[key] = tensor

        # The model ran on the device: the cached host copy of the RNG state
        # (and of the weights, when training) is now out of date.
        self._host_rng_state_is_dirty = True
        if self._training:
            self._dirty_host_weights = True

        if self._optimizer and isinstance(self._optimizer, Optimizer):
            # The optimizer has been used on the IPU: its state on the host
            # is now out of date.
            self._optimizer.host_state_is_dirty = True

        # Provide a useful error message if the user attempts to call
        # backward() on an output tensor
        self._assign_backward_error(output)

        if self._outputs_structure is not None:
            # Only return the IPU tensors
            return reconstructTensorStructure(self._outputs_structure, output,
                                              isOnIpu)
        if len(output) == 0:
            return None
        if len(output) > 1:
            return output
        return output[0]

    def _assign_backward_error(self, input):
        def error_on_backward():
            raise _impl.createPoptorchError(
                "backward() cannot be called explicitly on "
                "outputs of a PopTorch model. If you're using a trainingModel, "
                "the backwards pass is performed automatically when invoking "
                "the model. If you're using an inferenceModel, you should use "
                "a trainingModel instead.")

        if isinstance(input, (list, tuple)):
            for element in input:
                self._assign_backward_error(element)
        elif isinstance(input, torch.Tensor):
            input.backward = error_on_backward

    def getPerfCounters(self):
        """Return performance counters for the last execution of the model.

        Return the values (in fractional seconds) of the performance counters
        corresponding to the latest run of the model. The reference point of
        the returned value is undefined, however the difference between values
        is valid.

        The returned object is a dictionary where the keys correspond to each
        of the following events:
        * 'input': the IPU requesting an input tensor
        * 'input_complete': an input tensor having been transferred
        * 'output': the IPU requesting to transmit an output tensor
        * 'output_complete': an output tensor having been transferred

        The values of the dictionary are nested lists. The first level of
        nesting corresponds to an input or output index. The second level list
        contains the actual values as fractional seconds. An event without
        any timestamp, or any event when the model is not compiled, maps to
        `[[]]`.

        Examples:
        * dict['input'][1][3]: performance counter for the second input
        tensor being requested on the fourth iteration of the model
        * dict['output_complete'][0][0]: performance counter the first
        output tensor having been transferred on the first iteration of
        the model
        """
        # The position of each name matches the index of its timestamps in
        # the sequence returned by poptorch_core.getTimestamps().
        event_names = ('input', 'input_complete', 'output', 'output_complete')

        if not self.isCompiled():
            return {event: [[]] for event in event_names}

        timestamps = poptorch_core.getTimestamps(self._executable)
        counters = {}
        for idx, event in enumerate(event_names):
            series = timestamps[idx]
            # Keep the "list of lists" shape even when nothing was recorded.
            counters[event] = series if len(series) != 0 else [[]]
        return counters

    def _computeLatency(self, from_event: str,
                        from_reduce: Callable[[List[float]], float],
                        to_event: str,
                        to_reduce: Callable[[List[float]], float]):
        """Compute the latency statistics between two performance counters.

        For each step, the timestamps recorded under ``from_event`` are
        collapsed into a single start time using ``from_reduce`` and the
        timestamps recorded under ``to_event`` are collapsed into a single
        end time using ``to_reduce``.

        :param from_event: Key for starting performance counter.
        :param from_reduce: Reduction function for starting counters.
        :param to_event: Key for ending performance counter.
        :param to_reduce: Reduction function for ending counters.
        :returns: A ``(min, max, average)`` tuple of the durations, or
            ``(0., 0., 0.)`` if there is no end timestamp.

        .. seealso:: :py:meth:`~poptorch.PoplarExecutor.getPerfCounters` for
            the list of keys allowed.
        """

        def reduce_per_step(series, reduce_fn):
            # ``series`` contains one list of timestamps per entry: reduce
            # the values of all the entries for every step of the first one.
            return [
                reduce_fn([timestamps[step] for timestamps in series])
                for step in range(len(series[0]))
            ]

        counters = self.getPerfCounters()
        start_times = reduce_per_step(counters[from_event], from_reduce)
        end_times = reduce_per_step(counters[to_event], to_reduce)

        if not end_times:
            return (0., 0., 0.)

        # It is possible to have more input timestamps than output timestamps
        # due to other options such as gradient accumulation and output modes.
        # Whatever the case, the number of input ticks will always be divisible
        # by the number of output ticks.
        assert len(start_times) % len(end_times) == 0, \
            "Internal PopTorch error: mismatching number of start timestamps" \
            " and ending timestamps when calculating latency"

        # Each output tick corresponds to ``group_size`` consecutive input
        # ticks: the earliest one of the group is used as the start time.
        group_size = len(start_times) // len(end_times)
        group_starts = [
            min(start_times[first:first + group_size])
            for first in range(0, len(start_times), group_size)
        ]

        durations = [
            end - start for start, end in zip(group_starts, end_times)
        ]

        return (min(durations), max(durations),
                sum(durations) / len(durations))

    def getHostIpuLatency(self):
        """Return Host-IPU latency for the last execution of the model.

        This is the time (in fractional seconds) elapsed between the first
        input tensor being requested and the last input tensor being
        transferred to the IPU.

        :returns: A tuple containing the minimum, maximum and average
            latency for the iterations corresponding to the latest
            invocation of the model.
        """
        return self._computeLatency(from_event='input',
                                    from_reduce=min,
                                    to_event='input_complete',
                                    to_reduce=max)

    def getComputeLatency(self):
        """Return compute latency for the last execution of the model.

        This is the time (in fractional seconds) elapsed between the last
        input tensor being transferred to the IPU and the last output tensor
        becoming available.

        :returns: A tuple containing the minimum, maximum and average
            latency for the iterations corresponding to the latest
            invocation of the model.
        """
        return self._computeLatency(from_event='input_complete',
                                    from_reduce=max,
                                    to_event='output',
                                    to_reduce=max)

    def getIpuHostLatency(self):
        """Return IPU-Host latency for the last execution of the model.

        This is the time (in fractional seconds) elapsed between the first
        output tensor becoming available and the last output tensor being
        written back to the host.

        :returns: A tuple containing the minimum, maximum and average
            latency for the iterations corresponding to the latest
            invocation of the model.
        """
        return self._computeLatency(from_event='output',
                                    from_reduce=min,
                                    to_event='output_complete',
                                    to_reduce=max)

    def getLatency(self):
        """Return round-trip latency for the last execution of the model.

        This is the time (in fractional seconds) elapsed between the first
        input tensor being requested and the last output tensor being
        written back to the host.

        :returns: A tuple containing the minimum, maximum and average
            latency for the iterations corresponding to the latest
            invocation of the model.
        """
        return self._computeLatency(from_event='input',
                                    from_reduce=min,
                                    to_event='output_complete',
                                    to_reduce=max)

    def destroy(self) -> None:
        """Destroy the model: release the IPUs and the executable.

        Does nothing if the model is not compiled. For a training model the
        weights and the optimizer state are synced back to the host before
        the executable is deleted, and the user model is unwrapped
        afterwards.

        Raises the error built by ``_impl.createPoptorchError`` if a
        training model turns out not to be wrapped.
        """
        if not self.isCompiled():
            return
        if self._training:
            # The host copies must be refreshed while the executable still
            # exists.
            self.copyWeightsToHostIfNeeded()
            # Sync the optimizer's state dict back to host
            self._optimizer.state_dict()

        # Remove the attribute, then rebind it to None so that isCompiled()
        # (which tests the truthiness of ``_executable``) returns False.
        del self._executable
        self._executable = None

        # Only training models go through the unwrapping step below.
        if not self._training:
            return

        # unwrap the model, parameters and buffers
        if not _impl.isWrapped(self._user_model):
            raise _impl.createPoptorchError("model was never wrapped")
        _impl.unwrapModelIfNecessary(self._user_model)

    def _narrow_tensor(self, tensor):
        """Slice an input down to the "model" batch size.

        There are two concepts of batch size: the "model" batch size and the
        batching done at the popart level. The trace must only "see" the
        model batch size, so the batch dimension is divided by the popart
        batching factor here. When calling execute the full batch is passed
        and popart will partition it up.

        Anything which is not a ``torch.Tensor`` is returned unchanged, and
        so are tensors without any dimension.
        """
        opts = self._options
        input_group_count = opts.replication_factor // opts.input_group_size
        # Input will be in form of [ModelBatchSize * BatchPerStep, ...] so we
        # should slice it up so we compile by the ModelBatchSize alone.
        extra_poplar_batch_dims = (opts.device_iterations *
                                   input_group_count *
                                   opts.Training.gradient_accumulation)

        if not isinstance(tensor, torch.Tensor):
            return tensor

        # Tensors without dimensions are treated as having a batch size of 1
        is_scalar = tensor.shape == torch.Size([])
        b_size = 1 if is_scalar else tensor.shape[0]

        error_template = (
            "Invalid batch dimension: In the input %s, the batch "
            "dimension (%d) must be a multiple of "
            "Options.deviceIterations(%d) * "
            "(Options.replicationFactor(%d) / "
            "Options.inputReplicaGrouping.input_group_size(%d)) * "
            "Options.Training.gradientAccumulation(%d) = %d "
            "because it is used to calculate the batch size which will "
            "be executed on the device in any given iteration. For a "
            "full explanation see the batching semantics page of the "
            "documentation.")
        assert b_size % extra_poplar_batch_dims == 0, error_template % (
            tensor.shape, b_size, opts.device_iterations,
            opts.replication_factor, opts.input_group_size,
            opts.Training.gradient_accumulation, extra_poplar_batch_dims)

        if is_scalar:
            return tensor
        return tensor.narrow(0, 0, b_size // extra_poplar_batch_dims)

    def isAttachedToDevice(self) -> bool:
        """Returns true, if the target device has been attached. False,
        otherwise.

        A model which is not compiled is never considered attached.
        """
        if self.isCompiled():
            return poptorch_core.isAttachedToDevice(self._executable)
        return False

    def isCompiled(self) -> bool:
        """Returns true if the model has been compiled (and not destroyed).
        False, otherwise.

        The model is considered compiled as long as it holds an executable.
        """
        return True if self._executable else False

    def detachFromDevice(self) -> None:
        """Detach from target device. Before calling this function, the device
        must be attached (and the model compiled).

        The RNG state and, for training models, the weights and the optimizer
        state are read back before the device is detached.

        Raises the error built by ``_impl.createPoptorchError`` if the model
        is not compiled or if the device is not attached.
        """
        if not self.isCompiled():
            raise _impl.createPoptorchError(NO_EXECUTABLE_ERR)

        if not self._is_attached:
            raise _impl.createPoptorchError("Device is not attached")

        # Read all the states back before detaching
        # The value of ``rng_state`` is discarded: the attribute is only
        # accessed here (presumably a property refreshing the host copy -
        # confirm against its definition).
        _ = self.rng_state
        if self._training:
            self.copyWeightsToHostIfNeeded()
            self._read_optim_state_dict_if_needed()

        poptorch_core.detachFromDevice(self._executable)
        # Keep the Python side flag in sync with the device state.
        self._is_attached = False

    def attachToDevice(self) -> None:
        """Attach to target device. Before calling this function, the device
        must be detached and the model compiled.

        After attaching, the engine is loaded, the streams are connected and
        the ``_on_device_attach`` callback is run.

        Raises the error built by ``_impl.createPoptorchError`` if the model
        is not compiled or if the device is already attached. An assertion
        fails if the options use ``ConnectionType.Never``.
        """
        if not self.isCompiled():
            raise _impl.createPoptorchError(NO_EXECUTABLE_ERR)
        # Offline targets (ConnectionType.Never) cannot be attached to.
        assert self._options.connection_type != enums.ConnectionType.Never, (
            "Trying to attach to an offline device"
            " (ConnectionType.Never)")

        if self._is_attached:
            raise _impl.createPoptorchError("Device is already attached")

        poptorch_core.attachToDevice(self._executable)
        poptorch_core.loadEngineAndConnectStreams(self._executable)
        # Only flag the device as attached once both native calls returned.
        self._is_attached = True
        self._on_device_attach()


def _registerHook(hooks, new_hook) -> torch.utils.hooks.RemovableHandle:
    """Store ``new_hook`` in ``hooks`` and return the handle to remove it.

    The hook is stored under the id of a freshly created
    :py:class:`torch.utils.hooks.RemovableHandle` bound to ``hooks``.
    """
    removable = torch.utils.hooks.RemovableHandle(hooks)
    hook_id = removable.id
    hooks[hook_id] = new_hook
    return removable


def registerPreCompileHook(hook: Callable
                           ) -> torch.utils.hooks.RemovableHandle:
    """Register a hook that is called before model compilation.

    Raises a ``RuntimeError`` if the hook is not callable.

    :param hook: A callable that is run before model compilation begins.
    :returns: a :py:class:`torch.utils.hooks.RemovableHandle` that can be used
        to remove the hook using :py:func:`~remove`
    """
    if callable(hook):
        # pylint: disable=protected-access
        return _registerHook(PoplarExecutor._precompile_hooks, hook)
    raise RuntimeError("Pre-compile hook must be callable")


def registerPostCompileHook(hook: Callable
                            ) -> torch.utils.hooks.RemovableHandle:
    """Register a hook that is called after model compilation.

    Raises a ``RuntimeError`` if the hook is not callable.

    :param hook: A callable that is run after model compilation ends.
    :returns: a :py:class:`torch.utils.hooks.RemovableHandle` that can be used
        to remove the hook using :py:func:`~remove`
    """
    if callable(hook):
        # pylint: disable=protected-access
        return _registerHook(PoplarExecutor._postcompile_hooks, hook)
    raise RuntimeError("Post-compile hook must be callable")


================================================
FILE: python/_poptorch_data.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import pickle
from typing import Any, List, Optional

# Do not import any poptorch.* here: it will break the poptorch module
from . import enums


class PoptorchData:
    """Metadata to save when exporting an executable in order to be able
    to reload it.

    Note: :py:func:`~poptorch.load` can only be used if all the arguments are
    provided
    :py:meth:`~poptorch.PoplarExecutor.loadExecutable` can be used in either
    case (But only version and executable_inputs will be used)

    Assigning ``optimizer`` also refreshes ``optimizer_state`` with the
    optimizer's ``state_dict()`` (or ``None`` if there is no optimizer).
    """

    def __init__(self,
                 version: str,
                 executable_inputs: List[Any],
                 options: 'poptorch.Options',
                 training: Optional[bool] = None,
                 model: Optional['torch.nn.Module'] = None,
                 optimizer: Optional['torch.optim.Optimizer'] = None,
                 random_seed: Optional[int] = None,
                 rng_state: Optional[List[int]] = None):
        self.version = version
        self.options = options
        self.training = training
        self.model = model
        # Goes through the property setter: also sets ``optimizer_state``
        self.optimizer = optimizer

        assert executable_inputs, "The executable's inputs are missing"
        self.executable_inputs = executable_inputs

        self.random_seed = random_seed
        self.rng_state = rng_state

    @property
    def optimizer(self):
        """The optimizer passed to the constructor or assigned later on."""
        return self._optimizer

    @optimizer.setter
    def optimizer(self, opt):
        self._optimizer = opt
        # Snapshot the state at assignment time
        self.optimizer_state = None if opt is None else opt.state_dict()


def parse(serialized_data: bytes, expected_version: str):
    """Deserialize the :py:class:`~poptorch.PoptorchData` stored in
       ``serialized_data`` and check it against ``expected_version``.

    If the data contains options, they are unfrozen, the ``ipu_version``
    entry is deleted if it exists and a ``ConnectionType.Never`` connection
    type is replaced by ``ConnectionType.Always``.

    :param serialized_data: Pickled metadata.
    :param expected_version: Version the metadata must have been created with.
    :returns: The deserialized metadata object.
    """
    # NOTE: pickle can execute arbitrary code while loading, this must only
    # be used on data coming from a trusted source.
    data = pickle.loads(serialized_data)
    assert data.version == expected_version, (
        "PopTorch version mismatch: "
        f"File was created with version: {data.version}"
        f" and this is version {expected_version}")
    assert data.executable_inputs, ("Executable inputs are missing")

    options = data.options
    if not options:
        return data

    options._unfreeze()  # pylint: disable=protected-access
    # Remove useOfflineIpuTarget related flags if used
    options.deleteIfExists("ipu_version")
    if options.connection_type == enums.ConnectionType.Never.value:
        options.connectionType(enums.ConnectionType.Always)
    return data


================================================
FILE: python/_printing.py
================================================
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
import torch


# Override torch's repr function to also provide information on the pre-hooks.
# The pre-hooks are where BeginBlock is added.
def module_repr(m: torch.nn.Module):
    """
    Provide a string representation of a torch.nn.Module along with the
    corresponding pre-hooks.

    This will show any BeginBlocks that have been added to the model which
    otherwise wouldn't be displayed.

    :param m: The module to represent. Its sub-modules are represented
        recursively, each one prefixed by the repr of its own pre-hooks.
    :returns: The string representation of the module.
    """

    def _add_indent(s_, numSpaces):
        # Indent every line but the first one by ``numSpaces`` spaces.
        # (``numSpaces`` is a count: it must not be inserted verbatim in the
        # separator otherwise the digit itself ends up in the output).
        return ('\n' + ' ' * numSpaces).join(s_.split('\n'))

    # pylint: disable=protected-access

    # We treat the extra repr like the sub-module, one item per line
    extra_repr = m.extra_repr()
    # An empty string would be split into the list [''], hence the check
    extra_lines = extra_repr.split('\n') if extra_repr else []
    child_lines = [
        '(' + key + '): ' + _add_indent(module_repr(module), 2)
        for key, module in m._modules.items()
    ]
    lines = extra_lines + child_lines

    # Each pre-hook is printed in front of the module's name
    pre_hooks = ''.join(
        repr(hook) + ' ' for hook in m._forward_pre_hooks.values())

    main_str = pre_hooks + m._get_name() + '('
    if lines:
        # simple one-liner info, which most builtin Modules will use
        if len(extra_lines) == 1 and not child_lines:
            main_str += extra_lines[0]
        else:
            main_str += '\n  ' + '\n  '.join(lines) + '\n'

    main_str += ')'
    return main_str


# Keep a reference to the builtin before it is shadowed by the function below
_global_print = print


def print(m):
    """
    Prints a torch.nn.Module along with the corresponding pre-hooks.

    This will print any BeginBlocks that have been added to the model which
    otherwise wouldn't be displayed.

    Anything which is not a torch.nn.Module is forwarded unchanged to the
    builtin ``print``.
    """
    if isinstance(m, torch.nn.Module):
        _global_print(module_repr(m))
    else:
        # Only fall back to the builtin for non-modules: modules would
        # otherwise be printed a second time without their pre-hooks.
        _global_print(m)


================================================
FILE: python/_utils.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
import ctypes
import functools
import inspect
import itertools
import json
from typing import List, Generator

import torch

from . import poptorch_core  # type: ignore
from ._logging import logger

# Prefix identifying a string as an attribute id: accessAttributes() strips
# it and parses the remainder as a hexadecimal number.
ATTR_PREFIX = "attr:"


def deprecated(domain, since_version, reason):
    """Build a decorator marking a function as deprecated.

    The decorated function behaves like the original one but logs a warning
    mentioning ``domain``, the function's name, ``since_version`` and
    ``reason`` every time it is called.
    """

    def decorate(target):
        @functools.wraps(target)
        def with_warning(*args, **kwargs):
            logger.warning(
                "%s.%s is deprecated since version %s "
                "and will be removed in a future release.\nReason: %s.",
                domain, target.__name__, since_version, reason)
            return target(*args, **kwargs)

        return with_warning

    return decorate


def assert_signatures_match(poptorch_method, reference_method):
    """Assert that both callables declare exactly the same parameters.

    The parameters are compared as reported by ``inspect.signature``.
    """
    reference_params, poptorch_params = (
        inspect.signature(method).parameters
        for method in (reference_method, poptorch_method))
    assert poptorch_params == reference_params, (
        "Arguments mismatch: expected "
        f"{reference_params} but got {poptorch_params}")


def accessAttributes(attribute_id_str):
    """Return the attributes designated by ``attribute_id_str``.

    Two forms are accepted: a string starting with ``{`` which is parsed as
    JSON, or ``ATTR_PREFIX`` followed by the hexadecimal id of the Python
    object holding the attributes.

    Raises ``ValueError`` if the argument is not a string or uses neither
    form. An empty dictionary is returned if the object is ``None``.
    """
    logger.debug("Accessing attributes with: %s", attribute_id_str)

    if not isinstance(attribute_id_str, str):
        raise ValueError("Wrong type for attribute_id_str")

    # this is to allow creating of attributes from poptorch cpp
    if attribute_id_str.startswith('{'):
        return json.loads(attribute_id_str)

    if not attribute_id_str.startswith(ATTR_PREFIX):
        raise ValueError("Invalid attribute_id_str")

    hex_id = attribute_id_str[len(ATTR_PREFIX):]
    attribute_id = int(hex_id, 16)

    # NB this is undefined behavior if attribute_id does not exist
    attributes = ctypes.cast(attribute_id, ctypes.py_object).value
    logger.debug(str(attributes))

    return {} if attributes is None else attributes


def isOnIpu(x):
    """Return True if the type of the device of ``x`` is ``"ipu"``."""
    device = x.device
    return device.type == "ipu"


# Registry mapping a custom type to the parser handling its instances
custom_arg_parsers = {}


def getCustomParser(custom_type_instance):
    """Return the parser registered for ``custom_type_instance``.

    A parser registered for the exact type of the instance takes precedence,
    otherwise the first registered type the object is an instance of is
    used. ``None`` is returned if no parser matches.
    """
    if not custom_arg_parsers:
        return None

    # direct lookup for exact type inside custom_arg_parsers
    exact_match = custom_arg_parsers.get(type(custom_type_instance))
    if exact_match is not None:
        return exact_match

    # search for registered parser for base class of custom_type_instance,
    # iterate over entire dict
    return next((parser
                 for custom_type, parser in custom_arg_parsers.items()
                 if isinstance(custom_type_instance, custom_type)), None)


# Returns the structure `tensors` as a list of its torch.Tensor contents.
#
# canonical_structure: Optional structure mirroring `tensors`: if provided,
#                      its dictionaries define which keys are visited (and in
#                      which order).
def flattenTensorStructure(tensors, canonical_structure=None):
    def walk(node, canonical):
        parser = getCustomParser(node)
        if parser is not None:
            # Custom types know how to list their own tensors
            yield from parser.yieldTensors(node)
            return
        if isinstance(node, dict):
            key_source = node if canonical is None else canonical
            for key in key_source.keys():
                yield from walk(node[key],
                                None if canonical is None else canonical[key])
            return
        if isinstance(node, (list, tuple)):
            if canonical is None:
                references = itertools.repeat(None, len(node))
            else:
                references = canonical
            for item, reference in zip(node, references):
                yield from walk(item, reference)
            return
        if isinstance(node, torch.Tensor):
            yield node
        # If it's not a dict/list/tuple or tensor, just ignore it

    return list(walk(tensors, canonical_structure))


# Turns a flat `values` into the same structure as `structure`.
#
# Any non-tensor values in `structure` will be copied to the output.
#
# filter_fn: Optional function to additionally filter which tensors make it into
#            the output (eg. could supply `isOnIpu` to only get IPU tensors).
def reconstructTensorStructure(structure, values, filter_fn=lambda t: True):
    # Rebuild the original structure, taking the next value from the passed
    # iterator every time a tensor accepted by the filter is encountered.
    def rebuild(node, source):
        parser = getCustomParser(node)
        if parser is not None:
            return parser.reconstruct(node, source)
        if isinstance(node, dict):
            rebuilt_items = {
                key: rebuild(node[key], source)
                for key in node.keys()
            }
            return type(node)(rebuilt_items)
        if isinstance(node, (tuple, list)):
            rebuilt = (rebuild(element, source) for element in node)
            # Named tuples expect their fields as separate arguments
            if hasattr(node, '_asdict') and hasattr(node, '_fields'):
                return type(node)(*rebuilt)
            return type(node)(rebuilt)
        if isinstance(node, torch.Tensor) and filter_fn(node):
            return next(source)
        return node

    return rebuild(structure, iter(values))


def combine_batch_tensors_gen(tensors: List[List[torch.Tensor]]
                              ) -> Generator[torch.Tensor, None, None]:
    """Concatenate the tensors of several batches along dim = 0.

    ``tensors`` contains one list of tensors per batch. For each position in
    the first batch, the tensors found at that position in all the batches
    are concatenated and yielded.
    """
    tensors_per_batch = len(tensors[0])
    for position in range(tensors_per_batch):
        yield torch.cat([batch[position] for batch in tensors])


def combined_batch_generator(dataloader_iterator,
                             num_batches_to_combine,
                             drop_last=True):
    """Wraps DataLoader iterator. Generates combined batches by concatenating
    consecutive batches tensors from dataloader_iterator along dim=0.

    :param dataloader_iterator: Iterable yielding the batches to combine.
    :param num_batches_to_combine: Number of consecutive batches making up
        one combined batch.
    :param drop_last: If True, trailing batches which are not enough to make
        a full combined batch are discarded. If False they are combined into
        a final, smaller, batch.
    :returns: A generator of combined batches with the same structure as
        the batches of ``dataloader_iterator``. Nothing is generated if
        ``dataloader_iterator`` is empty.
    """
    tensors_to_combine = []
    batch = None
    # iterate over next data batches
    for batch in dataloader_iterator:
        if len(tensors_to_combine) >= num_batches_to_combine:
            # concatenate all tensors from concatenate list - create combined
            # batch (the current batch is only used as a structure template)
            yield reconstructTensorStructure(
                batch, combine_batch_tensors_gen(tensors_to_combine))
            tensors_to_combine = []
        # append batch tensors to concatenate list
        tensors_to_combine.append(flattenTensorStructure(batch))

    # The parentheses matter: without them ``not drop_last`` alone would be
    # enough to emit a batch, even when there is nothing left to combine.
    if tensors_to_combine and (
            len(tensors_to_combine) == num_batches_to_combine
            or not drop_last):
        # concatenate all tensors from concatenate list - create combined batch
        yield reconstructTensorStructure(
            batch, combine_batch_tensors_gen(tensors_to_combine))


def getIpuTensorId(x: torch.Tensor):
    """Return the id ``poptorch_core.getIpuTensorId`` reports for ``x``.

    Thin wrapper around the native function.
    NOTE(review): presumably only meaningful for tensors on the IPU device -
    confirm against the native implementation.
    """
    return poptorch_core.getIpuTensorId(x)


================================================
FILE: python/enums.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import enum


class MeanReductionStrategy(enum.IntEnum):
    """Specify when to divide by a mean reduction factor when
    ``accumulationAndReplicationReductionType`` is set to
    ``ReductionType.Mean``.

    - ``Running``: Keeps the reduction buffer as the current mean. This is
      preferred for numerical stability as the buffer value is never larger than
      the magnitude of the largest micro batch gradient.
    - ``Post``: Divides by the accumulationFactor and replicatedGraphCount after
      all of the gradients have been reduced. In some cases this can be
      faster than using Running, however it is prone to overflow.
    - ``PostAndLoss`` (deprecated): Divides by the replicatedGraphCount before
      the backwards pass, performs the gradient reduction across micro batches,
      and then divides by the accumulationFactor. This is to support legacy
      behaviour and is deprecated.
    """
    Running = 0
    Post = 1
    PostAndLoss = 2


class DataLoaderMode(enum.IntEnum):
    """Select how the data of the dataset is accessed.

    - ``Sync``: Access data synchronously
    - ``Async``: Uses an :py:class:`~poptorch.AsynchronousDataAccessor`
      to access the dataset
    - ``AsyncRebatched``: For iterable datasets by default PyTorch will round
      down the number of elements to a multiple of the combined batch size in
      each worker. When the number of workers is high and/or the batch size
      large this might lead to a significant part of the dataset being
      discarded. In this mode, the
      combined batch size used by the PyTorch workers will be set to 1,
      and the batched tensor will instead be constructed in the
      :py:class:`~poptorch.AsynchronousDataAccessor`.
      This mode is identical to Async for map-style datasets.
    """
    Sync = 0
    Async = 1
    AsyncRebatched = 2


class SharingStrategy(enum.IntEnum):
    """Strategy to use to pass objects when creating new processes.

    - ``SharedMemory``: Spawn new processes and share data using shared memory:
                        Fast but limited availability.
    - ``FileSystem``: Spawn new processes and share data using the file
                      system: slower but larger than memory.
    - ``Fork``: Fork new processes: no data sharing required but might cause
                problems if worker processes use threading.
    - ``ForkServer``: Similar to fork but a server process is used to fork child
                      processes instead. This server process is single-threaded
                      so there are no issues if worker processes use threading.
    """
    SharedMemory = 0
    FileSystem = 1
    Fork = 2
    ForkServer = 3


class OutputMode(enum.IntEnum):
    """Select which results are returned.

    - ``All``: Return a result for each batch.
    - ``Sum``: Return the sum of all the batches
    - ``Final``: Return the last batch.
    - ``EveryN``: Return every N batches. N is passed in as
        `output_return_period`
    - ``Default``: "All" for inference, "Final" for training.
    """
    # Note: the members are not declared in the same order as they are
    # documented above.
    Final = 0
    EveryN = 1
    All = 2
    Sum = 3
    Default = 4


class ConnectionType(enum.IntEnum):
    """Select when to attach to the IPU.

    - ``Always``: Attach to the IPU from the start (Default).
    - ``OnDemand``: Wait until the compilation is complete and the executable is
      ready to be run to attach to the IPU.
    - ``Never``: Never try to attach to an IPU. (Useful for offline compilation,
      but trying to run an executable will raise an exception).
    """
    Always = 0
    OnDemand = 1
    Never = 2


class HalfFloatCastingBehavior(enum.IntEnum):
    """
    (deprecated) Only used for tracing.

    ``HalfUpcastToFloat`` is now the only supported option.
    """
    FloatDowncastToHalf = 0
    HalfUpcastToFloat = 1
    Default = 2


class ReductionType(enum.IntEnum):
    """Select the reduction to apply to a set of values.

    - ``Sum``: Calculate the sum of all values
    - ``Mean``: Calculate the mean of all values
    - ``NoReduction``: Do not reduce
    """
    Sum = 0
    Mean = 1
    NoReduction = 2


class SyncPattern(enum.IntEnum):
    """Select how the IPUs synchronise with each other and with the host.

    - ``Full``: Require all IPUs to synchronise on every communication between
      IPUs or between IPUs and host.
    - ``SinglePipeline``: Allow IPUs to synchronise with the host independently,
      without having to synchronise with each other. This permits any one IPU to
      perform host IO while other IPUs are processing data.
    - ``ReplicaAndLadder``: Allow an IPU group to communicate with the host
      without requiring synchronisation between groups. This permits multiple
      IPU groups to alternate between performing host IO and computation.
    """
    Full = 0
    SinglePipeline = 1
    ReplicaAndLadder = 2


class MatMulSerializationMode(enum.Enum):
    """Which dimension of the matrix multiplication to use for the
    serialization.

    ``Disabled`` (value ``"none"``) is the member to use when no dimension
    is selected.
    """
    # Unlike most enums of this module the values are strings.
    InputChannels = "input_channels"
    ReducingDim = "reducing_dim"
    OutputChannels = "output_channels"
    Disabled = "none"


class Liveness(enum.IntEnum):
    """When using phased execution, select where the tensors are kept
    between the phases:

    - ``AlwaysLive``: The tensors always stay on the IPU between the phases.
    - ``OffChipAfterFwd``: The tensors are sent off the chip at the end of
      the forward pass and before the beginning of the backward pass.
    - ``OffChipAfterFwdNoOverlap``: Same as `OffChipAfterFwd`, except there is
      no overlapping of load and store operations between phases. This makes it
      a more memory-efficient mode at the cost of delayed computation.
    - ``OffChipAfterEachPhase``: The tensors are sent off the chip at the end
      of each phase.
    """
    AlwaysLive = 0
    OffChipAfterFwd = 1
    OffChipAfterFwdNoOverlap = 2
    OffChipAfterEachPhase = 3


class OverlapMode(enum.Enum):
    """Select when the host copies tensors to the IPU.

    - ``NoOverlap``: The host will copy the tensor to the IPU only when
      required: this minimises on-chip memory use at the cost of performance.
    - ``OverlapAccumulationLoop``: The host will preload values for the next
      gradient accumulation iteration onto an IO tile.
    - ``OverlapDeviceIterationLoop``: The host will preload values not just for
      the next gradient accumulation iteration, but the next device iteration,
      onto an IO tile. This may require more IO tiles than the previous setting
      but offers greater performance.
    """
    # The values of this enum are strings.
    NoOverlap = "no_overlap"
    OverlapAccumulationLoop = "overlap_accumulation_loop"
    OverlapDeviceIterationLoop = "overlap_device_iteration_loop"


class AutoStage(enum.IntEnum):
    """Defines how the stages are automatically assigned to blocks when the user
    didn't explicitly provide stages to the ``IExecutionStrategy``'s
    constructor.

    - ``SameAsIpu``: The stage id will be set to the selected IPU number.
    - ``AutoIncrement``: The stage id for new blocks is automatically
      incremented.

    Examples:

    >>> # Block "0"
    >>> with poptorch.Block(ipu_id=0):
    ...  layer()
    >>> # Block "1"
    >>> with poptorch.Block(ipu_id=1):
    ...  layer()
    >>> # Block "2"
    >>> with poptorch.Block(ipu_id=0):
    ...  layer()

    By default, the following execution strategy is used:

    >>> strategy = poptorch.PipelinedExecution(poptorch.AutoStage.SameAsIpu)
    >>> opts.setExecutionStrategy(strategy)

    which would translate to ``stage_id = ipu_id``:

    - Block "0" ipu=0 stage=0
    - Block "1" ipu=1 stage=1
    - Block "2" ipu=0 stage=0

    Now if instead you use:

    >>> strategy = poptorch.PipelinedExecution(poptorch.AutoStage.AutoIncrement)
    >>> opts.setExecutionStrategy(strategy)

    The last block would be in its own stage rather than sharing one with
    Block "0":

    - Block "0" ipu=0 stage=0
    - Block "1" ipu=1 stage=1
    - Block "2" ipu=0 stage=2
    """
    SameAsIpu = 0
    AutoIncrement = 1


class MultiConvPlanType(enum.IntEnum):
    """Selects the execution strategy for a ``poptorch.MultiConv``.

    - ``Parallel``: Execute multiple convolutions in parallel (Default).
    - ``Serial``: Execute each convolution independently. This is
      equivalent to using the independent convolution API.
    """
    Parallel = 0
    Serial = 1


class CommGroupType(enum.IntEnum):
    """Grouping to be used when distributing an input or per-replica variable
       among replicas. See :ref:`grouping_tensor_weights`.

    In the descriptions below ``N`` is the total number of replicas.

    - ``All``: This causes :py:func:`~replicaGrouping` to have no effect, as the
               same variable value is distributed to all replicas. Group count
               is ignored. This is not valid as an input group type.

    - ``Consecutive``: Each replica group is made up of consecutive replicas,
                       So for group size ``k``, the groups would be set up thus:

                       ``{0, 1, ... k-1}, {k, ... 2k-1} ... {N-k, ... N-1}``

    - ``Orthogonal``: Each replica group is made up by slicing the replicas
                      orthogonally to the replica ordering. So for group size
                      ``k``, with group count ``m = N/k``:

                      ``{0, m, 2m, ...}, {1, m+1, 2m+1, ...} ... {m-1, 2m-1,
                      ... N-1}``

    - ``NoGrouping``: Each replica gets its own value of the variable. Group
                      count is ignored.
    """
    All = 0
    Consecutive = 1
    Orthogonal = 2
    NoGrouping = 3


class VariableRetrievalMode(enum.IntEnum):
    """Method to be used when retrieving the value of a grouped variable from
       grouped replicas. See :ref:`grouping_tensor_weights`.

    - ``OnePerGroup``: Return one value for each replica group (takes the value
                       from the first replica in the group).

    - ``AllReplicas``: Return a value from each replica.
    """
    OnePerGroup = 0
    # NOTE(review): the value 1 is skipped - presumably to stay aligned with
    # the values used by the backend; confirm before renumbering.
    AllReplicas = 2


================================================
FILE: python/ops.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
from collections import OrderedDict
from typing import Callable, Dict, List, Union, Tuple, Optional
import torch

from . import enums
from . import poptorch_core
from . import _impl
from ._utils import ATTR_PREFIX, flattenTensorStructure, reconstructTensorStructure

# Handle on the ``end_ipu_block`` op of the ``torch.ops.poptorch`` namespace,
# resolved once when this module is imported.
_end_ipu_block = torch.ops.poptorch.end_ipu_block


def ctc_beam_search_decoder(probs: "torch.Tensor",
                            lengths: "torch.Tensor",
                            blank: int = 0,
                            beam_width: int = 100,
                            top_paths: int = 1) -> List["torch.Tensor"]:
    """Add a connectionist temporal classification (CTC) beam search decoder
       to the model.

    Calculates the most likely top paths and their probabilities given the
    input logarithmic probabilities and the data lengths.

    :param probs: Logarithmic probabilities tensor with the shape
                               of [input_length, batch_size, num_classes].
    :param lengths: Tensor representing lengths of the inputs
                                 of shape [batch_size].
    :param blank: Integer identifier of the blank class (default: 0).
    :param beam_width: Number of beams used during decoding (default: 100).
    :param top_paths: Number of most likely paths to return (default: 1).
    :returns: Three tensors representing paths' probabilities - of shape
              [batch_size, top_paths], paths' lengths - of shape
              [batch_size, top_paths] and the decoded paths - of shape
              [batch_size, top_paths, input_length].
    """
    # Both tensor arguments are validated the same way, ``probs`` first.
    for arg_name, arg in (("probs", probs), ("lengths", lengths)):
        if isinstance(arg, torch.Tensor):
            continue
        raise _impl.createPoptorchError(
            f"ctc_beam_search_decoder: {arg_name} must be a torch.tensor "
            f"argument. {type(arg)} is not supported.")

    decoder = torch.ops.poptorch.ctc_beam_search_decoder
    return decoder(probs, lengths, blank, beam_width, top_paths)


def ipu_print_tensor(tensor: "torch.Tensor",
                     title: str = "",
                     print_gradient: bool = True,
                     summarise_threshold: int = 1000,
                     edge_items: int = 3,
                     max_line_width: int = 80,
                     digits: int = 4,
                     float_format: str = "auto",
                     separator: str = ", ",
                     open_bracket: str = "(",
                     close_bracket: str = ")") -> "torch.Tensor":
    """Adds an op to print the contents of the IPU tensor.

    When this is executed the tensor
    will be copied back to host and printed.

    When this operation is called in the backward pass it
    will print the gradient of the tensor.

    The operation is an identity operation and will return the exact same
    tensor. The returned tensor must be used in place of the original tensor
    in the rest of the program, to make sure that the print operation isn't
    optimised away.

    For example, if the original code looks like this:

    .. code-block:: python

      def forward(self, c, d, b):
        a = c + d
        return a + b

    If the result of ``ipu_print_tensor()`` is not used, the function will be
    optimised out by the graph optimiser and the tensor will not be printed.

    So if you want to print the value of `a`, you should do:

    .. code-block:: python

      def forward(self, c, d, b):
        a = c + d
        x = poptorch.ipu_print_tensor(a)
        return x + b

    Optionally, you can add a second string argument to be used as a title, as
    shown in the following example.
    The value of `a` will be printed after the title "summation". The value of
    the gradient of `a` will be printed after the title "summation_gradient" if
    the operation is called in the backward pass.

    .. code-block:: python

      def forward(self, c, d, b):
          a = c + d
          x = poptorch.ipu_print_tensor(a, "summation")
          return x + b


    .. warning::
       To prevent the print operation being optimised out by the graph
       optimiser, you must use the output of the print.

    :param tensor: The tensor to print.
    :param title: An optional title to print before the tensor value.
        Defaults to "".
    :param print_gradient: Whether to print the gradient tensor associated
        with this tensor. Defaults to True.
    :param summarise_threshold: If the number of elements of the
        tensor exceeds this threshold the output will be summarised. Only the
        edge elements will be displayed with an ellipsis indicating skipped
        elements. A value of 0 will disable summarisation. Defaults to 1000.
    :param edge_items: Number of edge elements to include at the
        beginning and end when summarisation is enabled. Defaults to 3.
    :param max_line_width: Lines longer than this limit will be split
        across multiple lines. A value of 0 will disable line splitting.
        Defaults to 80.
    :param digits: Number of digits to display. For integers this limit can be
        exceeded if any number is large enough. For floating points this does
        not include the exponent. The number of digits is used in conjunction
        with analysis of the tensor to determine the width of each element to
        align all elements when printed. A value of 0 disables this analysis
        and each element will be printed in an unaligned format.
        Defaults to 4.
    :param float_format: Determines the floating point format to use. Automatic
        mode determines the appropriate format based on the data.
        Defaults to "auto".
        One of:

        * "auto": Automatically determine the format through analysis.
        * "fixed": Use fixed point e.g. -100.00.
        * "scientific": Use scientific notation e.g. -1.123e+10.
        * "none": Do not display all elements with the same format.
    :param separator: String used to delineate values. Defaults to ", ".
    :param open_bracket: String used to open a tensor. Defaults to "(".
    :param close_bracket: String used to close a tensor. Defaults to ")".
    :returns: The input tensor unchanged.
    :raises: A PopTorch error (built by ``_impl.createPoptorchError``) if
        ``tensor`` is not a ``torch.Tensor`` or if ``float_format`` is not one
        of the supported format names.
    """
    if not isinstance(tensor, torch.Tensor):
        raise _impl.createPoptorchError(
            "ipu print tensor must take a torch.tensor argument. "
            f"{type(tensor)} is not supported.")

    # The op takes the float format as an integer code.
    float_format_dict = {"auto": 0, "fixed": 1, "scientific": 2, "none": 3}
    if float_format not in float_format_dict:
        # Report the accepted names rather than surfacing a bare KeyError
        # from the lookup below.
        raise _impl.createPoptorchError(
            "ipu_print_tensor: float_format must be one of "
            f"{list(float_format_dict)}. {float_format!r} is not supported.")

    # print_gradient is forwarded to the op as an int (0 or 1).
    return torch.ops.poptorch.ipu_print_tensor(tensor, title,
                                               int(print_gradient),
                                               summarise_threshold, edge_items,
                                               max_line_width, digits,
                                               float_format_dict[float_format],
                                               separator, open_bracket,
                                               close_bracket)


def for_loop(count: int,
             body: Callable[[List['torch.Tensor']], List['torch.Tensor']],
             inputs: List['torch.Tensor']) -> List['torch.Tensor']:
    """Run ``body`` as an on-device loop for ``count`` iterations.

    ``body`` is a Python function holding the PyTorch code to be repeated.
    It must accept as many tensors as it returns, because the outputs of one
    iteration are fed back in as the inputs of the next one.

    :param count: Number of iterations of the loop.
    :param body: The function to be executed.
    :param inputs: The initial inputs to the function. Must be a ``list`` whose
        elements are all ``torch.Tensor``.
    :returns: Whatever the ``end_for_loop`` op returns (presumably the outputs
        of the last iteration).
    :raises ValueError: If ``inputs`` is not a list, or if one of its elements
        is not a ``torch.Tensor``.
    """
    if not isinstance(inputs, list):
        raise ValueError(("poptorch.for_loop expects input tensors (inputs)"
                          " to be a list of tensors. (Object is not list)"))

    # Report the position of the first element that is not a tensor.
    bad_index = next((position for position, item in enumerate(inputs)
                      if not isinstance(item, torch.Tensor)), None)
    if bad_index is not None:
        raise ValueError(
            ("poptorch.for_loop expects input tensors (inputs) to be"
             " a list of tensors. (Object contained in list at index"
             " %d is not torch.tensor)") % bad_index)

    # The body receives its inputs by value, not by reference: work on
    # clones so that the IR reflects this.
    loop_inputs = [item.clone() for item in inputs]

    # Open the loop, trace the body once, then close the loop.
    torch.ops.poptorch.start_for_loop(loop_inputs)
    loop_outputs = body(*loop_inputs)
    if not isinstance(loop_outputs, (list, tuple)):
        # A single returned value is normalised to a one-element list.
        loop_outputs = [loop_outputs]

    return torch.ops.poptorch.end_for_loop(loop_outputs, loop_inputs, count)


def cond(condition: 'torch.Tensor',
         then_body: Callable[[List['torch.Tensor']], List['torch.Tensor']],
         then_inps: List['torch.Tensor'],
         else_body: Callable[[List['torch.Tensor']], List['torch.Tensor']],
         else_inps: List['torch.Tensor']) -> List['torch.Tensor']:
    """An on-device if/else operation. Two branches of instructions are
    created and executed conditionally on the device. Only for inference.

    `then_body` and `else_body` are Python functions containing the PyTorch
    code to run conditionally on the device. The condition is given as a
    boolean `Tensor` and the branch to run is chosen at runtime, directly on
    the device. The branch functions are subject to these rules:

    * `then_body` and `else_body` can accept an arbitrary number of inputs
      (including zero).
    * Tensors defined in the `cond` caller (the outer graph) can be used inside
      `then_body` and `else_body` implicitly, just as if they had been passed
      through the inputs list.
    * `then_body` and `else_body` have to return the same number of
      corresponding outputs, because the result of the `cond` op is assigned
      to a common list of tensors.
    * All the tensors used by `then_body` and `else_body` are passed in by
      copy, so updating any of them inside a body does not affect the original
      tensor. To update a tensor passed in, return its new value from the body
      and assign it to the original tensor (keeping in mind that the number of
      outputs of both bodies has to match).

    :param condition: The condition controlling the execution of `then_body` and
        `else_body`.
    :param then_body: The function to be executed if `condition` is True.
    :param then_inps: `then_body` input tensors.
    :param else_body: The function to be executed if `condition` is False.
    :param else_inps: `else_body` input tensors.
    :returns: When not running on the IPU, the outputs of the selected branch
        as a list. Otherwise whatever the ``end_if_block`` op returns.
    :raises ValueError: If `then_inps` or `else_inps` is not a list.
    """
    both_are_lists = (isinstance(then_inps, list)
                      and isinstance(else_inps, list))
    if not both_are_lists:
        raise ValueError(
            ("poptorch.cond expects then_inps and else_inps tensors"
             " to be a list of tensors. (Object is not list)"))

    if not _impl.isRunningOnIpu():
        # CPU execution path: simply run the branch picked by the condition.
        if condition:
            chosen_body, chosen_inps = then_body, then_inps
        else:
            chosen_body, chosen_inps = else_body, else_inps
        branch_result = chosen_body(*chosen_inps)
        if isinstance(branch_result, torch.Tensor):
            return [branch_result]
        return list(branch_result)

    # Work on a clone of the condition so that the IR reflects the fact that
    # it is passed by value rather than by reference.
    condition_copy = condition.clone()

    # "then" branch: open the if block and trace the body.
    torch.ops.poptorch.start_if_block(condition_copy)
    then_outputs = then_body(*then_inps)
    if not isinstance(then_outputs, (list, tuple)):
        then_outputs = [then_outputs]

    # "else" branch: switch block and trace the other body.
    torch.ops.poptorch.start_else_block(then_outputs)
    else_outputs = else_body(*else_inps)
    if not isinstance(else_outputs, (list, tuple)):
        else_outputs = [else_outputs]

    return torch.ops.poptorch.end_if_block(else_outputs, condition_copy)


def nop(tensor: "torch.Tensor") -> "torch.Tensor":
    """A no-operation.

    Functionally this is an identity, but PopART patterns and inlining never
    eliminate it, which makes it useful for debugging.

    :param tensor: The tensor to pass to the no-op.
    :returns: The same tensor which was input.
    :raises: A PopTorch error (built by ``_impl.createPoptorchError``) if
        ``tensor`` is not a ``torch.Tensor``.
    """
    if isinstance(tensor, torch.Tensor):
        return torch.ops.poptorch.nop(tensor)

    raise _impl.createPoptorchError(
        f"nop must take a torch.tensor argument. {type(tensor)} is not "
        "supported.")


def dynamic_slice(tensor: "torch.Tensor", dim: int, start: "torch.Tensor",
                  size: int, step: int) -> "torch.Tensor":
    """Torch native dynamic slices can't be properly intercepted by backends,
    so this op is provided to enable dynamic slicing in poptorch applications.

    :param tensor: The tensor to slice.
    :param dim: The dimension to slice along.
    :param start: The start index.
    :param size: The slice size. Must be a constant int.
    :param step: The slice step. Must be a constant int.
    :returns: The sliced tensor.
    :raises: A PopTorch error (built by ``_impl.createPoptorchError``) if
        ``tensor`` or ``start`` is not a ``torch.Tensor``, or if ``dim``,
        ``size`` or ``step`` is not an ``int``.
    """
    if not isinstance(tensor, torch.Tensor):
        raise _impl.createPoptorchError(
            f"dynamic_slice must take a torch.tensor input. {type(tensor)} is "
            "not supported.")
    if not isinstance(dim, int):
        raise _impl.createPoptorchError("Dimension must be an integer.")
    if not isinstance(start, torch.Tensor):
        # Report the type of the offending argument (start), not of the
        # tensor being sliced, which is known to be valid at this point.
        raise _impl.createPoptorchError(
            "Slice start argument to dynamic_slice must be a torch.tensor. "
            f"{type(start)} is not supported.")
    if not isinstance(size, int):
        raise _impl.createPoptorchError("Size must be an integer.")
    if not isinstance(step, int):
        raise _impl.createPoptorchError("Step must be an integer.")
    return torch.ops.poptorch.dynamic_slice(tensor, dim, start, size, step)


def dynamic_update(input: "torch.Tensor", src: "torch.Tensor", dim: int,
                   start: "torch.Tensor", size: int) -> "torch.Tensor":
    """Torch native dynamic slices can't be properly intercepted by backends,
    so this op is provided to enable dynamic update slice in poptorch
    applications.

    :param input: The tensor to update.
    :param src: The tensor to embed into `input`. Must have the same number of
        dimensions and the same dtype as `input`.
    :param dim: The dimension to slice along.
    :param start: The start index.
    :param size: The slice size. Must be a constant int.
    :returns: The tensor returned by the ``dynamic_update`` op.
    :raises: A PopTorch error (built by ``_impl.createPoptorchError``) if an
        argument has the wrong type, or if `input` and `src` differ in
        dimensionality or dtype.
    """
    # (value, expected type, error message), checked in this exact order so
    # that the first invalid argument is the one reported.
    type_checks = (
        (input, torch.Tensor,
         f"dynamic_update must take a torch.tensor input. {type(input)} is "
         "not supported."),
        (dim, int, "Dimension must be an integer."),
        (start, torch.Tensor,
         "Slice start argument to dynamic_update must be a torch.tensor. "
         f"{type(start)} is not supported."),
        (src, torch.Tensor,
         "Src argument to dynamic_update must be a torch.tensor. "
         f"{type(src)} is not supported."),
        (size, int, "Size must be an integer."),
    )
    for value, expected_type, message in type_checks:
        if not isinstance(value, expected_type):
            raise _impl.createPoptorchError(message)

    # Both operands are tensors from here on: compare their layouts.
    input_rank = input.dim()
    src_rank = src.dim()
    if input_rank != src_rank:
        raise _impl.createPoptorchError(
            "input and src tensors must have same dimensionality. "
            f"({input_rank}) vs ({src_rank})")

    if input.dtype != src.dtype:
        raise _impl.createPoptorchError(
            "input and src tensor must have same dtype. "
            f"({input.dtype} vs {src.dtype})")

    return torch.ops.poptorch.dynamic_update(input, src, dim, start, size)


def recomputationCheckpoint(*tensors: List["torch.Tensor"]
                            ) -> List["torch.Tensor"]:
    """Operation for checkpointing values in a computational pipeline stage.

    When recomputation is enabled, these values will not be recomputed and they
    will be stored in memory between forward and backwards passes instead.

    :param tensors: One or more tensors which should be check-pointed. A
        single list or tuple of tensors is also accepted.
    :return: Tensors (same number and shape as the input tensors). A single
        tensor argument yields a single tensor, several tensor arguments
        yield a tuple, and a single list/tuple argument yields a container of
        the same type.
    :raises ValueError: If one of the inputs is not a ``torch.Tensor``.
    """

    # Allow passing a single list or tuple
    if len(tensors) == 1 and isinstance(tensors[0], (tuple, list)):
        container = tensors[0]
        checkpointed = recomputationCheckpoint(*container)
        if len(container) == 1 and isinstance(container[0], torch.Tensor):
            # For exactly one tensor the recursive call returns the bare
            # tensor. Wrap it again, otherwise rebuilding the container below
            # would iterate over the tensor's first dimension instead of
            # producing a one-element container.
            checkpointed = (checkpointed, )
        return type(container)(checkpointed)

    out = []
    for t_in in tensors:
        if not isinstance(t_in, torch.Tensor):
            raise ValueError("All inputs must be tensors")

        out.append(torch.ops.poptorch.recomputation_checkpoint(t_in))

    if len(out) == 1:
        return out[0]

    # Return a tuple by default since PopTorch does not support list inputs
    return tuple(out)


def serializedMatMul(lhs: "torch.Tensor",
                     rhs: "torch.Tensor",
                     mode: "poptorch.MatMulSerializationMode",
                     factor: int = 0,
                     keep_precision: bool = False) -> "torch.Tensor":
    """Compute a matrix product with a serialized matrix multiplication.

    The multiplication ``lhs*rhs`` is broken down into smaller
    multiplications which are calculated one after the other, lowering the
    memory needed by the multiplication and by its gradient calculation.

    :param lhs: Left-hand side input matrix.
    :param rhs: Right-hand side input matrix.
    :param mode: Which dimension of the matmul
        to serialize on: for matrix A (m by n) multiplied by matrix B (n by p).

        * InputChannels: Split across the input channels (dimension m).
        * ReducingDim: Split across the reducing dimension (n).
        * OutputChannels: Split across the output channels (dimension p).
        * Disabled: Same as an ordinary matrix multiplication.
    :param factor: Number of serialized multiplications. Must be a factor of
        the dimension to serialize on.
    :param keep_precision: (Half/float16 inputs only) The forward op when
        serializing over ReducingDim and the backwards ops when serializing over
        InputChannels involve an addition step. If ``keep_precision`` is True,
        these additions will occur using float32 rather than half precision
        partials, matching those used for the individual matrix multiplications.
    :returns: The value returned by the ``set_matmul_serialization`` op applied
        to ``torch.matmul(lhs, rhs)``.
    """
    # Argument sanity checks (plain asserts, in the same order as before).
    assert isinstance(keep_precision, bool)
    assert isinstance(factor, int)
    assert isinstance(mode, enums.MatMulSerializationMode)

    # Emit an ordinary matmul, then annotate its output with the requested
    # serialization settings.
    product = torch.matmul(lhs, rhs)
    annotate = torch.ops.poptorch.set_matmul_serialization
    return annotate(product, mode.value, factor, keep_precision)


def set_available_memory(tensor: "torch.Tensor",
                         available_memory_proportion: float) -> "torch.Tensor":
    """Sets the amount of temporary memory made available to an operation.

    Operators which can be tuned with this setting include:

    * convolution
    * matrix multiplication
    * embedding lookups
    * indexing operations

    Applied to the output of a supported operation, the setting controls the
    trade-off between execution cycles and the temporary memory used while
    the operation executes.

    The value should be between 0 and 1 (inclusive) and is a proportion of the
    memory available on the IPU. The default value is 0.6 (so, by default,
    PopTorch will not use more than 60% of IPU memory for temporary data).

    PopTorch hands this setting to the PopLibs operator planner, which tries
    to keep the use of temporary memory below this value. Generally, an
    operation with more temporary memory available runs in fewer cycles.

    A specific operation may need more temporary memory than the amount
    specified by this option, in which case a warning message is generated.

    For more information, please refer to the `technical note
    <https://docs.graphcore.ai/projects/available-memory/en/latest/>`_ on
    optimising temporary memory usage.

    >>> class BasicNetwork(nn.Module):
    ...     def __init__(self):
    ...         super().__init__()
    ...         self.conv = nn.Conv2d(4, 4, 3, stride=2)
    ...
    ...     def forward(self, x):
    ...         out = self.conv(x)
    ...         out = poptorch.set_available_memory(out, 0.2)
    ...         return out

    :param tensor: Output tensor from a supported operation (otherwise the
        statement will be an identity).
    :param available_memory_proportion: Proportion between 0.0 and 1.0
        of tile memory to be made available for temporary memory (default 0.6).
    :returns: The input tensor, as if calling an identity function.
    :raises: A PopTorch error (built by ``_impl.createPoptorchError``) if
        ``tensor`` is not a ``torch.Tensor``.
    """
    if isinstance(tensor, torch.Tensor):
        memory_op = torch.ops.poptorch.set_available_memory
        return memory_op(tensor, available_memory_proportion)

    raise _impl.createPoptorchError(
        "You may only set available memory for torch.tensor values. "
        f"{type(tensor)} is not supported.")


def set_overlap_for_input(input_tensors, mode: "poptorch.OverlapMode"):
    """Sets host overlap setting for input_tensors.

    Overlapping the copies from the host to the IPUs with computation can
    increase performance in some cases. It does however require a number of
    IPU tiles to be set aside as IO tiles using
    :py:func:`~poptorch.options._TensorLocationOptions.numIOTiles`, which may
    affect computation performance.

    Call this function at the start of your model's `forward` method for each
    applicable input, and use the returned tensors in subsequent ops.

    :param input_tensors: The input tensors for which to enable overlapping
      host IO. This can be either a single tensor, or any combination of tuple,
      list, or dict of tensors.
    :param mode: Control to what extent the host IO overlaps computation.
    :returns: the input tensors, specified for overlap.
    :raises: A PopTorch error (built by ``_impl.createPoptorchError``) if one
      of the flattened elements is not a ``torch.Tensor``.

    .. seealso:: :py:class:`~poptorch.OverlapMode`.
    """

    def _mark_input(candidate):
        # Only tensors can be annotated; anything else is rejected.
        if isinstance(candidate, torch.Tensor):
            return torch.ops.poptorch.set_overlap_for_input(
                candidate, mode.value)
        raise _impl.createPoptorchError(
            "You may only set overlap for torch.tensor inputs. "
            f"{type(candidate)} is not supported.")

    # Flatten the structure, annotate each leaf (lazily, via map), then
    # rebuild a structure shaped like the one which was passed in.
    leaves = flattenTensorStructure(input_tensors)
    annotated = map(_mark_input, leaves)
    return reconstructTensorStructure(input_tensors, annotated)


def set_overlap_for_output(output_tensors, mode: "poptorch.OverlapMode"):
    """Sets host overlap setting for output_tensors.

    Overlapping the copies from the IPUs to the host with computation can
    increase performance in some cases. It does however require a number of
    IPU tiles to be set aside as IO tiles using
    :py:func:`~poptorch.options._TensorLocationOptions.numIOTiles`, which may
    affect computation performance.

    Call this function at the end of your model's `forward` method, for each
    applicable output, just before returning the tensors.

    :param output_tensors: The output tensors to enable overlapping host
      IO for. This can be either a single tensor, or any combination of tuple,
      list, or dict of tensors.
    :param mode: Control to what extent the host IO overlaps computation.
    :returns: the output tensors, specified for overlap.
    :raises: A PopTorch error (built by ``_impl.createPoptorchError``) if one
      of the flattened elements is not a ``torch.Tensor``.

    .. seealso:: :py:class:`~poptorch.OverlapMode`.
    """

    def _mark_output(candidate):
        # Only tensors can be annotated; anything else is rejected.
        if isinstance(candidate, torch.Tensor):
            return torch.ops.poptorch.set_overlap_for_output(
                candidate, mode.value)
        raise _impl.createPoptorchError(
            "You may only set overlap for torch.tensor outputs. "
            f"{type(candidate)} is not supported.")

    # Flatten the structure, annotate each leaf (lazily, via map), then
    # rebuild a structure shaped like the one which was passed in.
    leaves = flattenTensorStructure(output_tensors)
    annotated = map(_mark_output, leaves)
    return reconstructTensorStructure(output_tensors, annotated)


def _assertIdIsValid(name, value, expected_type):
    """Assert that ``value`` is usable as an identifier.

    A value is accepted when it is an instance of ``expected_type`` or when it
    is an ``int`` greater than or equal to zero.

    :param name: Name of the checked argument, used in the assertion message.
    :param value: The value to validate.
    :param expected_type: The non-integer type which is also accepted.
    :raises AssertionError: If ``value`` satisfies neither condition.
    """
    if isinstance(value, expected_type):
        return
    assert isinstance(value, int) and value >= 0, (
        f"{name} must be either a positive integer or a "
        f"{expected_type.__name__}")


# The next two classes do not implement the forward method
# pylint: disable=abstract-method


class Block(torch.nn.Module):
    """ A context manager to define blocks of the model.

    ``Block`` is meant to be used with Python's "with" statement:

    >>> with poptorch.Block("Encoder"):
    ...     self.layer = MyLayer(x)

    Every layer called inside this scope runs on the specified IPU, if one is
    specified. Multiple blocks can in addition be combined into a stage.

    .. seealso:: :py:meth:`~poptorch.Options.setExecutionStrategy`

    """
    # Set by the ExecutionStrategy before the graph is traced. While it is
    # None the graph is being executed on the CPU, in which case
    # ``useAutoId()`` and ``start()`` do nothing.
    _stages_manager = None

    @staticmethod
    def useAutoId():
        """Call this method at the beginning of your ``forward()`` method to
        enable automatic block ID generation.

        Blocks with a None ``user_id`` will be assigned an automatic ID
        which will be the index of this block in the list of ID-less Blocks.

        >>> poptorch.Block.useAutoId()
        >>> with poptorch.Block(): # user_id = "0"
        ...     layer()
        >>> with poptorch.Block("special_block"): # user_id = "special_block"
        ...     layer()
        >>> with poptorch.Block(): # user_id = "1"
        ...     layer()
        """
        manager = Block._stages_manager
        if manager is None:
            return
        manager.resetAutoId()

    @staticmethod
    def start(user_id: Optional[str] = None, ipu_id: Optional[int] = None):
        """Begin a stage on the current stages manager, if there is one.

        :param user_id: Identifier of the block, forwarded to ``beginStage``.
        :param ipu_id: ID of the IPU, forwarded to ``beginStage``.
        """
        manager = Block._stages_manager
        if manager is None:
            return
        manager.beginStage(user_id, ipu_id)

    def __init__(self,
                 user_id: Optional[str] = None,
                 ipu_id: Optional[int] = None):
        """

        :param user_id: A user defined identifier for the block.
            Blocks with the same ID are considered as being a single block.
            Block identifiers are also used to manually specify pipelines or
            phases.
        :param ipu_id: The ID of the IPU to run on.
                       Note that the ``ipu_id`` is an index
                       in a multi-IPU device within PopTorch, and is
                       separate and distinct from the device ids used by
                       ``gc-info``.
        """
        super().__init__()
        self._user_id = user_id
        self._ipu_id = ipu_id

    def __enter__(self):
        # Entering the scope starts the block; nothing is bound by "as".
        Block.start(user_id=self._user_id, ipu_id=self._ipu_id)

    def __exit__(self, type, value, traceback):
        # Leaving the scope always closes the block; the exception details
        # are not inspected.
        _end_ipu_block()


# Used to allow BeginBlock to be used with a function
class LegacyBeginBlockFn(torch.nn.Module):
    """Module wrapping a callable so that calling it begins a block first.

    :param layer_to_call: The callable to invoke.
    :param user_id: Identifier of the block. If None, an automatic ID is
        requested from the stages manager on the first call made while a
        stages manager is set.
    :param ipu_id: ID of the IPU, forwarded to ``beginStage``.
    """

    def __init__(self, layer_to_call, user_id=None, ipu_id=None):
        super().__init__()
        self._user_id = user_id
        self._layer_to_call = layer_to_call
        self._ipu_id = ipu_id

    def __call__(self, *args, **kwargs):
        manager = Block._stages_manager
        # Without a stages manager the wrapped callable is simply invoked.
        if manager is not None:
            if self._user_id is None:
                # The generated ID is stored, so later calls reuse it.
                self._user_id = manager.nextAutoId()
            manager.beginStage(self._user_id, self._ipu_id)
        return self._layer_to_call(*args, **kwargs)


class _BlockHook():
    """ A hook to define the blocks of the model.

    ``_BlockHook`` is designed to be registered as a forward_pre_hook of a
    ``torch.nn.Module``:
    >>> m.register_forward_pre_hook(_BlockHook(user_id, ipu_id))

    Every layer called after the hook has run will be run on the specified
    IPU, if one is specified. Multiple blocks can in addition be combined into
    a stage.

    .. seealso:: :py:meth:`~poptorch.Options.setExecutionStrategy`
    """

    def __init__(self, user_id, ipu_id) -> None:
        super().__init__()
        self._user_id = user_id
        self._ipu_id = ipu_id

    def __call__(self, module, input):
        manager = Block._stages_manager
        if manager is None:
            # No stages manager set: the hook has nothing to do.
            return
        if self._user_id is None:
            # The generated ID is stored, so later calls reuse it.
            self._user_id = manager.nextAutoId()
        manager.beginStage(self._user_id, self._ipu_id)

    def __repr__(self):
        return f"BeginBlock(user_id={self._user_id}, ipu_id={self._ipu_id})"


def removeBlocks(module):
    """Recursively remove BeginBlock annotations from a Module if it
    contains any.

    For every module yielded by ``module.modules()``, the forward pre-hooks
    dictionary is rebuilt without the ``_BlockHook`` entries; other hooks are
    kept in their original order.

    :param torch.nn.Module module: Module to recursively unwrap.
    """
    assert isinstance(module, torch.nn.Module)
    for submodule in module.modules():
        # pylint: disable=protected-access
        kept_hooks = OrderedDict(
            (handle_id, hook)
            for handle_id, hook in submodule._forward_pre_hooks.items()
            if not isinstance(hook, _BlockHook))
        submodule._forward_pre_hooks = kept_hooks


def BeginBlock(layer_to_call: torch.nn.Module,
               user_id: str = None,
               ipu_id: int = None) -> torch.nn.Module:
    """
    Define a block by modifying an existing PyTorch module.

    This is used with an existing PyTorch module instance, for example:

    >>> poptorch.BeginBlock(myModel.a_layer)
    >>> poptorch.BeginBlock(MyNewLayer())

    The module and all of its sub-modules belong to this block until a
    sub-module is modified to be in another block. If an IPU is specified, the
    module and its submodules will also run on that IPU.

    You can combine multiple blocks into a stage.

    :param layer_to_call: PyTorch module to assign to the block.
    :param user_id: A user defined identifier for the block.
            Blocks with the same ID are considered as being a single block.
            Block identifiers are also used to manually specify pipelines or
            phases.
    :param ipu_id: The ID of the IPU to run on.
                   Note that the ``ipu_id`` is an index in a multi-IPU device
                   within PopTorch, and is separate and distinct from the device
                   IDs used by ``gc-info``.
    :returns: ``layer_to_call`` itself when it is a ``torch.nn.Module``;
        a ``LegacyBeginBlockFn`` wrapper when it is another kind of callable.
    :raises: A PopTorch error (built by ``_impl.createPoptorchError``) if
        ``layer_to_call`` is neither a module nor a callable, or if the module
        already carries a block hook.

    .. seealso:: :py:meth:`~poptorch.Options.setExecutionStrategy`
    """
    if isinstance(layer_to_call, torch.nn.Module):
        # A module can only be assigned to one block: refuse to register a
        # second block hook on it.
        # pylint: disable=protected-access
        for registered_hook in layer_to_call._forward_pre_hooks.values():
            if isinstance(registered_hook, _BlockHook):
                raise _impl.createPoptorchError(
                    "module has already been assigned to a block.")

        layer_to_call.register_forward_pre_hook(_BlockHook(user_id, ipu_id))

        # The module is modified in place, so returning it is not strictly
        # needed; it is done for backward compatibility.
        return layer_to_call

    # Previously, the function returned a new model so would work for any
    # callable. This was never documented but should still be permitted to
    # work.
    if callable(layer_to_call):
        return LegacyBeginBlockFn(layer_to_call, user_id, ipu_id)

    raise _impl.createPoptorchError(
        "module is not an instance of torch.nn.Module or " + "function.")


# pylint: enable=abstract-method


def BlockFunction(user_id: Optional[str] = None, ipu_id: Optional[int] = None):
    """ A decorator to define blocks of the model.

    You can use ``BlockFunction`` as a decorator for an existing function, as
    follows:

    >>> @BlockFunction("Decoder", ipu_id=1)
    ... def decoder(self, encoder_output):
    ...     self.decoder_b1(encoder_output)

    All layers inside the function and any functions called by the function will
    run on the specified IPU, if one is specified. In addition, you can combine
    multiple blocks into a stage.

    The decorated function keeps its original metadata (``__name__``,
    ``__doc__``, ...).

    :param user_id: A user defined identifier for the block.
        Blocks with the same ID are considered as being a single block.
        Block identifiers are also used to manually specify pipelines or
        phases.
    :param ipu_id: The ID of the IPU to run on.
                   Note that the ``ipu_id`` is an index
                   in a multi-IPU device within PopTorch, and is
                   separate and distinct from the device IDs used by
                   ``gc-info``.
    :returns: A decorator which wraps a function so that each call runs
        inside ``with Block(user_id, ipu_id)``.

    .. seealso:: :py:meth:`~poptorch.Options.setExecutionStrategy`
    """
    # Imported locally so that this block does not depend on the module's
    # top-level imports.
    import functools

    def decorator(func):
        # functools.wraps copies the name, docstring and other metadata of
        # the decorated function onto the wrapper.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            with Block(user_id, ipu_id):
                return func(*args, **kwargs)

        return wrapper

    return decorator


# Store all attributes to prevent garbage collection
# NOTE(review): presumably appended to by custom_op() below; confirm there.
attributes_lists: List[Dict[str, Union[float, int, str, list, tuple]]] = []


def custom_op(inputs: Tuple["torch.Tensor"],
              name: str,
              domain: str,
              domain_version: int,
              example_outputs: Tuple["torch.Tensor"],
              attributes: Optional[
                  Dict[str, Union[float, int, str, list, tuple]]] = None
              ) -> List["torch.Tensor"]:
    """Applies a custom operation, implemented within PopART, to the inputs.

    :param tuple inputs: A tuple of input tensors, for example, (x, y).
    :param str name: Unique name of the PopART custom op.
    :param str domain: Domain for the op.
    :param int domain_version: Version of the domain to use.
    :param iterable example_outputs: A tuple of tensors with the same type
        and shape as the outputs. The value does not matter as all values will
        be set to zero for tracing purposes.
    :param dict attributes: A dictionary of attributes for the custom op. All
        attribute keys must be strings. All attribute values must be floats,
        ints, strings, or a list/tuple containing only floats, only ints or only
        strings (not a mix of types within the list).

    :returns: The outputs of the forward op of the custom op.
    :raises ValueError: If an attribute key is not a string, if an attribute
        value has an unsupported type, if a list/tuple attribute mixes
        types, or if a key or string value contains non-ASCII characters.
    """
    # Dead code which will get eliminated but will safely allow the same
    # input to be provided to example_output (since it is only supposed
    # to be a template). Otherwise the compiler may recognise the alias.
    transformed_outputs = [
        torch.zeros_like(output,
                         requires_grad=output.requires_grad,
                         device=output.device) for output in example_outputs
    ]

    if attributes is not None:
        # Validate the keys and the value types of every attribute first.
        for k, v in attributes.items():
            if not isinstance(k, str):
                raise ValueError("All attribute keys must be strings.")
            if not isinstance(v, (float, int, str, list, tuple)):
                raise ValueError("Attribute values must be floats, ints, "
                                 "strings or a list/tuple of float, ints of "
                                 "strings.")

            if isinstance(v, (list, tuple)):
                for element in v:
                    if not isinstance(element, type(v[0])):
                        raise ValueError("The types in a list/tuple "
                                         "attribute must all be the same.")

        # Non-ascii cannot be converted to std::string in C++
        def error_on_non_ascii(s):
            if isinstance(s, (list, tuple)):
                for item in s:
                    error_on_non_ascii(item)
                return

            if isinstance(s, str) and any(ord(ch) >= 128 for ch in s):
                raise ValueError(f"{s} contains non-ASCII characters.")

        for k in attributes:
            error_on_non_ascii(k)

        for v in attributes.values():
            error_on_non_ascii(v)

        # The id should not change between traces, so we need to re-use any
        # attribute dictionaries. This more complicated because equality of
        # values is insufficient: [1, 2, 3] == [1.0, 2.0, 3.0]
        def same_attribute_types(candidate_att, search_attr):
            if candidate_att.keys() != search_attr.keys():
                return False

            for key, candidate in candidate_att.items():
                search = search_attr[key]
                # Exact type identity is intended here (not isinstance):
                # True == 1 and 1 == 1.0 but they are different attribute
                # types.
                if type(candidate) is not type(search):
                    return False
                if isinstance(candidate, (list, tuple)):
                    # Compare element by element instead of looking only at
                    # index 0 so that empty sequences are handled too.
                    if any(
                            type(c) is not type(s)
                            for c, s in zip(candidate, search)):
                        return False
            return True

        for attrib_cand in attributes_lists:
            if attrib_cand != attributes:
                continue

            # Equality does not imply same types
            if not same_attribute_types(attrib_cand, attributes):
                continue

            # Re-use the stored dictionary so that its id is used below.
            attributes = attrib_cand
            break
        else:
            # First time this set of attributes is seen: keep it alive.
            attributes_lists.append(attributes)

    # NB None is a singleton in Python
    attributes_id_str = f"{ATTR_PREFIX}{hex(id(attributes))}"

    return torch.ops.poptorch.custom_operation(inputs, name, domain,
                                               domain_version,
                                               len(transformed_outputs),
                                               transformed_outputs,
                                               attributes_id_str)


class CPU:
    """Wrap a callable so it runs on the CPU in the middle of an inference
    IPU graph.

    .. important:: CPU ops are only supported in inference graphs.

    Example:

    >>> class Model(torch.nn.Module):
    >>>     def __init__(self):
    >>>         super().__init__()
    >>>         self.cpu = poptorch.CPU(self.myCpuOp, "MyCPUOp")
    >>>
    >>>     def myCpuOp(self, x):
    >>>         return x * 2.0
    >>>
    >>>     def forward(self, x):
    >>>         # The arguments passed to "cpu" are forwarded to "myCpuOp"
    >>>         out = self.cpu(x)
    >>>         out = self.cpu(out)
    >>>         out = self.cpu(out)
    >>>         return out
    """

    def __init__(self, layer_to_call: Callable, ID: str):
        """
        Execute a given function on the CPU.

        :param layer_to_call: Python callable to execute on the CPU. The
            arguments passed when the CPU wrapper is called are forwarded to
            it. If it is a ``torch.nn.Module``, gradients are disabled on it.
        :param ID: Name of the CPU op.
        """
        if isinstance(layer_to_call, torch.nn.Module):
            layer_to_call.requires_grad_(False)

        self._layer_to_call = layer_to_call
        self._ID = ID

        # Shapes recorded by __call__ and consumed by registerPersistentData.
        self.in_shapes = None
        self.out_shapes = None

        # Persistent host buffers created by registerPersistentData.
        self.inputs = None
        self.outputs = None

    def execute(self):
        """Implementation detail."""
        results = self._layer_to_call(*self.inputs)

        if not isinstance(results, (list, tuple)):
            # A single result goes into the first persistent buffer.
            self.outputs[0].copy_(results)
            return

        for destination, produced in zip(self.outputs, results):
            destination.copy_(produced)

    def registerPersistentData(self):
        """Implementation detail."""
        self.inputs = [
            torch.zeros(shape, device='cpu') for shape in self.in_shapes
        ]
        self.outputs = [
            torch.zeros(shape, device='cpu') for shape in self.out_shapes
        ]

        poptorch_core.registerBuffersWithCallback(self._ID, self.inputs,
                                                  self.outputs)

    def __call__(self, *input, **kwargs):
        """Implementation detail."""
        # Mark all subsequent ops as happening on the host.
        torch.ops.poptorch.call_cpu_op(list(input), self._ID)

        if _impl.isRunningOnIpu():
            host_args = [
                torch.zeros_like(arg,
                                 device="cpu",
                                 requires_grad=arg.requires_grad)
                for arg in input
            ]
        else:
            host_args = input

        # Actually call the layer: this keeps the trace happy and gives us
        # the output shapes.
        host_results = self._layer_to_call(*host_args)

        # Remember whether the layer returned a bare tensor so the same
        # structure can be handed back to the caller.
        single_result = not isinstance(host_results, (list, tuple))
        if single_result:
            host_results = [host_results]

        # Record the shapes so that permanent buffers can be allocated later.
        self.in_shapes = [arg.shape for arg in input]
        self.out_shapes = [result.shape for result in host_results]

        if _impl.isRunningOnIpu():
            placeholders = [
                torch.zeros_like(result,
                                 device="ipu",
                                 requires_grad=result.requires_grad)
                for result in host_results
            ]
        else:
            placeholders = host_results

        # End CPU host execution and show the JIT what the output looks like.
        traced_outputs = torch.ops.poptorch.end_cpu_op(placeholders)

        # Register this callback with poptorch so it knows what to call.
        poptorch_core.registerCPUCallBack(self, self._ID)

        # Hand back a bare tensor if that is what the layer produced.
        return traced_outputs[0] if single_result else traced_outputs


def identity_loss(x: "torch.Tensor", reduction: "str") -> "torch.Tensor":
    """Marks a tensor as being part of the loss calculation and, as such,
    will back-propagate through it in the PopTorch autograd.

    Call this function on the (final) loss of a model so that it is used as
    the start of backpropagation. This is equivalent to calling
    ``x.backward()`` on a tensor ``x`` when running on the CPU.

    It is needed to combine multiple losses into a custom loss: it ensures
    that the tensor is part of the loss calculation and, as such, part of the
    backpropagation in PopTorch autograd.

    Multiple calls to ``identity_loss`` can be made inside the same model
    provided they are all dependent: all marked losses must be traceable
    into a single final tensor itself marked by a call to ``identity_loss``
    otherwise an error is raised.

    :param x: The calculated loss.
    :param reduction: Reduce the loss output as per PyTorch loss
        semantics. Supported values are:

        * ``"sum"``: Sum the losses.
        * ``"mean"``: Take the mean of the losses.
        * ``"none"``: Don't reduce the losses.

    :returns: The loss tensor with the specified reduction applied.
    """
    # Translate the reduction name into the integer code expected by the op.
    if reduction == "sum":
        reduction_code = 0
    elif reduction == "mean":
        reduction_code = 1
    else:
        assert reduction == "none", "Unsupported reduction type!"
        reduction_code = 2

    return torch.ops.poptorch.identity_loss(x, reduction_code)


def fps(src: "torch.Tensor",
        ptr: List[int],
        ratio: float = 0.5,
        random_start: bool = False) -> "torch.Tensor":
    """PopTorch implementation of the `torch_cluster` `fps` operator.

    This op is a sampling algorithm from the `"PointNet++: Deep Hierarchical Feature Learning on Point Sets in a Metric Space"
    <https://arxiv.org/abs/1706.02413>`_ paper, and iteratively samples the
    most distant point with regard to the rest points.

    The arguments are type-checked, in declaration order, before the op is
    invoked; the first failing check raises a PopTorch error.

    :param src: Point feature matrix.
    :param ptr: Pointer vector which defines ranges of nodes assigned to a
        specific sample. Must be a ``list`` with at least two elements.
    :param ratio: The sampling ratio. Must be a ``float``.
    :param random_start: If set to `False`, use the first node in `src` as
        the starting node. Must be a ``bool``.
    :returns: A tensor of `src` point indexes.
    """
    # Find the first argument which fails validation (if any).
    problem = None
    if not isinstance(src, torch.Tensor):
        problem = (f"`fps` must take a torch.tensor input. {type(src)} is "
                   "not supported.")
    elif not isinstance(ptr, list):
        problem = "`ptr` must be a list of integers."
    elif len(ptr) < 2:
        problem = "`ptr` must containt at least 2 elements."
    elif not isinstance(ratio, float):
        problem = (
            f"`ratio` must be of float type. {type(ratio)} is not supported.")
    elif not isinstance(random_start, bool):
        problem = (
            f"`random_start` must be of bool type. {type(random_start)} is "
            "not supported.")

    if problem is not None:
        raise _impl.createPoptorchError(problem)

    return torch.ops.poptorch.fps(src, ptr, ratio, random_start)


def nearest(x: "torch.Tensor",
            y: "torch.Tensor",
            batch_x: Optional[Union[List[int], "torch.Tensor"]] = None,
            batch_y: Optional[Union[List[int], "torch.Tensor"]] = None):
    """PopTorch implementation of the `torch_cluster` `nearest` operator.

    This op clusters points in `x` together which are nearest to a given query
    point in `y`.

    When both batch vectors are lists (or omitted) the list variant of the op
    is used. When exactly one of them is a list it is converted to a tensor
    with the dtype of the other one.

    :param x: Node feature matrix.
    :param y: Node feature matrix.
    :param batch_x: Batch vector, which assigns each node to a specific
        sample. `batch_x` needs to be sorted.
    :param batch_y: Batch vector, which assigns each node to a specific
        sample. `batch_y` needs to be sorted.
    """
    for label, value in (("x", x), ("y", y)):
        if not isinstance(value, torch.Tensor):
            raise _impl.createPoptorchError(
                f"`nearest` must take a torch.tensor `{label}` input. "
                f"{type(value)} is not supported.")

    # A missing batch vector is treated as an empty list.
    if batch_x is None:
        batch_x = []
    if batch_y is None:
        batch_y = []

    x_is_list = isinstance(batch_x, list)
    y_is_list = isinstance(batch_y, list)
    if x_is_list and y_is_list:
        return torch.ops.poptorch.nearest_batch_list(x, y, batch_x, batch_y)

    x_is_tensor = isinstance(batch_x, torch.Tensor)
    y_is_tensor = isinstance(batch_y, torch.Tensor)

    if x_is_list and y_is_tensor:
        # Promote the list to a tensor matching the other batch vector.
        batch_x = torch.tensor(batch_x, dtype=batch_y.dtype)
    elif x_is_tensor and y_is_list:
        batch_y = torch.tensor(batch_y, dtype=batch_x.dtype)
    elif not (x_is_tensor and y_is_tensor):
        raise _impl.createPoptorchError(
            f"`batch_x` and `batch_y` must be torch.Tensors or lists while "
            f"`batch_x` is of type {type(batch_x)} and `batch_y` is of type "
            f"{type(batch_y)}.")

    return torch.ops.poptorch.nearest(x, y, batch_x, batch_y)


class MultiConv():
    """
    Combines all convolution layers evaluated inside this scope into a single
    multi-convolution.

    Multi-convolutions allow for a set of data-independent convolutions to be
    executed in parallel. Executing convolutions in parallel can lead to an
    increase in the data throughput.

    For example:

    >>> with poptorch.MultiConv():
    ...     y = self.convA(x)
    ...     v = self.convB(u)

    Combines the two data-independent convolutions into a single
    multi-convolution.

    The option setters return ``self`` so they can be chained, and the
    configured options are passed to the backend when the scope is exited.

    Refer to the PopLibs documentation for further information on
    multi-convolutions.
    """

    def __init__(self):
        # Every option defaults to None (not set) and is forwarded as is to
        # the end_multi_conv op by __exit__.
        self._available_memory_proportions = None
        self._partials_types = None
        self._plan_type = None
        self._per_conv_reserved_tiles = None
        self._cycle_back_off = None
        self._enable_conv_ditherings = None

    @staticmethod
    def _validatePerConvProperty(name, value, expected_scalar_type):
        """Normalise a per-convolution option.

        :param name: Human readable name of the option, used in the error
            message.
        :param value: ``None``, a scalar of type ``expected_scalar_type`` or
            a non-empty list/tuple of such scalars.
        :param expected_scalar_type: Type every scalar must be an instance of.
        :returns: ``None`` if ``value`` is ``None``, a 1-tuple if ``value`` is
            a scalar, otherwise ``value`` unchanged.
        :raises AssertionError: If ``value`` is none of the above.
        """
        if value is None:
            return value

        if isinstance(value, expected_scalar_type):
            # Wrap as tuple
            return (value, )

        if isinstance(value, (list, tuple)) and len(value) > 0 and all(
                isinstance(x, expected_scalar_type) for x in value):
            return value

        raise AssertionError(f"Invalid {name}!")

    def availableMemoryProportions(self, value: Union[float, List[float]]
                                   ) -> "poptorch.MultiConv":
        """The available memory proportion per convolution, each [0, 1).

        For more information, please refer to the `technical note
        <https://docs.graphcore.ai/projects/available-memory/en/latest/>`_ on
        optimising temporary memory usage.

        :param value: Can be a ``float`` value in which case the same value is
            used for all of the convolutions. Otherwise, can be a ``tuple`` or
            ``list`` containing as many ``float`` values as the number of
            convolutions.
        :returns: ``self``, to support method chaining.
        """
        name = "available memory proportion"
        value = self._validatePerConvProperty(name, value, float)
        self._available_memory_proportions = value
        return self

    def partialsTypes(self, value: Union[torch.dtype, List[torch.dtype]]
                      ) -> "poptorch.MultiConv":
        """The partials type used for each convolution.

        :param value: Can be a single instance of ``torch.dtype`` in which case
            the same value is used for all of the convolutions. Otherwise, can
            be a ``tuple`` or ``list`` containing as many ``torch.dtype``
            values as the number of convolutions.
        :returns: ``self``, to support method chaining.
        :raises ValueError: If a dtype is neither ``torch.float`` nor
            ``torch.half``.
        """

        def encode_dtype(dtype):
            # The op takes integer codes: 0 for float32, 1 for float16.
            if dtype in [torch.float, torch.float32]:
                return 0
            if dtype in [torch.half, torch.float16]:
                return 1
            raise ValueError(
                'Invalid partials types. Expecting torch.float or torch.half')

        if isinstance(value, (list, tuple)):
            value = [encode_dtype(v) for v in value]
        else:
            value = (encode_dtype(value), )

        self._partials_types = value
        return self

    def enableConvDithering(self, value: Union[bool, List[bool]]
                            ) -> "poptorch.MultiConv":
        """Enable per-convolution dithering.

        :param value: Can be a ``bool`` value in which case the same value is
            used for all of the convolutions. Otherwise, can be a ``tuple`` or
            ``list`` containing as many ``bool`` values as the number of
            convolutions.
        :returns: ``self``, to support method chaining.
        :raises ValueError: If ``value`` is not ``None``, a ``bool`` or a
            list/tuple of ``bool`` values.
        """

        if value is None:
            self._enable_conv_ditherings = value
        elif isinstance(value, (list, tuple)):
            for x in value:
                if not isinstance(x, bool):
                    raise ValueError("value must be bool or list of bools")
            self._enable_conv_ditherings = value
        elif isinstance(value, bool):
            self._enable_conv_ditherings = (value, )
        else:
            raise ValueError("value must be bool or list of bools")
        return self

    def planType(self,
                 value: "poptorch.MultiConvPlanType") -> "poptorch.MultiConv":
        """Select the multi-convolution execution strategy.

        :param value: An instance of :py:class:`~poptorch.MultiConvPlanType`.

        :returns: ``self``, to support method chaining.
        :raises AssertionError: If ``value`` is neither ``None`` nor a
            ``MultiConvPlanType``.
        """
        if value is None:
            self._plan_type = value
        elif isinstance(value, enums.MultiConvPlanType):
            self._plan_type = value
        else:
            raise AssertionError("Invalid plan type!")

        return self

    def perConvReservedTiles(self, value: int) -> "poptorch.MultiConv":
        """Tiles to reserve for each convolution.

        :param value: Number of tiles.
        :returns: ``self``, to support method chaining.
        """
        assert isinstance(value, int)
        self._per_conv_reserved_tiles = value
        return self

    def cycleBackOff(self, value: float) -> "poptorch.MultiConv":
        """Cycle back off proportion.

        :param value: Number between 0 and 1.
        :returns: ``self``, to support method chaining.
        """
        assert isinstance(value, float)
        self._cycle_back_off = value
        return self

    def __enter__(self):
        torch.ops.poptorch.begin_multi_conv()
        # Return the scope so that ``with poptorch.MultiConv() as mc:`` binds
        # the MultiConv object rather than None.
        return self

    def __exit__(self, type, value, traceback):
        # Convert enums to ints if set
        plan_type = self._plan_type
        if plan_type is not None:
            plan_type = plan_type.value

        torch.ops.poptorch.end_multi_conv(self._available_memory_proportions,
                                          self._partials_types, plan_type,
                                          self._per_conv_reserved_tiles,
                                          self._cycle_back_off,
                                          self._enable_conv_ditherings)


class NameScope:
    """ Create a name scope for a code block. All operators originating
        from this block will have their names prefixed by the given string.

        >>> with poptorch.NameScope("CustomString"):
        ...     y = self.bmm(a, b)
        ...     z = torch.relu(y)
    """

    def __init__(self, name: str):
        """
        :param name: Name of the scope. Must be a string.
        """
        assert isinstance(name, str), 'Parameter to NameScope must be a string'
        self.name = name

    def __enter__(self):
        torch.ops.poptorch.push_name_scope(self.name)
        # Return the scope so that ``with poptorch.NameScope(...) as scope:``
        # binds the NameScope object rather than None.
        return self

    def __exit__(self, type, value, traceback):
        # The scope pushed by __enter__ is popped on both normal and
        # exceptional exit; exceptions are not suppressed.
        torch.ops.poptorch.pop_name_scope()


================================================
FILE: python/optim.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import math
import inspect
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type
import torch

from ._logging import logger


class VariableAttributes:
    """Track which attributes are variable or constant.

    Is accessible via any PopTorch optimizer via the ``variable_attrs``
    attribute.

    >>> opt = poptorch.optim.SGD(params, lr=0.01)
    >>> opt.variable_attrs.isConstant("lr")
    """

    def __init__(self, variable_attributes: List[str],
                 allowed_attributes: List[str]) -> None:
        """
        :param variable_attributes: list of variable attributes.
        :param allowed_attributes: list of all the attributes.
        """
        self._variable_attributes = variable_attributes
        self._allowed_attributes = allowed_attributes

    def isConstant(self, attr: str) -> bool:
        """Return True if the attribute is marked as constant"""
        return attr not in self._variable_attributes

    def markAsConstant(self, attr: str) -> None:
        """Explicitly mark an attribute as constant

        :param attr: Name of the attribute; must be one of the allowed
            attributes.
        """
        assert attr in self._allowed_attributes, (
            f"Unknown attribute {attr},"
            f" allowed values: {self._allowed_attributes}")
        self._variable_attributes = [
            a for a in self._variable_attributes if a != attr
        ]

    def markAsVariable(self, attr: str) -> None:
        """Explicitly mark an attribute as variable

        Marking an attribute which is already variable has no effect.

        :param attr: Name of the attribute; must be one of the allowed
            attributes.
        """
        assert attr in self._allowed_attributes, (
            f"Unknown attribute {attr},"
            f" allowed values: {self._allowed_attributes}")
        # Avoid growing the list with duplicates on repeated calls.
        if attr not in self._variable_attributes:
            self._variable_attributes.append(attr)


def _parseArgs(all_args: Dict[str, Any],
               child_attrs: Optional[List[str]] = None
               ) -> Tuple[Dict[str, Any], List[str]]:
    """Split the arguments of an optimizer ``__init__`` (as given by
    ``locals()``).

    :param all_args: Mapping of argument names to values; must contain a
        ``"self"`` entry, which is ignored. The mapping is not modified.
    :param child_attrs: Names of the arguments which must not be forwarded
        to the parent optimizer class.
    :returns: A tuple ``(parent_args, not_const)`` where ``not_const`` lists
        the names of all the arguments whose value is not ``None`` and
        ``parent_args`` maps those names, minus the ones in ``child_attrs``,
        to their values.
    """
    excluded = child_attrs or []

    # Work on a copy and drop the special "self" entry of locals().
    remaining = dict(all_args)
    remaining.pop("self")

    # Attributes explicitly set by the user are considered variable.
    explicitly_set = [
        name for name, value in remaining.items() if value is not None
    ]

    # Only the attributes known to the parent class are forwarded to it.
    forwarded = {
        name: remaining[name]
        for name in explicitly_set if name not in excluded
    }
    return forwarded, explicitly_set


class Optimizer:
    """Mixin holding the IPU optimizer state next to the state of the
    ``torch.optim.Optimizer`` it is combined with."""

    def __init__(self):
        # Set to True when the state needs to be uploaded to the IPU.
        self.ipu_state_is_dirty = False
        # Once the optimizer has been used on the IPU its state
        # on the host will become dirty.
        self.host_state_is_dirty = False
        self._state_dict = dict(ipu_state=None, ipu_param=None)

    # These functions must be overridden so that the optimiser state can be set
    # when the model is created
    def state_dict(self):
        """Return the combined IPU and torch state dictionary."""
        return self.get_state_dict()

    def load_state_dict(self, state):
        """Load ``state`` into both the torch optimizer and the IPU state."""
        # Torch's state dict must be loaded too so that LR schedulers work.
        torch.optim.Optimizer.load_state_dict(self, state)
        self.set_state_dict(state)

    # Getter/setter for local state dict after the above functions been overridden by PoplarExecutor
    def get_state_dict(self):
        """Return the internal state merged with torch's state dictionary
        (entries from torch take precedence on key clashes)."""
        # Torch's state dict is included so that LR schedulers work.
        combined = dict(self._state_dict)
        combined.update(torch.optim.Optimizer.state_dict(self))
        return combined

    def set_state_dict(self, state):
        """Replace the internal state and flag it for upload to the IPU.

        :raises RuntimeError: If ``state`` is empty or does not contain both
            the ``"ipu_state"`` and ``"ipu_param"`` keys.
        """
        if not state:
            raise RuntimeError(
                "Cannot load optimizer state dictionary because it is empty.")
        if "ipu_state" not in state or "ipu_param" not in state:
            raise RuntimeError(
                "Only IPU optimizer states can be loaded onto the IPU.")

        self._state_dict = state
        self.ipu_state_is_dirty = True
        self.host_state_is_dirty = False

    def has_state(self):
        """Return True if both the IPU state and IPU parameters are set."""
        return all(
            self._state_dict.get(key) is not None
            for key in ("ipu_state", "ipu_param"))


class SGD(Optimizer, torch.optim.SGD):
    # pylint: disable=line-too-long
    """ Stochastic gradient descent with optional momentum.

    The optimizer is based on PyTorch's implementation
    (`torch.optim.SGD <https://pytorch.org/docs/1.10.0/optim.html#torch.optim.SGD>`_)
    with optional loss and velocity scaling.

    PopTorch provides two possible variants. Both variants are mathematically
    identical to PyTorch but differ in their stability and efficiency.

    .. note:: If you set momentum to zero and do not use gradient accumulation,
      PopTorch will use a simple SGD variant and ignore the values of
      ``use_combined_accum``, ``accum_type`` and ``velocity_accum_type``.

    **Separate tensor variant (default)**

    If you set ``use_combined_accum`` to ``False`` (default), you will use a
    more stable but more memory intensive variant. In this case, PopTorch keeps
    two state tensors for each weight: one for gradient accumulation and one for
    velocity. It operates as follows when training:

    #. PopTorch runs one or more forward/backwards steps, equal to the number
       of gradient accumulations (see
       :py:func:`~poptorch.options._TrainingOptions.gradientAccumulation`).
       Each time PopTorch sums the gradients, storing them in accumulators.
    #. Once all the forward and backwards have completed, PopTorch uses the
       summed gradients to update the velocities. At this stage, PopTorch will
       correct the scale based on the setting of
       :py:func:`~poptorch.options._TrainingOptions.accumulationAndReplicationReductionType`.
       PopTorch stores the velocities as optimiser states.
    #. Finally, PopTorch uses the velocities to update the parameters, taking
       into account the loss scaling and learning rate.

    With ``use_combined_accum`` set to ``False``, you can independently change
    the data type used for storing the accumulated gradients and the velocity
    values using ``accum_type`` and ``velocity_accum_type``, respectively.

    Velocity scaling is ignored for this variant.

    .. note:: If the number of gradient accumulations is high, you can use off
        chip memory for the velocity tensors with a minimal performance hit.

        >>> opts.TensorLocations.setOptimizerLocation(
        ...     poptorch.TensorLocationSettings().useOnChipStorage(False))

    **Combined tensor variant**

    If you set ``use_combined_accum`` to ``True``, you will use a less stable
    but more memory efficient variant. In this case PopTorch uses a single
    tensor (the combined tensor) for gradient accumulation and velocity.
    It operates as follows when training:

    #. PopTorch runs one or more forward/backwards steps equal to the number
       of gradient accumulations (see
       :py:func:`~poptorch.options._TrainingOptions.gradientAccumulation`).
       For each step, PopTorch immediately calculates an increment or decrement
       for the combined tensors for each parameter. The amount of increment or
       decrement takes into account the setting of
       :py:func:`~poptorch.options._TrainingOptions.accumulationAndReplicationReductionType`
       as well as removing loss scaling and introducing any velocity scaling.
    #. After running all the steps, the combined tensor will be equal to the
       new velocities. PopTorch uses these to update the parameters taking
       into account the velocity scaling and learning rate.

    PopTorch ignores the ``accum_type`` and ``velocity_accum_type`` values when
    using a combined tensor. In addition, there are no optimizer state tensors
    and so ``opts.TensorLocations.setOptimizerLocation`` has no effect.

    .. warning:: For both variants, reducing the velocity scaling during
        training will result in temporary over-estimation of the velocity and
        could cause model instability. Increasing the scaling may temporarily
        slow model convergence but not lead to instability.
    """
    # Variables which don't exist in the parent optimizer class and are
    # global (Cannot be set per group).
    _child_vars = ["loss_scaling"]
    # All the attributes and variables which don't exist in the parent optimizer class.
    # These are filtered out of the arguments forwarded to torch.optim.SGD.
    _child_only = _child_vars + [
        "velocity_scaling", "use_combined_accum", "accum_type",
        "velocity_accum_type", "max_grad_norm"
    ]
    # Attributes (from the parent or child class) which can be set per group.
    _group_vars = [
        "lr", "momentum", "dampening", "weight_decay", "nesterov",
        "velocity_scaling"
    ]

    def __init__(self,
                 params: Iterable,
                 lr: float,
                 momentum: Optional[float] = None,
                 dampening: Optional[float] = None,
                 weight_decay: Optional[float] = None,
                 nesterov: Optional[bool] = None,
                 maximize: Optional[bool] = None,
                 foreach: Optional[bool] = None,
                 differentiable: Optional[bool] = None,
                 loss_scaling: Optional[float] = None,
                 velocity_scaling: Optional[float] = None,
                 use_combined_accum: Optional[bool] = None,
                 accum_type: Optional[torch.dtype] = None,
                 velocity_accum_type: Optional[torch.dtype] = None,
                 max_grad_norm: Optional[float] = None) -> None:
        """
        Arguments left to ``None`` are not forwarded to ``torch.optim.SGD``
        (so its own defaults apply) and are not recorded as variable
        attributes.

        :param iterable params: parameters to optimize.
        :param lr: learning rate.
        :param momentum: momentum factor.
        :param dampening: dampening term for momentum.
        :param weight_decay: Weight decay (L2 penalty) factor.
        :param nesterov: Whether to enable Nesterov momentum. Default is
            `False`.
        :param maximize: Forwarded unchanged to ``torch.optim.SGD`` when not
            ``None`` (see the PyTorch documentation).
        :param foreach: Forwarded unchanged to ``torch.optim.SGD`` when not
            ``None`` (see the PyTorch documentation).
        :param differentiable: Forwarded unchanged to ``torch.optim.SGD`` when
            not ``None`` (see the PyTorch documentation).
        :param loss_scaling: Factor by which to scale the loss and hence
            gradients to assist numerical stability when using float16.
            Default is `1.0`.
        :param velocity_scaling: Factor by which to scale the velocity values
            to assist numerical stability when using float16. (This applies to
            the combined variant only.)
        :param use_combined_accum: Whether to use a combined accumulator.
            Default is `False`.
        :param accum_type: data type used for gradients. Must be
            ``torch.float16`` or ``torch.float32`` (default).
        :param velocity_accum_type: data type used to store
            the velocity values for each parameter. Must be ``torch.float16``
            or ``torch.float32`` (default).
        :param max_grad_norm: Maximum norm of gradients. Default is `inf`.
        """
        # pylint: disable=unused-argument
        # Call to locals() must be at the very top of  __init__
        # (any local variable bound before this line would be captured by
        # locals() and treated as a constructor argument by _parseArgs).
        parent_args, variables = _parseArgs(locals(), SGD._child_only)
        Optimizer.__init__(self)
        torch.optim.SGD.__init__(self, **parent_args)

        # Loss scaling is a global setting: store it as an attribute
        if loss_scaling is None:
            loss_scaling = 1.0

        if use_combined_accum is None:
            use_combined_accum = False
        self.use_combined_accum = use_combined_accum

        if accum_type is None:
            accum_type = torch.float32
        if velocity_accum_type is None:
            velocity_accum_type = torch.float32

        self.loss_scaling = loss_scaling

        # Velocity scaling can be set per group: register it in defaults
        # and update the existing groups.
        if velocity_scaling is None:
            velocity_scaling = 1.0
            # NB this will be overridden to loss_scaling in the case of the
            # separate tensor variant.
        else:
            # The user explicitly provided a value which cannot be honoured
            # by the separate tensor variant: warn about it.
            if not use_combined_accum:
                logger.warning("velocity_scaling value ignored when "
                               "using the separate variant "
                               "(use_combined_accum=False). In future, this "
                               "will lead to an error. Please update your "
                               "code.")

        # velocity_scaling is only registered (in the defaults and in every
        # existing group) for the combined variant.
        if use_combined_accum:
            self.defaults["velocity_scaling"] = velocity_scaling
            for group in self.param_groups:
                group.setdefault("velocity_scaling", velocity_scaling)

        # NOTE(review): this local is not read again in this method.
        if nesterov is None:
            nesterov = False

        supportedTypes = [torch.float16, torch.float32]
        errString = ("Accumulation types must be either torch.float32"
                     " or torch.float16")
        assert accum_type in supportedTypes, errString
        self.accum_type = accum_type

        assert velocity_accum_type in supportedTypes, errString
        self.velocity_accum_type = velocity_accum_type
        if max_grad_norm is None:
            max_grad_norm = float("Inf")
        self.max_grad_norm = max_grad_norm

        # The attributes explicitly set by the user are variable; the allowed
        # attributes are the parent's defaults plus the global child
        # variables.
        self.variable_attrs = VariableAttributes(
            variables,
            list(self.defaults) + SGD._child_vars)

    def __getstate__(self) -> Dict[str, Any]:
        """Return the state of the parent ``torch.optim.SGD`` extended with
        the PopTorch specific attributes of this optimizer."""
        state = torch.optim.SGD.__getstate__(self)
        # Manually save the attributes
        # (groups / defaults are saved by the parent)
        state["variable_attrs"] = self.variable_attrs
        state["loss_scaling"] = self.loss_scaling
        state["use_combined_accum"] = self.use_combined_accum
        state["accum_type"] = self.accum_type
        state["velocity_accum_type"] = self.velocity_accum_type
        state["max_grad_norm"] = self.max_grad_norm

        # Mark the state as dirty only if there is one.
        state["_state_dict"] = self._state_dict
        state["ipu_state_is_dirty"] = self.has_state()
        state["host_state_is_dirty"] = False
        return state


class Adam(Optimizer, torch.optim.Adam):
    """ Adam optimizer.

    This optimizer matches PyTorch's implementation
    (`torch.optim.Adam <https://pytorch.org/docs/1.10.0/optim.html#torch.optim.Adam>`_) with
    optional loss scaling.

    AMSGrad is currently not supported."""

    # Names introduced by this class (absent from the parent optimizer)
    # which are global, i.e. which cannot be set per parameter group.
    _child_vars = ["loss_scaling"]
    # Every argument / attribute introduced by this class. The order must
    # match the trailing arguments of ``__init__``: this is verified by
    # ``_check_constructor_match_parent`` when the module is imported.
    _child_only = [
        *_child_vars,
        "accum_type",
        "first_order_momentum_accum_type",
        "second_order_momentum_accum_type",
        "max_grad_norm",
    ]
    # Attributes (from the parent or child class) which can be set per group.
    _group_vars = ["lr", "betas", "eps", "weight_decay", "amsgrad"]

    def __init__(
            self,
            params: Iterable,
            lr: Optional[float] = None,
            betas: Optional[Tuple[float, float]] = None,
            eps: Optional[float] = None,
            weight_decay: Optional[float] = None,
            amsgrad: Optional[bool] = None,
            foreach: Optional[bool] = None,
            maximize: Optional[bool] = None,
            capturable: Optional[bool] = None,
            differentiable: Optional[bool] = None,
            fused: Optional[bool] = None,
            loss_scaling: Optional[float] = None,
            accum_type: Optional[torch.dtype] = None,
            first_order_momentum_accum_type: Optional[torch.dtype] = None,
            second_order_momentum_accum_type: Optional[torch.dtype] = None,
            max_grad_norm: Optional[float] = None) -> None:
        """
        :param iterable params: parameters to optimize.
        :param lr: learning rate
        :param betas: ``(beta1, beta2)`` parameters used in Adam.
        :param eps: term added to the denominator to ensure numerical stability.
        :param weight_decay: Weight decay factor.
        :param amsgrad: Not supported (must be False).
        :param foreach: present to mirror the ``torch.optim.Adam``
            signature; not read directly by this constructor.
        :param maximize: present to mirror the ``torch.optim.Adam``
            signature; not read directly by this constructor.
        :param capturable: present to mirror the ``torch.optim.Adam``
            signature; not read directly by this constructor.
        :param differentiable: present to mirror the ``torch.optim.Adam``
            signature; not read directly by this constructor.
        :param fused: present to mirror the ``torch.optim.Adam``
            signature; not read directly by this constructor.
        :param loss_scaling: Factor by which to scale the loss and hence
            gradients to assist numerical stability when using float16.
            Default is 1.0.
        :param accum_type: data type used for gradients
            (``torch.float16`` or ``torch.float32``, default
            ``torch.float32``).
        :param first_order_momentum_accum_type: data type used to store
            the first order momentum values for each parameter
            (``torch.float16`` or ``torch.float32``, default
            ``torch.float32``).
        :param second_order_momentum_accum_type: data type used to store
            the second order momentum values for each parameter
            (``torch.float16`` or ``torch.float32``, default
            ``torch.float32``).
        :param max_grad_norm: Maximum norm of gradients. Default is `inf`.
        """
        # pylint: disable=unused-argument
        # Call to locals() must be at the very top of  __init__
        parent_args, variables = _parseArgs(locals(), Adam._child_only)
        Optimizer.__init__(self)
        torch.optim.Adam.__init__(self, **parent_args)

        # All the child attributes are global: they are stored as plain
        # attributes of the optimizer, using a default when left unset.
        self.loss_scaling = 1.0 if loss_scaling is None else loss_scaling

        valid_dtypes = [torch.float16, torch.float32]
        bad_dtype_msg = ("Accumulation types must be either torch.float32"
                         " or torch.float16")
        requested_dtypes = (
            ("accum_type", accum_type),
            ("first_order_momentum_accum_type",
             first_order_momentum_accum_type),
            ("second_order_momentum_accum_type",
             second_order_momentum_accum_type),
        )
        # Validate then store each accumulation type, one after the other.
        for attr_name, dtype in requested_dtypes:
            if dtype is None:
                dtype = torch.float32
            assert dtype in valid_dtypes, bad_dtype_msg
            setattr(self, attr_name, dtype)

        self.max_grad_norm = (float("Inf")
                              if max_grad_norm is None else max_grad_norm)

        tracked_names = list(self.defaults) + Adam._child_vars
        self.variable_attrs = VariableAttributes(variables, tracked_names)

    def __getstate__(self) -> Dict[str, Any]:
        """Return the dictionary describing this optimizer when it is
        pickled or copied: the parent's state plus the attributes which
        only exist on this class and the dirty-state bookkeeping."""
        pickled = torch.optim.Adam.__getstate__(self)

        # The parent takes care of the groups / defaults; the attributes
        # below are specific to this class and must be copied by hand.
        for attr_name in ("variable_attrs", "loss_scaling", "accum_type",
                          "first_order_momentum_accum_type",
                          "second_order_momentum_accum_type",
                          "max_grad_norm"):
            pickled[attr_name] = getattr(self, attr_name)

        # The IPU side is flagged as dirty only when has_state() reports
        # that there is a state; the host side is always flagged as clean.
        pickled.update(_state_dict=self._state_dict,
                       ipu_state_is_dirty=self.has_state(),
                       host_state_is_dirty=False)
        return pickled


class AdamW(Optimizer, torch.optim.AdamW):
    """ Adam optimizer with true weight decay.

    This optimizer matches PyTorch's implementation
    (`torch.optim.AdamW <https://pytorch.org/docs/1.10.0/optim.html#torch.optim.AdamW>`_)
    with optional loss scaling.

    AMSGrad is currently not supported."""

    # Names introduced by this class (absent from the parent optimizer)
    # which are global, i.e. which cannot be set per parameter group.
    _child_vars = ["loss_scaling"]
    # Every argument / attribute introduced by this class. The order must
    # match the trailing arguments of ``__init__``: this is verified by
    # ``_check_constructor_match_parent`` when the module is imported.
    _child_only = [
        *_child_vars,
        "bias_correction",
        "accum_type",
        "first_order_momentum_accum_type",
        "second_order_momentum_accum_type",
        "max_grad_norm",
    ]
    # Attributes (from the parent or child class) which can be set per group.
    _group_vars = ["lr", "betas", "weight_decay", "eps", "amsgrad"]

    def __init__(
            self,
            params: Iterable,
            lr: Optional[float] = None,
            betas: Optional[Tuple[float, float]] = None,
            eps: Optional[float] = None,
            weight_decay: Optional[float] = None,
            amsgrad: Optional[bool] = None,
            maximize: Optional[bool] = None,
            foreach: Optional[bool] = None,
            capturable: Optional[bool] = None,
            differentiable: Optional[bool] = None,
            fused: Optional[bool] = None,
            loss_scaling: Optional[float] = None,
            bias_correction: Optional[bool] = None,
            accum_type: Optional[torch.dtype] = None,
            first_order_momentum_accum_type: Optional[torch.dtype] = None,
            second_order_momentum_accum_type: Optional[torch.dtype] = None,
            max_grad_norm: Optional[float] = None) -> None:
        """
        :param iterable params: parameters to optimize.
        :param lr: learning rate
        :param betas: ``(beta1, beta2)`` parameters used in AdamW.
        :param eps: term added to the denominator to ensure numerical stability.
        :param weight_decay: Weight decay factor.
        :param amsgrad: Not supported (must be False).
        :param maximize: present to mirror the ``torch.optim.AdamW``
            signature; not read directly by this constructor.
        :param foreach: present to mirror the ``torch.optim.AdamW``
            signature; not read directly by this constructor.
        :param capturable: present to mirror the ``torch.optim.AdamW``
            signature; not read directly by this constructor.
        :param differentiable: present to mirror the ``torch.optim.AdamW``
            signature; not read directly by this constructor.
        :param fused: present to mirror the ``torch.optim.AdamW``
            signature; not read directly by this constructor.
        :param loss_scaling: Factor by which to scale the loss and hence
            gradients to assist numerical stability when using float16.
            Default is 1.0.
        :param bias_correction: True: compute Adam with bias correction
            (default).
        :param accum_type: data type used for gradients
            (``torch.float16`` or ``torch.float32``, default
            ``torch.float32``).
        :param first_order_momentum_accum_type: data type used to store
            the first order momentum values for each parameter
            (``torch.float16`` or ``torch.float32``, default
            ``torch.float32``).
        :param second_order_momentum_accum_type: data type used to store
            the second order momentum values for each parameter
            (``torch.float16`` or ``torch.float32``, default
            ``torch.float32``).
        :param max_grad_norm: Maximum norm of gradients. Default is `inf`.
        """
        # pylint: disable=unused-argument
        # Call to locals() must be at the very top of  __init__
        parent_args, variables = _parseArgs(locals(), AdamW._child_only)
        Optimizer.__init__(self)
        torch.optim.AdamW.__init__(self, **parent_args)

        # The child attributes are stored as plain attributes of the
        # optimizer, using a default when left unset by the caller.
        self.loss_scaling = 1.0 if loss_scaling is None else loss_scaling
        self.bias_correction = (True
                                if bias_correction is None else bias_correction)

        valid_dtypes = [torch.float16, torch.float32]
        bad_dtype_msg = ("Accumulation types must be either torch.float32"
                         " or torch.float16")
        requested_dtypes = (
            ("accum_type", accum_type),
            ("first_order_momentum_accum_type",
             first_order_momentum_accum_type),
            ("second_order_momentum_accum_type",
             second_order_momentum_accum_type),
        )
        # Validate then store each accumulation type, one after the other.
        for attr_name, dtype in requested_dtypes:
            if dtype is None:
                dtype = torch.float32
            assert dtype in valid_dtypes, bad_dtype_msg
            setattr(self, attr_name, dtype)

        self.max_grad_norm = (float("Inf")
                              if max_grad_norm is None else max_grad_norm)

        tracked_names = list(self.defaults) + AdamW._child_vars
        self.variable_attrs = VariableAttributes(variables, tracked_names)

    def __getstate__(self) -> Dict[str, Any]:
        """Return the dictionary describing this optimizer when it is
        pickled or copied: the parent's state plus the attributes which
        only exist on this class and the dirty-state bookkeeping."""
        pickled = torch.optim.AdamW.__getstate__(self)

        # The parent takes care of the groups / defaults; the attributes
        # below are specific to this class and must be copied by hand.
        for attr_name in ("variable_attrs", "loss_scaling",
                          "bias_correction", "accum_type",
                          "first_order_momentum_accum_type",
                          "second_order_momentum_accum_type",
                          "max_grad_norm"):
            pickled[attr_name] = getattr(self, attr_name)

        # The IPU side is flagged as dirty only when has_state() reports
        # that there is a state; the host side is always flagged as clean.
        pickled.update(_state_dict=self._state_dict,
                       ipu_state_is_dirty=self.has_state(),
                       host_state_is_dirty=False)
        return pickled


class RMSprop(Optimizer, torch.optim.RMSprop):
    """ RMSprop optimizer with optional L2 penalty.

    This optimizer matches PyTorch's implementation (
    `torch.optim.RMSprop <https://pytorch.org/docs/1.10.0/optim.html#torch.optim.RMSprop>`_)
    with optional loss scaling.

    However, if the use_tf_variant flag is set to True, it will instead match
    the TensorFlow implementation which differs from PyTorch's implementation
    in three ways:
    1) The average squared gradients buffer is initialized to ones.
    2) The small epsilon constant is applied inside the square root.
    3) Learning rate is accumulated in the momentum buffer if momentum is used."""

    # Names introduced by this class (absent from the parent optimizer)
    # which are global, i.e. which cannot be set per parameter group.
    _child_vars = ["loss_scaling"]
    # Every argument / attribute introduced by this class. The order must
    # match the trailing arguments of ``__init__``: this is verified by
    # ``_check_constructor_match_parent`` when the module is imported.
    _child_only = [
        *_child_vars,
        "accum_type",
        "first_order_momentum_accum_type",
        "second_order_momentum_accum_type",
        "use_tf_variant",
    ]
    # Attributes (from the parent or child class) which can be set per group.
    _group_vars = [
        "lr", "momentum", "weight_decay", "alpha", "eps", "centered"
    ]

    def __init__(
            self,
            params: Iterable,
            lr: Optional[float] = None,
            alpha: Optional[float] = None,
            eps: Optional[float] = None,
            weight_decay: Optional[float] = None,
            momentum: Optional[float] = None,
            centered: Optional[bool] = None,
            foreach: Optional[bool] = None,
            maximize: Optional[bool] = None,
            differentiable: Optional[bool] = None,
            loss_scaling: Optional[float] = None,
            accum_type: Optional[torch.dtype] = None,
            first_order_momentum_accum_type: Optional[torch.dtype] = None,
            second_order_momentum_accum_type: Optional[torch.dtype] = None,
            use_tf_variant: Optional[bool] = None) -> None:
        """
        :param iterable params: parameters to optimize.
        :param lr: learning rate.
        :param alpha: smoothing constant.
        :param eps: term added to the denominator to ensure numerical
           stability.
        :param weight_decay: L2 penalty coefficient.
        :param momentum: momentum factor.
        :param centered: True: compute centred RMSprop in which the
            gradient is normalized by an estimate of its variance.
        :param foreach: present to mirror the ``torch.optim.RMSprop``
            signature; not read directly by this constructor.
        :param maximize: present to mirror the ``torch.optim.RMSprop``
            signature; not read directly by this constructor.
        :param differentiable: present to mirror the
            ``torch.optim.RMSprop`` signature; not read directly by this
            constructor.
        :param loss_scaling: Factor by which to scale the loss and hence
            gradients to assist numerical stability when using float16.
            Default is 1.0.
        :param accum_type: data type used for gradients
            (``torch.float16`` or ``torch.float32``, default
            ``torch.float32``).
        :param first_order_momentum_accum_type: data type used to store
            the first order momentum values for each parameter
            (``torch.float16`` or ``torch.float32``, default
            ``torch.float32``).
        :param second_order_momentum_accum_type: data type used to store
            the second order momentum values for each parameter
            (``torch.float16`` or ``torch.float32``, default
            ``torch.float32``).
        :param use_tf_variant: False: If True, use the TensorFlow variant
            of RMSProp.
        """
        # pylint: disable=unused-argument
        # Call to locals() must be at the very top of  __init__
        parent_args, variables = _parseArgs(locals(), RMSprop._child_only)
        Optimizer.__init__(self)
        torch.optim.RMSprop.__init__(self, **parent_args)

        # The child attributes are stored as plain attributes of the
        # optimizer, using a default when left unset by the caller.
        self.loss_scaling = 1.0 if loss_scaling is None else loss_scaling

        valid_dtypes = [torch.float16, torch.float32]
        bad_dtype_msg = ("Accumulation types must be either torch.float32"
                         " or torch.float16")
        requested_dtypes = (
            ("accum_type", accum_type),
            ("first_order_momentum_accum_type",
             first_order_momentum_accum_type),
            ("second_order_momentum_accum_type",
             second_order_momentum_accum_type),
        )
        # Validate then store each accumulation type, one after the other.
        for attr_name, dtype in requested_dtypes:
            if dtype is None:
                dtype = torch.float32
            assert dtype in valid_dtypes, bad_dtype_msg
            setattr(self, attr_name, dtype)

        self.use_tf_variant = (False
                               if use_tf_variant is None else use_tf_variant)

        tracked_names = list(self.defaults) + RMSprop._child_vars
        self.variable_attrs = VariableAttributes(variables, tracked_names)

    def __getstate__(self) -> Dict[str, Any]:
        """Return the dictionary describing this optimizer when it is
        pickled or copied: the parent's state plus the attributes which
        only exist on this class and the dirty-state bookkeeping."""
        pickled = torch.optim.RMSprop.__getstate__(self)

        # The parent takes care of the groups / defaults; the attributes
        # below are specific to this class and must be copied by hand.
        for attr_name in ("variable_attrs", "loss_scaling", "accum_type",
                          "first_order_momentum_accum_type",
                          "second_order_momentum_accum_type",
                          "use_tf_variant"):
            pickled[attr_name] = getattr(self, attr_name)

        # The IPU side is flagged as dirty only when has_state() reports
        # that there is a state; the host side is always flagged as clean.
        pickled.update(_state_dict=self._state_dict,
                       ipu_state_is_dirty=self.has_state(),
                       host_state_is_dirty=False)
        return pickled


class LAMB(Optimizer, torch.optim.Optimizer):
    """ Layer-wise Adaptive Moments (LAMB) optimizer (biased version).

        Based on "Large Batch Optimization for Deep Learning: Training BERT
        in 76 minutes" (https://arxiv.org/abs/1904.00962).

        The scaling function phi(z) is fixed as min(z, max_weight_norm);
    """
    # Variables which don't exist in the parent optimizer class and are
    # global (Cannot be set per group).
    _child_vars = ["loss_scaling"]
    # All the attributes and variables which don't exist in the parent optimizer class.
    _child_only = _child_vars + [
        "bias_correction", "accum_type", "first_order_momentum_accum_type",
        "second_order_momentum_accum_type"
    ]
    # Attributes (from the parent or child class) which can be set per group.
    _group_vars = ["lr", "weight_decay", "betas", "eps", "max_weight_norm"]

    def __init__(self,
                 params: Iterable,
                 lr: Optional[float] = None,
                 betas: Optional[Tuple[float, float]] = None,
                 eps: Optional[float] = None,
                 weight_decay: Optional[float] = None,
                 bias_correction: Optional[bool] = None,
                 loss_scaling: Optional[float] = None,
                 max_weight_norm: Optional[float] = None,
                 accum_type: Optional[torch.dtype] = None,
                 first_order_momentum_accum_type: Optional[torch.dtype] = None,
                 second_order_momentum_accum_type: Optional[torch.dtype] = None
                 ) -> None:
        """
        :param iterable params: parameters to optimize.
        :param lr: learning rate. Default is 1e-3.
        :param betas: ``(beta1, beta2)`` parameters used in LAMB.
            Default is ``(0.9, 0.999)``.
        :param eps: term added to the denominator to ensure numerical
           stability. Default is 1e-8.
        :param weight_decay: weight decay factor. Default is 1e-2.
        :param bias_correction: True: compute LAMB with bias correction
            (default).
        :param loss_scaling: Factor by which to scale the loss and hence
            gradients to assist numerical stability when using float16.
            Default is 1.0.
        :param max_weight_norm: maximum value of the output of scaling
            function, phi(). Set to None to disable scaling function:
            ``None`` is replaced by 65500.0 (FP16 max).
        :param accum_type: data type used for gradients
            (``torch.float16`` or ``torch.float32``, default
            ``torch.float32``).
        :param first_order_momentum_accum_type: data type used to store
            the first order momentum values for each parameter
            (``torch.float16`` or ``torch.float32``, default
            ``torch.float32``).
        :param second_order_momentum_accum_type: data type used to store
           the second order momentum values for each parameter
           (``torch.float16`` or ``torch.float32``, default
           ``torch.float32``).
        """
        # pylint: disable=unused-argument
        # Call to locals() must be at the very top of  __init__
        _, variables = _parseArgs(locals(), [])
        if max_weight_norm is None:
            max_weight_norm = 65500.0  # FP16 Max
        if lr is None:
            lr = 1e-3
        if betas is None:
            betas = (0.9, 0.999)
        if eps is None:
            eps = 1e-8
        if weight_decay is None:
            weight_decay = 1e-2
        if bias_correction is None:
            bias_correction = True
        if loss_scaling is None:
            loss_scaling = 1.0
        if accum_type is None:
            accum_type = torch.float32
        if first_order_momentum_accum_type is None:
            first_order_momentum_accum_type = torch.float32
        if second_order_momentum_accum_type is None:
            second_order_momentum_accum_type = torch.float32
        # Values which can be overridden per parameter group.
        defaults = dict(lr=lr,
                        betas=betas,
                        eps=eps,
                        weight_decay=weight_decay,
                        max_weight_norm=max_weight_norm)
        Optimizer.__init__(self)
        torch.optim.Optimizer.__init__(self, params, defaults)

        supportedTypes = [torch.float16, torch.float32]
        errString = """Accumulation types must be either torch.float32
                or torch.float16"""
        assert accum_type in supportedTypes, errString
        assert first_order_momentum_accum_type in supportedTypes, errString
        assert second_order_momentum_accum_type in supportedTypes, errString

        self.bias_correction = bias_correction
        self.loss_scaling = loss_scaling
        # Kept as an attribute for backward compatibility; step() reads
        # the per-group value instead.
        self.max_weight_norm = max_weight_norm
        self.accum_type = accum_type
        self.first_order_momentum_accum_type = \
             first_order_momentum_accum_type
        self.second_order_momentum_accum_type = \
             second_order_momentum_accum_type

        self.variable_attrs = VariableAttributes(
            variables,
            list(self.defaults) + LAMB._child_vars)

    def step(self, closure: Optional[Callable] = None) -> Optional[float]:
        """Perform a single LAMB update of every parameter which has a
        gradient.

        :param closure: optional callable which is invoked once, before
            the update, and whose result is returned.
        :returns: the value returned by ``closure`` or ``None`` if no
            closure was provided.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue

                grad = p.grad.data
                state = self.state[p]
                # First update of this parameter: create its moments.
                if len(state) == 0:
                    state["step"] = 0
                    state["exp_avg"] = torch.zeros_like(p.data)
                    state["exp_avg_sq"] = torch.zeros_like(p.data)

                state["step"] += 1

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]

                beta1, beta2 = group["betas"]

                if self.bias_correction:
                    bias_correction1 = 1 - beta1**state["step"]
                    bias_correction2 = 1 - beta2**state["step"]
                else:
                    bias_correction1 = 1
                    bias_correction2 = 1

                # In-place update of the first and second order moments.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(
                    group["eps"])

                upd = ((exp_avg / bias_correction1) /
                       denom) + group["weight_decay"] * p.data

                # L2 norms of the weights and of the update.
                r1 = p.data.pow(2).sum().sqrt()
                r2 = upd.pow(2).sum().sqrt()

                # max_weight_norm is a per-group setting (it is part of
                # the defaults and of _group_vars) so it is read from the
                # group rather than from the optimizer attribute, which
                # ignores per-group values and is not part of the
                # dictionary built by __getstate__.
                max_weight_norm = group["max_weight_norm"]
                if max_weight_norm is None:
                    # No upper bound: phi() is the identity.
                    r1_ = r1
                else:
                    r1_ = r1.clamp(max=max_weight_norm)

                # Avoid a division by zero / a null trust ratio.
                if r1_ == 0 or r2 == 0:
                    trust = 1.0
                else:
                    trust = r1_ / r2

                p.data.add_(upd, alpha=-group['lr'] * trust)

        return loss

    def __getstate__(self) -> Dict[str, Any]:
        """Return the dictionary describing this optimizer when it is
        pickled or copied: the parent's state plus the attributes which
        only exist on this class and the dirty-state bookkeeping."""
        state = torch.optim.Optimizer.__getstate__(self)
        # Manually save the attributes
        # (groups / defaults are saved by the parent)
        state["variable_attrs"] = self.variable_attrs
        state["loss_scaling"] = self.loss_scaling
        state["bias_correction"] = self.bias_correction
        state["accum_type"] = self.accum_type
        state["first_order_momentum_accum_type"] = \
                self.first_order_momentum_accum_type
        state["second_order_momentum_accum_type"] = \
                self.second_order_momentum_accum_type

        # Mark the state as dirty only if there is one.
        state["_state_dict"] = self._state_dict
        state["ipu_state_is_dirty"] = self.has_state()
        state["host_state_is_dirty"] = False
        return state


def _check_constructor_match_parent(child_class: Type[torch.optim.Optimizer]
                                    ) -> None:
    """Assert that the constructor of ``child_class`` lines up with the
    constructor of its parent optimizer.

    The parent is the second base class of ``child_class``. The child's
    ``__init__`` is expected to take the parent's parameters first, with
    the same names and in the same order, immediately followed by the
    arguments listed in ``child_class._child_only`` (again in order).

    :param child_class: class to check. It must have at least two base
        classes and a ``_child_only`` attribute.
    :raises AssertionError: if the number, the names or the order of the
        parameters differ from what is expected.
    """
    parent = child_class.__bases__[1]
    parent_params = inspect.signature(parent.__init__).parameters
    child_params = inspect.signature(child_class.__init__).parameters
    extra_args = child_class._child_only  # pylint: disable=protected-access
    expected_count = len(parent_params) + len(extra_args)
    assert expected_count == len(child_params), (
        f"Expected {expected_count} parameters but got "
        f"{len(child_params)}")

    child_list = list(child_params.values())

    # The leading parameters must mirror the parent's (names only: the
    # defaults and annotations are allowed to differ).
    inherited = zip(parent_params.values(), child_list)
    for idx, (param, child_param) in enumerate(inherited):
        assert child_param.name == param.name, (
            f"Mismatch for parameter {idx}: expected "
            f"'{param}' but got '{child_param}'")

    # The remaining parameters must be the child-only arguments.
    trailing = child_list[len(parent_params):]
    for extra_arg, child_param in zip(extra_args, trailing):
        assert child_param.name == extra_arg, (
            f"Expected an extra argument named "
            f"'{extra_arg}' but got '{child_param.name}'")


# Sanity checks run when the module is imported: the constructor of each
# of the optimizers below must take the same leading parameters as its
# parent optimizer (its second base class), followed by its _child_only
# arguments. LAMB is not checked here.
_check_constructor_match_parent(SGD)
_check_constructor_match_parent(Adam)
_check_constructor_match_parent(AdamW)
_check_constructor_match_parent(RMSprop)


================================================
FILE: python/options.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import os
import json
import copy
from typing import Optional, Union, Dict, Any, List, Set
import torch
from . import enums
from ._logging import logger
from . import _options_config
from . import _options_impl
from . import ops


class Attribute():
    """Context manager setting PopTorch attributes for the duration of a
    ``with`` block.

    Each keyword argument names an attribute and maps to a dictionary of
    key / value pairs. On entry every pair is passed to
    ``torch.ops.poptorch.set_attribute``; on exit every key is either
    restored to the value it had on entry or cleared with
    ``torch.ops.poptorch.clear_attribute`` if it was not set on entry.
    """
    # Attributes currently set, shared by all the instances so that a
    # nested context can restore the values of the enclosing one.
    _current_attrs = {}

    def __init__(self, **kwargs):
        """
        :param kwargs: ``attribute=dictionary`` pairs where ``dictionary``
            contains the key / value pairs to set for that attribute.
        """
        self._kwargs = kwargs
        self._saved = {}

    def __enter__(self):
        """Set the attributes and remember the ones which were already
        set.

        :returns: this object, so that it can be bound with ``as``.
        """
        self._saved = copy.deepcopy(Attribute._current_attrs)
        for attr, dictionary in self._kwargs.items():
            for k, v in dictionary.items():
                torch.ops.poptorch.set_attribute(attr, k, v)
            # Merge into a dictionary owned by the class: storing the
            # caller's dictionary itself would let a nested context
            # modify it (and therefore this object's own arguments).
            Attribute._current_attrs.setdefault(attr, {}).update(dictionary)
        return self

    def __exit__(self, type, value, traceback):
        """Restore the attributes to the values they had on entry.

        Nothing is returned so an exception raised inside the ``with``
        block is not suppressed.
        """
        for attr, dictionary in self._kwargs.items():
            saved_dict = self._saved.get(attr, {})
            for k in dictionary:
                if k not in saved_dict:
                    torch.ops.poptorch.clear_attribute(attr, k)
                else:
                    torch.ops.poptorch.set_attribute(attr, k, saved_dict[k])
        Attribute._current_attrs = self._saved


# Used by _options_config, defined here so that it is reported
# to the user as a "poptorch.options.ConfigFileError"
class ConfigFileError(Exception):
    """Exception type made available to ``_options_config``."""


class _JitOptions(_options_impl.OptionsDict):
    """Options related to PyTorch's JIT compiler.

    Can be accessed via :py:attr:`poptorch.Options.Jit`:

    >>> opts = poptorch.Options()
    >>> opts.Jit.traceModel(False)
    """

    def traceModel(self, trace_model: bool) -> "poptorch.options._JitOptions":
        """
        DO NOT USE: about to be removed.

        A deprecation warning is always logged. ``trace_model`` must be
        ``False``: a ``ValueError`` is raised otherwise.
        """
        deprecation_notice = (
            "[Deprecated] Do not call options.Jit.traceModel(): "
            "options.Jit.traceModel(False) is now the default, "
            "and True is no longer supported, therefore this "
            "function will be removed shortly")
        logger.warning(deprecation_notice)
        if not trace_model:
            return self
        raise ValueError(
            "options.Jit.traceModel(True) is no longer supported")


class _PrecisionOptions(_options_impl.OptionsDict):
    """ Options related to processing the PyTorch JIT graph prior to lowering to
    PopART

    Can be accessed via :py:attr:`poptorch.Options.Precision`:

    >>> opts = poptorch.Options()
    >>> opts.Precision.enableFloatingPointExceptions(True)
    """

    def __init__(self,
                 popart_options: "poptorch.options._PopartOptions") -> None:
        """
        :param popart_options: PopART options object which the setters of
            this class write to.
        """
        self._popart_options = popart_options
        super().__init__()

    def halfFloatCasting(
            self,
            half_float_casting: "poptorch.HalfFloatCastingBehavior"  # pylint: disable=unused-argument
    ) -> "poptorch.options._PrecisionOptions":
        """
        DO NOT USE: about to be removed.

        The argument is ignored: only a deprecation warning is logged.
        """
        logger.warning("[Deprecated] Do not call "
                       "options.Precision.halfFloatCasting(): "
                       "HalfUpcastToFloat is now the only supported option "
                       "and matches PyTorch's behaviour so you don't need "
                       "to explicitly set it.")

        return self

    def runningStatisticsAlwaysFloat(self, value: bool
                                     ) -> "poptorch.options._PrecisionOptions":
        """
        DO NOT USE: about to be removed.

        ``value`` is only checked to be a bool (a ``ValueError`` is raised
        otherwise), then a deprecation warning is logged.
        """

        if not isinstance(value, bool):
            raise ValueError(
                "runningStatisticsAlwaysFloat needs to be set to a bool")

        logger.warning("[Deprecated] Do not call "
                       "options.Precision.runningStatisticsAlwaysFloat(): "
                       "False is now the only supported option "
                       "and matches PyTorch's behaviour so you don't need "
                       "to explicitly set it.")
        return self

    def enableFloatingPointExceptions(
            self, enabled: bool) -> "poptorch.options._PrecisionOptions":
        """Set whether floating point exceptions are enabled on the IPU.

        When enabled, an exception will be generated when the IPU encounters
        any one of the following:

        * Operation resulting in subtraction of infinities
        * Divisions by zero or by infinity
        * Multiplications between zero and infinity
        * Real operations producing complex results
        * Comparison where any one operand is Not-a-Number

        :param enabled:
            * True: raise ``RuntimeError`` on floating point exception
            * False: do not raise ``RuntimeError`` (default)
        """

        assert isinstance(enabled, bool), \
            "enableFloatingPointExceptions needs to be set to a bool"

        self._popart_options.set("enableFloatingPointChecks", enabled)
        return self

    def enableStochasticRounding(self, enabled: bool
                                 ) -> "poptorch.options._PrecisionOptions":
        """Set whether stochastic rounding is enabled on the IPU.

        Stochastic rounding rounds a value up or down to half (float16)
        randomly such that the expected (mean) result of the rounded value
        is equal to the unrounded value. It can improve training performance
        by simulating higher precision behaviour and increasing the speed or
        likelihood of model convergence. However, the model is
        non-deterministic and represents a departure from (deterministic)
        standard IEEE FP16 behaviour.

        In the general case, we recommend enabling stochastic rounding for
        training where convergence is desirable, but not for inference where
        non-determinism may be undesirable.

        :param enabled:
            * True: Enable stochastic rounding on the IPU.
            * False: Disable stochastic rounding.
        """
        self._popart_options.set("enableStochasticRounding", enabled)
        return self

    def setPartialsType(self, dtype: torch.dtype
                        ) -> "poptorch.options._PrecisionOptions":
        """Set the data type of partial results for matrix multiplication and
        convolution operators.

        The matrix multiplication and convolution operators store intermediate
        results known as partials as part of the calculation. You can use this
        option to change the data type of the partials. Using ``torch.half``
        reduces on-chip memory use at the cost of precision.


        :param torch.dtype dtype:
            The type to store partials, which must be either ``torch.float`` or
            ``torch.half``
        :raises ValueError: if ``dtype`` is neither of the supported types.
        """

        if dtype in [torch.float, torch.float32]:
            type_str = 'float'
        elif dtype in [torch.half, torch.float16]:
            type_str = 'half'
        else:
            raise ValueError("parameter to setPartialsType should be either "
                             "torch.float or torch.half")

        # The same type is used for matrix multiplications and convolutions.
        self._popart_options.set("partialsTypeMatMuls", type_str)
        self._popart_options.set("convolutionOptions",
                                 {"partialsType": type_str})
        return self


class _TrainingOptions(_options_impl.OptionsDict):
    """Options specific to model training.

    .. note:: You must not set these options for inference models.

    Can be accessed via :py:attr:`poptorch.Options.Training`:

    >>> opts = poptorch.Options()
    >>> opts.Training.gradientAccumulation(4)
    """

    def __init__(self,
                 popart_options: "poptorch.options._PopartOptions") -> None:
        # Some training options are forwarded directly to the PopART options
        # object instead of being stored in this dictionary.
        self._popart_options = popart_options
        super().__init__(
            gradient_accumulation=1,
            accumulation_and_replication_reduction_type=enums.ReductionType.
            Mean,
            meanAccumulationAndReplicationReductionStrategy=enums.
            MeanReductionStrategy.Post)

    def gradientAccumulation(self, gradient_accumulation: int
                             ) -> "poptorch.options._TrainingOptions":
        """Number of micro-batches to accumulate for the gradient calculation.

        Accumulate the gradient ``gradient_accumulation`` times before updating
        the model using the gradient. Other frameworks may refer to this setting
        as "pipeline depth".

        Each micro-batch (a batch of size equal to the ``batch_size`` argument
        passed to :py:class:`~poptorch.DataLoader`) corresponds to one gradient
        accumulation. Therefore ``gradient_accumulation`` scales the global
        batch size (number of samples between optimiser updates).

        .. note:: Increasing ``gradient_accumulation`` does not alter the
            (micro-)batch size used for batch normalisation.

        A large value for ``gradient_accumulation`` can improve training
        throughput by amortising optimiser update costs, most notably when using
        :py:class:`~poptorch.PipelinedExecution` or when training is distributed
        over a number of replicas. However, the consequential increase in the
        number of samples between optimiser updates can have an adverse impact
        on training.

        The reason why the efficiency gains are most notable when training with
        models with multiple IPUs which express pipelined model parallelism
        (via :py:class:`~poptorch.PipelinedExecution` or by default and
        annotating the model :py:class:`~poptorch.BeginBlock` or
        :py:class:`~poptorch.Block`) is because the pipeline has "ramp up" and
        "ramp down" steps around each optimiser update. Increasing the
        gradient accumulation factor in this instance reduces the proportion of
        time spent in the "ramp up" and "ramp down" phases, increasing overall
        throughput.

        When training involves multiple replicas, including the cases of sharded
        and phased execution, each optimiser step incurs a communication cost
        associated with the reduction of the gradients. By accumulating
        gradients, you can reduce the total number of updates required and thus
        reduce the total amount of communication.

        .. note::  Increasing the global batch size can have adverse effects on
           the sample efficiency of training so it is recommended to use a low
           or unity gradient accumulation count initially, and then try
           increasing to achieve higher throughput. You may also need to scale
           other hyper-parameters such as the optimiser learning rate
           accordingly.

        :param gradient_accumulation: Number of micro-batches to accumulate
            between two optimiser updates.
        """

        self.set(gradient_accumulation=gradient_accumulation)

        return self

    def _check_reduction_arg(self, reduction_type, name):
        """Check that ``reduction_type`` is a usable reduction.

        :param reduction_type: Value to validate.
        :param name: Name of the option, used as prefix of the error message.
        :raises ValueError: if ``reduction_type`` is not a
            ``poptorch.ReductionType`` or is ``ReductionType.NoReduction``.
        """
        incorrect_instance = not isinstance(reduction_type,
                                            enums.ReductionType)
        no_red = reduction_type == enums.ReductionType.NoReduction
        if incorrect_instance or no_red:
            raise ValueError(name + " must be set to "
                             "poptorch.ReductionType.Mean or "
                             "poptorch.ReductionType.Sum")

    def accumulationAndReplicationReductionType(
            self, reduction_type: "poptorch.ReductionType"
    ) -> "poptorch.options._TrainingOptions":
        """Set the type of reduction applied to reductions in the graph.

        When using a value greater than one for
        :py:func:`~poptorch.options._TrainingOptions.gradientAccumulation` or
        for :py:func:`~poptorch.Options.replicationFactor`, PopTorch applies a
        reduction to the gradient outputs from each replica, and to the
        accumulated gradients. This reduction is independent of the model loss
        reduction (summing a mean-reduced loss and a sum-reduced loss in a
        PyTorch model is valid).

        This setting governs both the accumulation of the loss gradients in
        replicated graphs and of all of the gradients when using gradient
        accumulation.

        :param reduction_type:
            * Mean (default): Reduce gradients by calculating the mean of them.
            * Sum: Reduce gradients by calculating the sum of them.
        :raises ValueError: if ``reduction_type`` is neither
            ``poptorch.ReductionType.Mean`` nor ``poptorch.ReductionType.Sum``.
        """
        self._check_reduction_arg(reduction_type,
                                  "accumulationAndReplicationReductionType")

        self.set(accumulation_and_replication_reduction_type=reduction_type)
        # Record that this option was explicitly set by the user.
        self._warnings_disabled.add(
            "accumulation_and_replication_reduction_type")
        return self

    def setMeanAccumulationAndReplicationReductionStrategy(
            self, mean_reduction_strategy: "poptorch.MeanReductionStrategy"
    ) -> "poptorch.options._TrainingOptions":
        """Specify when to divide by a mean reduction factor when
        ``accumulationAndReplicationReductionType`` is set to
        ``ReductionType.Mean``.

        The default reduction strategy depends on the optimizer used. The
        default strategy is `Running` when the `accum_type` of the optimizer is
        set to half-precision (float16) format. Otherwise the `Post` strategy
        is used as this strategy is typically more performant but the `Post`
        strategy is less numerically robust.

        :param mean_reduction_strategy:
            * Running: Keeps the reduction buffer as the current mean. This is
              preferred for numerical stability as the buffer value is never
              larger than the magnitude of the largest micro batch gradient.
            * Post: Divides by the accumulationFactor and replicatedGraphCount
              after all of the gradients have been reduced. In some cases this
              can be faster than using Running, however it is prone to
              overflow.
            * PostAndLoss (deprecated): Divides by the replicatedGraphCount
              before the backwards pass, performs the gradient reduction
              across micro batches, and then divides by the accumulationFactor.
              This is to support legacy behaviour and is deprecated.
        """
        self.set(meanAccumulationAndReplicationReductionStrategy=
                 mean_reduction_strategy)
        return self

    def setAutomaticLossScaling(self, enabled: bool
                                ) -> "poptorch.options._TrainingOptions":
        """Set whether automatic loss scaling is enabled on the IPU.

        When using float16/half values for activations, gradients, and weights,
        the loss value needs to be scaled by a constant factor to avoid
        underflow/overflow. This adjustment is known as loss scaling. This
        setting automatically sets a global loss scaling factor during training.

        Note: Automatic loss scaling is a preview feature. It is well tested and
        enabled in some of our example applications, but may not behave as
        expected in all models. Recommendation: if your model with automatic
        loss scaling enabled does not converge or triggers a compilation error,
        then you will need to set the loss scale manually.

        :param enabled:
            * True: Enable automatic loss scaling on the IPU.
            * False: Disable automatic loss scaling.
        """
        self._popart_options.set("automaticLossScalingSettings.enabled",
                                 enabled)
        return self

    def setConvolutionDithering(self, enabled: bool
                                ) -> "poptorch.options._TrainingOptions":
        """Enable convolution dithering.

        If true, then convolutions with different parameters will be laid out
        from different tiles in an effort to improve tile balance in models.

        Use ``MultiConv`` to apply this option to specific set of convolutions.

        Entries already stored under the ``convolutionOptions`` PopART option
        are preserved: only the ``enableConvDithering`` entry is added or
        updated.

        :param enabled:
            Enables or disables convolution dithering for all convolutions.
        """
        # "convolutionOptions" is a dictionary which other setters write to as
        # well: merge into the current value instead of replacing it so that
        # their entries are not silently dropped.
        current = self._popart_options.options.get("convolutionOptions")
        conv_options = dict(current) if isinstance(current, dict) else {}
        conv_options["enableConvDithering"] = enabled
        self._popart_options.set("convolutionOptions", conv_options)
        return self


class _PopartOptions:
    """Container for options forwarded to the PopART backend.

    Only for advanced users.

    Most options from `popart.SessionOptions` can be set using this class.

    .. note:: there is no mapping for the various PopART enums so integers
        need to be used instead.

    Can be accessed via :py:attr:`poptorch.Options._Popart`:

    >>> opts = poptorch.Options()
    >>> opts._Popart.set("autoRecomputation", 3) # RecomputationType::Pipeline
    >>> opts._Popart.set("syntheticDataMode",
    ...                  int(popart.SyntheticDataMode.RandomNormal))
    """

    def __init__(self) -> None:
        self._is_frozen = False
        self.options = {}
        # Options which are always present, disabled by default.
        for default_key in ("instrumentWithHardwareCycleCounter",
                            "rearrangeAnchorsOnHost"):
            self.set(default_key, False)

    def __deepcopy__(self, memory):
        """Return a deep copy of these options which is never frozen."""
        clone = _PopartOptions()
        memory[id(self)] = clone
        for name, value in vars(self).items():
            # A copy always starts unfrozen, whatever the state of the source.
            if name == '_is_frozen':
                value = False
            setattr(clone, name, copy.deepcopy(value, memory))
        return clone

    def checkIsFrozen(self, option=None):
        """Raise ``AttributeError`` if the options are frozen.

        Nothing is raised while the object is still being initialised or when
        ``option`` is ``'_is_frozen'`` itself.
        """
        # No '_is_frozen' attribute yet means we are still inside __init__.
        if not hasattr(self, '_is_frozen'):
            return
        if option == '_is_frozen':
            return
        if self._is_frozen:
            raise AttributeError("Can't modify frozen Options")

    def set(self, key: str, value: Union[int, float, str, List[str], Set[str]]
            ) -> "poptorch.options._PopartOptions":
        """Store ``value`` under ``key``, replacing any previous value.

        :raises AttributeError: if the options are frozen.
        """
        self.checkIsFrozen()

        self.options[key] = value
        return self

    def setEngineOptions(self, engine_options: Dict[str, str]
                         ) -> "poptorch.options._PopartOptions":
        """Store ``engine_options`` under the ``engineOptions`` key."""
        self.set('engineOptions', engine_options)
        return self

    def setPatterns(self, patterns: Dict[str, bool],
                    level: int = 2) -> "poptorch.options._PopartOptions":
        """Override the default patterns of PopART's compiler.

        :param patterns: Dictionary of pattern names to
            enable / disable.
        :param level: Integer value corresponding to the
            ``popart::PatternsLevel`` to use to initialise the ``Patterns``.
        """
        assert isinstance(level, int)
        assert isinstance(patterns, dict)
        for name, value in (("patterns_level", level), ("patterns",
                                                        patterns)):
            self.set(name, value)
        return self

    def __repr__(self):
        rendered = [f"{name}={value!r}" for name, value in self.options.items()]
        return f"{type(self).__name__}({', '.join(rendered)})"


class _DistributedOptions(_options_impl.OptionsDict):
    """Options controlling distributed execution.

    You should not use these when using PopRun/PopDist. Instead use
    ``popdist.poptorch.Options`` to set these values automatically.

    Can be accessed via :py:attr:`poptorch.Options.Distributed`:

    >>> opts = poptorch.Options()
    >>> opts.Distributed.configureProcessId(0, 2)
    """

    def __init__(self) -> None:
        self._gcd_mappings = {}
        super().__init__(num_distributed_processes=1,
                         distributed_process_id=0,
                         ipuof_configs={})
        # Pick up the OpenMPI environment variables if they are defined.
        self.setEnvVarNames("OMPI_COMM_WORLD_SIZE", "OMPI_COMM_WORLD_RANK")

    def disable(self) -> "poptorch.options._DistributedOptions":
        """Disable distributed execution, overriding whatever was previously
        configured or read from environment variables.
        """
        self.set(num_distributed_processes=1, distributed_process_id=0)
        return self

    def setEnvVarNames(self, var_num_processes: str, var_process_id: str
                       ) -> "poptorch.options._DistributedOptions":
        """Utility to read and set `processId` and `numProcesses` from
        environment variables.

        Useful if you use a third party library to manage the processes used for
        the distributed execution such as mpirun.

        For example: ``mpirun -np 4 myscript.py``

        By default the OpenMPI ``OMPI_COMM_WORLD_SIZE`` and
        ``OMPI_COMM_WORLD_RANK`` variables are used.

        :param var_num_processes: Name of the environment variable containing
            the total number of processes (1 is used if it is not defined).
        :param var_process_id: Name of the environment variable containing the
            ID of this process (0 is used if it is not defined).
        """
        env = os.environ
        process_id = int(env.get(var_process_id, "0"))
        num_processes = int(env.get(var_num_processes, "1"))
        return self.configureProcessId(process_id, num_processes)

    def configureProcessId(self, process_id: int, num_processes: int
                           ) -> "poptorch.options._DistributedOptions":
        """Manually set the current process ID and the total number of processes.

        :param int process_id: The ID of this process.
        :param int num_processes: The total number of processes the execution is
            distributed over.
        """
        self.set(distributed_process_id=process_id)
        self.set(num_distributed_processes=num_processes)
        return self

    @property
    def processId(self) -> int:
        """ID of the current process."""
        return self.distributed_process_id

    @property
    def numProcesses(self) -> int:
        """Total number of processes the execution is distributed over."""
        return self.num_distributed_processes


class TensorLocationSettings(_options_impl.OptionsDict):
    """Describe where a tensor is stored.

    >>> opts = poptorch.Options()
    >>> opts.TensorLocations.setActivationLocation(
    ...     poptorch.TensorLocationSettings().useOnChipStorage(False))
    """

    def minElementsForOffChip(self, min_elements: int
                              ) -> "poptorch.TensorLocationSettings":
        """A minimum number of elements below which offloading
        won't be considered.

        :param min_elements: The threshold, which must be an integer.
        """
        assert isinstance(min_elements, int)
        self.createOrSet(minElementsForOffChip=min_elements)
        return self

    def minElementsForReplicatedTensorSharding(
            self, min_elements: int) -> "poptorch.TensorLocationSettings":
        """Only enable replicated tensor sharding (RTS) for tensors with more
        than `min_elements` elements.

        :param min_elements: The threshold, which must be an integer.
        """
        assert isinstance(min_elements, int)
        self.createOrSet(minElementsForReplicatedTensorSharding=min_elements)
        return self

    def useOnChipStorage(self, use: bool = True
                         ) -> "poptorch.TensorLocationSettings":
        """Permanent tensor storage

        :param bool use:
            True: use on chip memory.
            False: use off chip memory.
            None: keep it undefined.
        """
        if use is None:
            # Undefined: remove any value previously stored.
            self.deleteIfExists("onChip")
            return self

        assert isinstance(use, bool)
        on_chip = int(use)
        self.createOrSet(onChip=on_chip)
        return self

    def useReplicatedTensorSharding(self, use: bool = True
                                    ) -> "poptorch.TensorLocationSettings":
        """Enable replicated tensor sharding

        (relevant for weights and optimiser states)

        :param use: Enable replicated tensor sharding if True,
                    disable it if False.
        """
        assert isinstance(use, bool)
        sharded = int(use)
        self.createOrSet(useReplicatedTensorSharding=sharded)
        return self

    def useIOTilesToLoad(self, use: bool = True
                         ) -> "poptorch.TensorLocationSettings":
        """Load tensor through IO tiles

        :param use: Use IO tiles if True,
                    use Compute tiles if False.
        """
        assert isinstance(use, bool)
        load_with_io_tiles = int(use)
        self.createOrSet(useIOTilesToLoad=load_with_io_tiles)
        return self

    def useIOTilesToStore(self, use: bool = True
                          ) -> "poptorch.TensorLocationSettings":
        """Use IO tiles to store tensors.

        (relevant for replicated tensor sharded tensors)

        :param use: Use IO tiles if True,
                    use Compute tiles if False.
        """
        assert isinstance(use, bool)
        store_with_io_tiles = int(use)
        self.createOrSet(useIOTilesToStore=store_with_io_tiles)
        return self


class _TensorLocationOptions(_options_impl.OptionsDict):
    """Options controlling where to store tensors.

    Can be accessed via :py:attr:`poptorch.Options.TensorLocations`:

    >>> opts = poptorch.Options()
    >>> opts.TensorLocations.setActivationLocation(
    ...     poptorch.TensorLocationSettings().useOnChipStorage(False))
    """

    def numIOTiles(self, num_tiles: int
                   ) -> "poptorch.options._TensorLocationOptions":
        """ Assigns the number of tiles on the IPU to be IO rather than compute.

        Allocating IO (input/output) tiles reduces the number of IPU tiles
        available for computation but allows you to reduce the latency of
        copying tensors from host to the IPUs using the function
        :py:func:`~poptorch.set_overlap_for_input`, IPUs to host using the
        function
        :py:func:`~poptorch.set_overlap_for_output` or to use off-chip memory
        with reduced latency by setting the option
        :py:meth:`~poptorch.TensorLocationSettings.useIOTilesToLoad`.
        As reducing the number of computation tiles may reduce performance, you
        should not use any IO tiles until you have successfully run your model
        and used profiling to identify "streamCopy" entries which take up a
        significant proportion of execution time.

        :param num_tiles: Number of IO tiles: must be an even integer between
            32 and 192 (inclusive).
        """
        assert isinstance(num_tiles, int)

        err_msg = "numIOTiles must be an even number between 32 and 192."

        assert num_tiles >= 32, err_msg
        assert num_tiles <= 192, err_msg
        assert num_tiles % 2 == 0, err_msg

        self.createOrSet(numIOTiles=num_tiles)
        return self

    def setActivationLocation(self, location: "poptorch.TensorLocationSettings"
                              ) -> "poptorch.options._TensorLocationOptions":
        """
        :param location:
            Update tensor location settings for activations.
        """
        assert isinstance(location, TensorLocationSettings)
        # Only a dictionary snapshot of the settings is stored.
        self.createOrSet(location_activation=location.toDict())
        return self

    def setWeightLocation(self, location: "poptorch.TensorLocationSettings"
                          ) -> "poptorch.options._TensorLocationOptions":
        """
        :param location:
            Update tensor location settings for weights.
        """
        assert isinstance(location, TensorLocationSettings)
        self.createOrSet(location_weight=location.toDict())
        return self

    def setOptimizerLocation(self, location: "poptorch.TensorLocationSettings"
                             ) -> "poptorch.options._TensorLocationOptions":
        """
        :param location:
            Update tensor location settings for optimiser states.
        """
        assert isinstance(location, TensorLocationSettings)
        self.createOrSet(location_optimizer=location.toDict())
        return self

    def setAccumulatorLocation(self,
                               location: "poptorch.TensorLocationSettings"
                               ) -> "poptorch.options._TensorLocationOptions":
        """
        :param location:
            Update tensor location settings for accumulators.
        """
        assert isinstance(location, TensorLocationSettings)
        self.createOrSet(location_accumulator=location.toDict())
        return self


# Blocks are identified by their ID, which is a string.
BlockId = str


class Stage:
    """
    Building block of the various execution strategies: a stage consists of
    one or more `Blocks` running on one IPU.

    .. seealso:: :py:class:`~poptorch.PipelinedExecution`,
        :py:class:`~poptorch.ShardedExecution`,
        :py:class:`~poptorch.ParallelPhasedExecution`,
        :py:class:`~poptorch.SerialPhasedExecution`.
    """

    def __init__(self, *block_ids: BlockId) -> None:
        only_strings = all(isinstance(block, str) for block in block_ids)
        assert only_strings, (
            f"Block IDs are supposed to be strings but got {block_ids}")
        self._blocks = block_ids
        # -1 / None mean "not assigned yet".
        self._stage_id = self._phase_id = -1
        self._ipu = None

    @property
    def blocks(self) -> List[BlockId]:
        """IDs of the blocks this stage is made of."""
        return self._blocks

    def ipu(self, ipu: int) -> "poptorch.Stage":
        """Select the IPU this stage will run on.

        :param ipu: Index of the IPU, which must be an integer.
        """
        assert isinstance(ipu, int)
        self._ipu = ipu
        return self

    def _setStage(self, stage: int) -> "poptorch.Stage":
        # ``None`` leaves the current stage ID untouched.
        if stage is None:
            return self
        self._stage_id = stage
        return self


class _DefaultStageManager(_options_impl.IStageManager):
    """Stage manager creating one stage per block ID the first time the block
    is requested, numbering the stages according to an ``AutoStage`` strategy.
    """

    def __init__(self, auto_stage: "poptorch.AutoStage") -> None:
        super().__init__()
        self._next_id = 1
        self._block_map = {}
        self._auto_stage = auto_stage

    def getStage(self, block_id: BlockId) -> "poptorch.Stage":
        """Return the stage of ``block_id``, creating it if needed."""
        known = self._block_map.get(block_id)
        if known is not None:
            return known

        new_stage = Stage(block_id)
        if self._auto_stage != enums.AutoStage.SameAsIpu:
            # Any other strategy: hand out incremental stage IDs.
            stage_id = self._next_id
            self._next_id += 1
        else:
            assert self._current_ipu is not None, (
                f"poptorch.AutoStage.SameAsIpu was selected but no "
                f"IPU was specified for block {block_id}")
            stage_id = self._current_ipu

        new_stage._setStage(stage_id)  # pylint: disable=protected-access
        self._block_map[block_id] = new_stage
        return new_stage


class _IExecutionStrategy:
    """Base class of the execution strategies: holds a stage manager and a
    mapping from block IDs to stages.
    """

    def __init__(self, stages_manager, block_map):
        self._block_map = block_map
        self._stages_manager = stages_manager

    def stage(self, block_id):
        """Return the :py:class:`~poptorch.Stage` the given block belongs to.

        :param str block_id: A block ID.
        """
        known_blocks = self._block_map
        assert block_id in known_blocks, f"Unknown block {block_id}"
        return known_blocks[block_id]

    def onStartTracing(self):
        """Clear the stage manager's debug information and install it as the
        stage manager used by ``ops.Block``.
        """
        manager = self._stages_manager
        manager.clearDebug()
        ops.Block._stages_manager = manager  # pylint: disable=protected-access

    def onEndTracing(self):
        """Print the stage manager's debug information and remove the stage
        manager from ``ops.Block``.
        """
        self._stages_manager.printDebug()
        ops.Block._stages_manager = None  # pylint: disable=protected-access

    def backendOptions(self):
        """Return the backend options of this strategy (none by default)."""
        return dict()


class Phase:
    """Represents an execution phase"""

    def __init__(self, *arg: Union[BlockId, "poptorch.Stage"]):
        """ Create a phase.

        :param arg: must either be one or more
            :py:class:`Stages<poptorch.Stage>`, or one or more
            blocks ``user_id``.

        If one or more strings are passed they will be interpreted as
        :py:class:`~poptorch.Block` IDs representing a single
        :py:class:`~poptorch.Stage`.

        Within a ``Phase``, the stages will be executed in parallel.

        >>> with poptorch.Block("A"):
        ...     layer()
        >>> with poptorch.Block("B"):
        ...     layer()
        >>> p = Phase(poptorch.Stage("A").ipu(0))
        >>> # 2 stages made of one block each
        >>> p = Phase(poptorch.Stage("A").ipu(0), poptorch.Stage("B").ipu(1))
        >>> p = Phase("A","B") # One Stage made of 2 blocks
        """
        if all(isinstance(elt, Stage) for elt in arg):
            self.stages = arg
        else:
            # Mixing stages and block IDs is not supported: at this point
            # every argument must be a block ID.
            assert all(isinstance(elt, str) for elt in arg), \
                "All arguments must either be block IDs (strings) or " \
                "Stages: " + str([type(elt) for elt in arg])
            self.stages = [Stage(*arg)]

    def stage(self, idx):
        """Return the stage at index ``idx`` in this phase."""
        return self.stages[idx]

    def ipus(self, *ipus) -> "poptorch.Phase":
        """Assign one IPU for each stage contained in this Phase.

        The number of IPUs passed must match the number of stages in the Phase.

        :param ipus: One IPU index per stage, in the same order as the stages.
        :returns: This phase, so that the call can be chained.
        """
        assert len(ipus) == len(self.stages), (
            f"Phase contains "
            f"{len(self.stages)} stages but you provided {len(ipus)} ipus")
        for stage, ipu in zip(self.stages, ipus):
            stage.ipu(ipu)
        # Return self like Stage.ipu() does, so that
        # ``Phase("A", "B").ipus(0)`` can be used inline.
        return self


class PipelinedExecution(_IExecutionStrategy):
    def __init__(self, *args):
        """Pipeline the execution of the graph partitions.

        A partition can be a :py:class:`~poptorch.Stage`, a
        :py:class:`~poptorch.Block` or a :py:class:`~poptorch.BeginBlock`.
        When none of these are passed, an :py:class:`~poptorch.AutoStage`
        strategy can be passed instead to decide how the stage IDs are
        created.
        By default, `poptorch.AutoStage.SameAsIpu` is used: The stage ID
        will be set to the selected IPU number.
        This implies that each unique :py:class:`~poptorch.Block` or
        :py:class:`~poptorch.BeginBlock` in the graph must have
        their `ipu_id` explicitly set when using `AutoStage`.

        Example 1: Blocks `user_id` are known, IPUs are inferred.

        >>> with poptorch.Block("A"):
        ...     layer1()
        >>> with poptorch.Block("B"):
        ...     layer2()
        >>> with poptorch.Block("C"):
        ...     layer3()
        >>> with poptorch.Block("D"):
        ...     layer4()
        >>> opts = poptorch.Options()
        >>> # Create a 4-stage pipeline based on `user_id`, 4 IPUs will be used.
        >>> opts.setExecutionStrategy(poptorch.PipelinedExecution("A","B",
        ...                                                       "C","D"))

        Stages can also be set explicitly:

        >>> # Create a 2-stage pipeline with the blocks `user_id`, 2 IPUs will be used.
        >>> opts.setExecutionStrategy(poptorch.PipelinedExecution(
        ...    poptorch.Stage("A","B"),
        ...    poptorch.Stage("C","D")))

        Example 2: Blocks `ipu_id` are known, use default AutoStage.

        >>> poptorch.Block.useAutoId()
        >>> with poptorch.Block(ipu_id=0):
        ...     layer1()
        >>> with poptorch.Block(ipu_id=1):
        ...     layer2()
        >>> with poptorch.Block(ipu_id=2):
        ...     layer3()
        >>> with poptorch.Block(ipu_id=3):
        ...     layer4()
        >>> # Automatically create a 4-stage pipeline matching the block `ipu_id`.
        >>> opts.setExecutionStrategy(poptorch.PipelinedExecution())
        >>> # Note: poptorch.PipelinedExecution()
        >>> # is the default execution strategy when blocks are defined.

        Example 3:  Non-consecutive stages placed on the same IPU.

        >>> with poptorch.Block(ipu_id=0):
        ...     layer1()
        >>> with poptorch.Block(ipu_id=1):
        ...     layer2()
        >>> with poptorch.Block(ipu_id=0):
        ...     layer3()
        >>> # Automatically create a 3-stage pipeline forcing the stage
        >>> # IDs to be incremental.
        >>> opts.setExecutionStrategy(poptorch.PipelinedExecution(
        ...                           poptorch.AutoStage.AutoIncrement))

        :param args: Either a :py:class:`~poptorch.AutoStage` strategy or an
            explicit list of stages or block IDs.
        :type args: poptorch.AutoStage, [str], [poptorch.Stage]

        """
        auto_stage = enums.AutoStage.SameAsIpu
        block_map = {}
        uses_auto_stage = (len(args) == 1
                           and isinstance(args[0], enums.AutoStage))
        if uses_auto_stage:
            auto_stage = args[0]
        else:
            # Explicit pipeline: the position of each argument is its stage ID.
            for stage_id, arg in enumerate(args):
                if isinstance(arg, Stage):
                    stage = arg
                elif isinstance(arg, str):
                    # A single block ID.
                    stage = Stage(arg)
                else:
                    # Anything else must be a collection of block IDs.
                    assert all(isinstance(elt, str) for elt in arg)
                    stage = Stage(*arg)
                stage._setStage(stage_id)  # pylint: disable=protected-access
                for block in stage.blocks:
                    assert block not in block_map, (
                        f"{block} associated with more than one stage")
                    if stage._ipu is not None:
                        ipu_suffix = " on IPU %d" % stage._ipu
                    else:
                        ipu_suffix = ''
                    logger.debug("block %s added to stage %d%s", block,
                                 stage_id, ipu_suffix)
                    block_map[block] = stage

        if not block_map:
            # No explicit block: stages are created on demand.
            stages_manager = _DefaultStageManager(auto_stage)
        else:

            class PipelineStageManager(_options_impl.IStageManager):
                """Stage manager only aware of the explicitly listed blocks."""

                def __init__(self, block_map):
                    super().__init__()
                    self._block_map = block_map

                def getStage(self, block_id):
                    known_blocks = self._block_map
                    assert block_id in known_blocks, (
                        f"Unknown Block '{block_id}' list of expected "
                        f"Blocks: {list(known_blocks.keys())}")
                    return known_blocks[block_id]

            stages_manager = PipelineStageManager(block_map)
        super().__init__(stages_manager, block_map)

    def backendOptions(self):
        """Return the backend options selecting execution mode 0."""
        return dict(execution_mode=0)


class ShardedExecution(PipelinedExecution):
    """Shard the execution of the stages passed to the constructor. If no
    stage is passed, each unique Block `ipu_id` encountered during tracing is
    considered as a different stage.

    >>> with poptorch.Block(ipu_id=0):
    ...     layer()
    >>> with poptorch.Block(ipu_id=1):
    ...     layer()
    >>> with poptorch.Block(ipu_id=2):
    ...     layer()
    >>> opts = poptorch.Options()
    >>> # Automatically create 3 shards based on the block names
    >>> opts.setExecutionStrategy(poptorch.ShardedExecution())

    :param args: Either a :py:class:`~poptorch.AutoStage` strategy or an
        explicit list of stages or block IDs.
    :type args: poptorch.AutoStage, [str], [poptorch.Stage]

    """

    def backendOptions(self):
        """Return the backend options selecting execution mode 1."""
        return dict(execution_mode=1)


class _IPhasedExecution(_IExecutionStrategy):
    """Common interface for Phased execution strategies"""

    def __init__(self, *phases: Union["poptorch.Phase", List["poptorch.Stage"],
                                      List[BlockId]]):
        """Execute the model's blocks in phases

        :param phases: Definition of phases must be either:

            - a list of :py:class:`~poptorch.Phase`
            - a list of list of :py:class:`~poptorch.Stage`
            - a list of list of :py:class:`~poptorch.Block` IDs (Each list of
              blocks will be considered as a single :py:class:`~poptorch.Stage`)
        :type phases: [:py:class:`~poptorch.Phase`],
            [[:py:class:`~poptorch.Stage`]], [[str]]
        :raises AssertionError: if no phase is provided, or if a block is
            associated with more than one stage.

        """
        self._tensors_liveness = enums.Liveness.AlwaysLive
        self._separate_backward_phase = False
        self._phases = []
        # Maps each block ID to the stage it belongs to.
        block_map = {}
        for phase_id, args in enumerate(phases):
            if isinstance(args, Phase):
                phase = args
            else:
                # Anything which is not already a Phase is wrapped in one:
                # a single element is first promoted to a one-element list.
                if not isinstance(args, list):
                    args = [args]
                phase = Phase(*args)
            self._phases.append(phase)
            for stage in phase.stages:
                stage._phase_id = phase_id
                for block in stage.blocks:
                    assert block not in block_map, (f"{block} associated "
                                                    "with more than one stage")
                    logger.debug(
                        "block %s added to phase %d%s", block, phase_id,
                        " on IPU %d" %
                        stage._ipu if stage._ipu is not None else '')
                    block_map[block] = stage

        if not phases:
            # TODO(T30127): Define what the default strategy should be.
            # stages_manager = _DefaultStageManager(enums.AutoStage.SameAsIpu)
            #
            # Raised explicitly rather than through an ``assert`` statement:
            # asserts are stripped when Python runs with -O, in which case the
            # code used to fall through with no stage manager defined and
            # fail with an unrelated UnboundLocalError.
            raise AssertionError(
                "There is currently no AutoStage for "
                "PhasedExecution, please explicitly specify the phases")

        class PhaseManager(_options_impl.IStageManager):
            """Stage manager resolving block IDs through a fixed mapping."""

            def __init__(self, block_map):
                super().__init__()
                self._block_map = block_map

            def getStage(self, block_id):
                """Return the stage registered for ``block_id``."""
                assert block_id in self._block_map, (
                    f"Unknown Block "
                    f"'{block_id}' list of expected Blocks: "
                    f"{list(self._block_map.keys())}")
                return self._block_map[block_id]

        stages_manager = PhaseManager(block_map)
        super().__init__(stages_manager, block_map)

    def phase(self, phase: int) -> "poptorch.Phase":
        """Return the requested :py:class:`~poptorch.Phase`

        :param phase: Index of the phase
        """
        assert isinstance(
            phase,
            int) and phase >= 0, "Phases are identified by positive integers"
        return self._phases[phase]

    def useSeparateBackwardPhase(self, use: bool = True):
        """Given a forward pass with 3 phases (0,1,2), by default the phases
        will run as follows: ::

            fwd:       bwd:
            phase 0 -> phase 4
            phase 1 -> phase 3
            phase 2 -> phase 2

        .. note:: The end of the forward pass and the beginning of the backward
            pass are part of the same phase.

        If ``useSeparateBackwardPhase(True)`` is used then no phase
        will be shared between the forward and backward passes: ::

            fwd:       bwd:
            phase 0 -> phase 6
            phase 1 -> phase 5
            phase 2 -> phase 4

        :param use: Whether the backward pass should use its own phases.
        :return: This strategy, to allow calls to be chained.
        """
        assert isinstance(use, bool)
        self._separate_backward_phase = use
        return self

    def backendOptions(self) -> Dict[str, Union[int, bool]]:
        """Return the backend options describing this execution strategy.

        :return: A dictionary with ``execution_mode`` set to ``2``, the
            ``separate_backward_phase`` flag and the value of the currently
            selected tensors liveness.
        """
        return {
            "execution_mode": 2,
            "separate_backward_phase": self._separate_backward_phase,
            "tensors_liveness": self._tensors_liveness.value
        }


class ParallelPhasedExecution(_IPhasedExecution):
    """Run the phases in parallel, alternating between two groups of IPUs.

    For example:

    - phase 0 runs on ipu 0 & 2
    - phase 1 runs on ipu 1 & 3
    - phase 2 runs on ipu 0 & 2

    >>> poptorch.Block.useAutoId()
    >>> with poptorch.Block(): # user_id = "0"
    ...     layer()
    >>> with poptorch.Block(): # user_id = "1"
    ...     layer()
    >>> with poptorch.Block(): # user_id = "2"
    ...     layer()
    >>> with poptorch.Block(): # user_id = "3"
    ...     layer()
    >>> with poptorch.Block(): # user_id = "4"
    ...     layer()
    >>> with poptorch.Block(): # user_id = "5"
    ...     layer()
    >>> opts = poptorch.Options()
    >>> strategy = poptorch.ParallelPhasedExecution([
    ...     poptorch.Phase(poptorch.Stage("0"), poptorch.Stage("1")),
    ...     poptorch.Phase(poptorch.Stage("2"), poptorch.Stage("3")),
    ...     poptorch.Phase(poptorch.Stage("4"), poptorch.Stage("5"))])
    >>> strategy.phase(0).ipus(0,2)
    >>> strategy.phase(1).ipus(1,3)
    >>> strategy.phase(2).ipus(0,2)
    >>> opts.setExecutionStrategy(strategy)
    """

    def backendOptions(self) -> Dict[str, Union[int, bool]]:
        """Return the parent's backend options with
        ``serial_phases_execution`` set to ``False``.
        """
        options = dict(super().backendOptions())
        options["serial_phases_execution"] = False
        return options

    def sendTensorsOffChipAfterFwd(self, off_chip: bool = True
                                   ) -> "poptorch.ParallelPhasedExecution":
        """Select the tensors liveness used by this strategy.

        :param off_chip:
            * True: use ``Liveness.OffChipAfterFwd``.
            * False: use ``Liveness.AlwaysLive``.
        :return: This strategy, to allow calls to be chained.
        """
        assert isinstance(off_chip, bool)
        self._tensors_liveness = (enums.Liveness.OffChipAfterFwd
                                  if off_chip else enums.Liveness.AlwaysLive)
        return self


class SerialPhasedExecution(_IPhasedExecution):
    """Run all the phases one after the other on a single group of IPUs.

    For example:

    - phase 0 runs on ipu 0 & 1
    - phase 1 runs on ipu 0 & 1
    - phase 2 runs on ipu 0 & 1

    >>> with poptorch.Block("A"):
    ...     layer()
    >>> with poptorch.Block("A2"):
    ...     layer()
    >>> with poptorch.Block("B"):
    ...     layer()
    >>> with poptorch.Block("B2"):
    ...     layer()
    >>> with poptorch.Block("C"):
    ...     layer()
    >>> with poptorch.Block("C2"):
    ...     layer()
    >>> opts = poptorch.Options()
    >>> strategy = poptorch.SerialPhasedExecution([
    ...     poptorch.Phase(poptorch.Stage("A"), poptorch.Stage("A2")),
    ...     poptorch.Phase(poptorch.Stage("B"), poptorch.Stage("B2")),
    ...     poptorch.Phase(poptorch.Stage("C"), poptorch.Stage("C2"))])
    >>> strategy.phase(0).ipus(0,1)
    >>> strategy.phase(1).ipus(0,1)
    >>> strategy.phase(2).ipus(0,1)
    >>> opts.setExecutionStrategy(strategy)
    """

    def setTensorsLiveness(self, liveness: "poptorch.Liveness"
                           ) -> "poptorch.SerialPhasedExecution":
        """Select the tensors liveness used by this strategy.

        See :py:class:`~poptorch.Liveness` for more information.

        :param liveness: The liveness to use.
        :return: This strategy, to allow calls to be chained.
        """
        assert isinstance(liveness, enums.Liveness)
        self._tensors_liveness = liveness
        return self

    def backendOptions(self) -> Dict[str, Union[int, bool]]:
        """Return the parent's backend options with
        ``serial_phases_execution`` set to ``True``.
        """
        options = dict(super().backendOptions())
        options["serial_phases_execution"] = True
        return options


# pylint: disable=too-many-public-methods
class Options(_options_impl.OptionsDict):
    """Set of all options controlling how a model is compiled and executed.

       Pass an instance of this class to the model wrapping functions
       :py:func:`~poptorch.inferenceModel` and
       :py:func:`~poptorch.trainingModel` to change how the model is compiled
       and executed. An instance includes general options set within this class
       such as :py:func:`~poptorch.Options.deviceIterations` as well as
       properties referring to categories of options such as ``Training``.

        >>> opts = poptorch.Options()
        >>> opts.deviceIterations(10)
        >>> opts.Training.gradientAccumulation(4)

    """

    def __init__(self) -> None:
        """Create the option groups and register the default option values.

        After the defaults are in place two environment variables are
        consulted:

        * ``POPTORCH_CACHE_DIR``: if set to a non-empty value, executable
          caching is enabled using that path.
        * ``POPTORCH_DEFAULT_OPTIONS``: a JSON string applied through
          :py:func:`~poptorch.Options.from_json` (defaults to ``{}``).
        """
        # Option groups exposed through the Jit / _Popart / Precision /
        # Training / Distributed / TensorLocations properties.
        self._jit = _JitOptions()
        self._popart = _PopartOptions()
        self._graphProcessing = _PrecisionOptions(self._popart)
        self._training = _TrainingOptions(self._popart)
        self._distributed = _DistributedOptions()
        self._tensor_locations = _TensorLocationOptions()
        # Strategy used until setExecutionStrategy() is called.
        self._execution_strategy = PipelinedExecution()
        # Don't pass it to super().__init__() -> we don't want it to be passed
        # to the backend with the other options. (It is passed to
        # createGraph() instead).
        # A copy is taken so that later changes don't alter the shared
        # default list.
        self._source_location_excludes = copy.copy(
            _options_impl.default_source_location_excludes)
        self._progress_bar = _options_impl.ProgressBar()

        # Python-side only settings, initialised through their public setters.
        self.relaxOptimizerAttributesChecks(False)
        self.showCompilationProgressBar(True)
        self._module_namescope_enabled = True
        # Default values of the options stored in the options dictionary.
        super().__init__(replication_factor=1,
                         input_group_size=1,
                         input_cgt=enums.CommGroupType.Consecutive,
                         broadcast_buffers=True,
                         device_iterations=1,
                         log_dir=".",
                         max_repeat_logs=4,
                         auto_round_num_ipus=False,
                         anchored_tensors={},
                         output_mode=enums.OutputMode.Default.value,
                         output_return_period=1,
                         connection_type=enums.ConnectionType.Always.value,
                         sync_pattern=enums.SyncPattern.Full.value,
                         available_memory_proportion={})
        # Environment overrides are applied last, on top of the defaults.
        path = os.environ.get("POPTORCH_CACHE_DIR", "")
        if path:
            logger.info("POPTORCH_CACHE_DIR is set: setting cache path to %s",
                        path)
            self.enableExecutableCaching(path)
        self.from_json(os.environ.get("POPTORCH_DEFAULT_OPTIONS", r"{}"))

    def from_json(self, string: str):
        """Sets values of the object from a JSON string.

        The format of the JSON string is:

            {"name.of.accessor": value}

        Each key is a dot-separated attribute path resolved starting from
        this object; the attribute found is then called with the value. A
        list value is unpacked into positional arguments. A string value of
        the form ``"EnumType.Member"`` naming a member of an enum from the
        ``enums`` module is replaced by that member; any other string is
        passed through unchanged.

        Examples:

            >>> Options().from_json(
            ...     '{"Precision.enableFloatingPointExceptions":true}'
            ... )
            >>> Options().from_json('{"_Popart.set":["OptionName", 1]}')

        :param string: JSON object mapping accessor paths to values.
        :return: This object, to allow calls to be chained.
        """

        def string_to_enum(value):
            # Only strings made of exactly two dot-separated parts are
            # candidates: anything else is returned unchanged.
            try:
                enum_type, enum_value = value.split(".")
            except ValueError:
                return value
            # Return the member itself. Looking the member name up a second
            # time on the member is redundant and is not supported by every
            # Python version.
            try:
                return getattr(getattr(enums, enum_type), enum_value)
            except AttributeError:
                return value

        values_dict = json.loads(string)
        for option, v in values_dict.items():
            # Walk the accessor path down to the callable to invoke.
            active_obj = self
            for attribute in option.split("."):
                active_obj = getattr(active_obj, attribute)
            # This parses strings into enum type and values
            if isinstance(v, str) and "." in v:
                v = string_to_enum(v)
            if isinstance(v, list):
                active_obj(*v)
            else:
                active_obj(v)
        return self

    def sourceLocationExcludes(self,
                               excludes: List[str]) -> "poptorch.Options":
        """ When printing the IR all the frames containing one of the excluded
            strings will be ignored.

            This is helpful to get the IR to trace back to user code rather
            than some function inside a framework.

            :param excludes: Replace the current list of exclusions with this
                             one. A shallow copy is stored, so the caller's
                             list is never modified by later calls to
                             :py:func:`~poptorch.Options.appendToLocationExcludes`.
            :return: This object, to allow calls to be chained.
        """
        # Store a copy: appendToLocationExcludes() extends the stored list in
        # place, which would otherwise silently mutate the caller's list.
        self._source_location_excludes = copy.copy(excludes)
        return self

    def appendToLocationExcludes(self, *excludes: str) -> "poptorch.Options":
        """ Add exclusions to the ones used when printing the IR: all the
            frames containing one of the excluded strings are ignored.

            This makes it possible for the IR to trace back to user code
            rather than to some function inside a framework.

            :param excludes: Exclusions to add to the existing list of
                             exclusions.
            :return: This object, to allow calls to be chained.
        """
        current = self._source_location_excludes
        current += excludes
        self._source_location_excludes = current
        return self

    def showCompilationProgressBar(self,
                                   show: bool = True) -> "poptorch.Options":
        """Choose whether a progress bar is displayed while the model is
        being compiled. (It is displayed by default.)

        :param show: True to show the progress bar, False to hide it.
        :return: This object, to allow calls to be chained.
        """
        setattr(self, "_show_compilation_progress_bar", show)
        return self

    def loadFromFile(self, filepath: str) -> "poptorch.Options":
        """Set options from a config file.

        Every line of the file sets a single option. A line is written the
        same way the option would be set in a Python script, minus the
        ``options.`` prefix: ``options.deviceIterations(1)`` becomes a line
        containing ``deviceIterations(1)``.

        The options are not reset to their defaults before the file is
        loaded, so this method can be called several times on the same
        `Options` object. For example, if ``c1.cfg`` contains the
        following::

            deviceIterations(32)
            replicationFactor(2)

        and ``c2.cfg`` contains the following::

            deviceIterations(4)

        then calling:

        .. code-block:: python

            options.loadFromFile('c1.cfg')
            options.loadFromFile('c2.cfg')

        is equivalent to calling:

        .. code-block:: python

            options.deviceIterations(4)
            options.replicationFactor(2)

        :param filepath: Path of the config file to load.
        :return: This object, to allow calls to be chained.
        """
        _options_config.parseAndSetOptions(self, filepath)
        return self

    def relaxOptimizerAttributesChecks(self, relax: bool = True
                                       ) -> "poptorch.Options":
        """Choose how unexpected attributes found by
        :py:func:`~poptorch.PoplarExecutor.setOptimizer()` are reported:
        as warnings or as debug messages.

        By default PopTorch prints warnings the first time it encounters
        unexpected attributes in
        :py:func:`~poptorch.PoplarExecutor.setOptimizer()`.

        :param relax:
            * True: Redirect warnings to the debug channel.
            * False: Print warnings about unexpected attributes (default
              behaviour).
        :return: This object, to allow calls to be chained.
        """
        # Only the python side reads this flag, so it is kept as a plain
        # attribute rather than in the OptionsDict.
        setattr(self, "_relax_optimizer_checks", relax)
        return self

    @property
    def TensorLocations(self) -> "poptorch.options._TensorLocationOptions":
        """Options related to tensor locations.

        .. seealso:: :py:class:`~poptorch.options._TensorLocationOptions`"""
        return self._tensor_locations

    @property
    def Distributed(self) -> "poptorch.options._DistributedOptions":
        """Options specific to running on multiple IPU server (IPU-POD).

        You should not use these when using PopRun/PopDist. Instead use
        ``popdist.poptorch.Options`` to set these values automatically.

        .. seealso:: :py:class:`~poptorch.options._DistributedOptions`"""
        return self._distributed

    @property
    def Jit(self) -> "poptorch.options._JitOptions":
        """Options specific to upstream PyTorch's JIT compiler.

        .. seealso:: :py:class:`~poptorch.options._JitOptions`"""
        return self._jit

    @property
    def Precision(self) -> "poptorch.options._PrecisionOptions":
        """Options specific to the processing of the JIT graph prior to lowering
        to PopART.

        .. seealso:: :py:class:`~poptorch.options._PrecisionOptions`"""
        return self._graphProcessing

    @property
    def Training(self) -> "poptorch.options._TrainingOptions":
        """Options specific to training.

        .. seealso:: :py:class:`~poptorch.options._TrainingOptions`"""
        return self._training

    @property
    def _Popart(self) -> "poptorch.options._PopartOptions":
        """Options specific to the PopART backend.
        (Advanced users only)."""
        return self._popart

    def autoRoundNumIPUs(self, auto_round_num_ipus: bool = True
                         ) -> "poptorch.Options":
        """Choose whether the number of IPUs used is rounded up
        automatically.

        The number of IPUs requested must be a power of 2. By default an
        error occurs if the model uses an unsupported number of IPUs, to
        prevent you unintentionally overbooking IPUs.

        :param auto_round_num_ipus:
            * True: round up the number of IPUs to a power of 2.
            * False: error if the number of IPUs is not supported.
        :return: This object, to allow calls to be chained.
        """
        rounding_enabled = auto_round_num_ipus
        self.set(auto_round_num_ipus=rounding_enabled)
        return self

    def deviceIterations(self, device_iterations: int) -> "poptorch.Options":
        """Set how many iterations the device runs over the data before
        returning to the user (default: 1).

        The effect is the same as running the IPU in a loop for the
        specified number of iterations with a new batch of data each time,
        but increasing ``deviceIterations`` is more efficient because the
        loop runs on the IPU directly.

        :param device_iterations: Number of iterations to run on the device.
        :return: This object, to allow calls to be chained.
        """
        iterations = device_iterations
        self.set(device_iterations=iterations)
        return self

    def setExecutionStrategy(
            self, strategy: Union["poptorch.PipelinedExecution",
                                  "poptorch.ShardedExecution",
                                  "poptorch.ParallelPhasedExecution",
                                  "poptorch.SerialPhasedExecution"]
    ) -> "poptorch.Options":
        """Set the execution strategy to use to partition the graph.

        :param strategy:
            Must be an instance of one of the execution strategy classes.
        :return: This object, to allow calls to be chained.
        :raises AssertionError: if ``strategy`` is not an execution strategy.

        .. seealso:: :py:class:`~poptorch.PipelinedExecution`,
            :py:class:`~poptorch.ShardedExecution`,
            :py:class:`~poptorch.ParallelPhasedExecution`,
            :py:class:`~poptorch.SerialPhasedExecution`.
        """
        # Every strategy class derives from the common strategy interface,
        # which is what is actually checked here.
        assert isinstance(strategy, _IExecutionStrategy)
        self._execution_strategy = strategy
        return self

    def setAvailableMemoryProportion(
            self, available_memory_proportion: Dict[str, float]):
        """Set how much temporary memory is made available, per IPU.

        This setting controls the amount of temporary memory available to
        operations such as:

        * convolution
        * matrix multiplication
        * embedding lookups
        * indexing operations

        The parameter should be a dictionary mapping IPU IDs to float values
        between 0 and 1. (for example, ``{"IPU0": 0.5}``)

        The floating point value has the same meaning and effect as
        documented in :py:func:`~poptorch.set_available_memory`.

        :return: This object, to allow calls to be chained.
        """

        def ipu_index(name):
            # Keys look like "IPU<index>": strip the prefix, keep the index.
            assert name.startswith("IPU"), (
                "Available memory proportions are expected"
                " to be in a dictionary of {\"IPU0\": 0.5}"
                " where the 0 in IPU is the index of the"
                " IPU. Invalid key: %s" % name)
            return int(name[3:])

        per_ipu = {
            ipu_index(name): proportion
            for name, proportion in available_memory_proportion.items()
        }
        self.createOrSet(available_memory_proportion=per_ipu)
        return self

    def replicationFactor(self, replication_factor: int) -> "poptorch.Options":
        """Set how many times the model is replicated (default: 1).

        Replicating the model increases its data throughput as PopTorch uses
        more IPUs: the number of IPUs used is scaled by
        ``replication_factor``. For example, a model using 1 IPU with a
        ``replication_factor`` of 2 will use 2 IPUs; a model using 4 IPUs
        with a replication factor of 4 will use 16 IPUs in total.

        :param replication_factor:
            Number of replicas of the model to create.
        :return: This object, to allow calls to be chained.
        """
        replicas = replication_factor
        self.set(replication_factor=replicas)
        return self

    def inputReplicaGrouping(self, input_group_size: int,
                             input_group_type: "poptorch.CommGroupType"
                             ) -> "poptorch.Options":
        """Split the input batches between groups of replicas, in a similar
        way to what :py:func:`~replicaGrouping` does for weight tensors.

        :param input_group_size:
            Number of replicas to place in each input replica group. Must be a
            factor of ``replication_factor``. Defaults to 1, which will divide
            the input evenly among all replicas.

        :param input_group_type:
            Arrangement type to use when placing replicas into input replica
            groups. Cannot be ``poptorch.CommGroupType.All``. Defaults to
            ``poptorch.CommGroupType.Consecutive``. For an explanation of the
            arrangement types, see :py:class:`~poptorch.CommGroupType` and
            :numref:`grouping_tensor_weights`.
        :raises ValueError: if the group type is ``All`` or if the group size
            is larger than, or not a factor of, the replication factor.
        """
        group_size, group_type = input_group_size, input_group_type
        # "No grouping" is stored as consecutive groups of a single replica.
        if group_type == enums.CommGroupType.NoGrouping:
            group_size, group_type = 1, enums.CommGroupType.Consecutive
        if group_type == enums.CommGroupType.All:
            raise ValueError('input_group_type cannot be All')

        replicas = self.replication_factor
        if replicas < group_size:
            raise ValueError('input_group_size cannot be larger than '
                             'replication_factor')
        if replicas % group_size != 0:
            raise ValueError('input_group_size must be a factor of '
                             'replication_factor')

        self.set(input_group_size=group_size)
        self.set(input_cgt=group_type)
        return self

    def broadcastBuffers(self, broadcast_buffers: bool = True):
        """Choose whether buffers are broadcast to all replicas.

        Only non-broadcast buffers are currently supported: each replica
        then holds a set of buffers which is not kept in sync with the other
        replicas' buffers. Set this option to `False` to enable
        non-broadcast buffers.

        :return: This object, to allow calls to be chained.
        """
        broadcast = broadcast_buffers
        self.set(broadcast_buffers=broadcast)
        return self

    def logDir(self, log_dir: str) -> "poptorch.Options":
        """Choose the directory used for log files.

        :param log_dir:
            Directory where PopTorch saves log files (default: current
            directory)
        :return: This object, to allow calls to be chained.
        """
        directory = log_dir
        self.set(log_dir=directory)
        return self

    def maxRepeatLogs(self, max_lines: Optional[int]) -> "poptorch.Options":
        """Limit how many times an often-repeated log line is logged.

        :param max_lines:
            If `None`, show all log messages. Otherwise suppress repeated
            messages after `max_lines` lines. The default is to suppress after
            4 lines.
        :return: This object, to allow calls to be chained.
        """
        if max_lines is None:
            # "No limit" is stored as the largest 64-bit unsigned value.
            limit = (1 << 64) - 1
        else:
            limit = max_lines
        self.set(max_repeat_logs=limit)
        return self

    def modelName(self, name: str) -> "poptorch.Options":
        """Choose the name of the model.

        :param name:
            Name of the model; defaults to "inference" or "training"
            depending on the type of model created. When profiling, it is
            used as the subdirectory of the report directory the profiling
            output goes to.
        :return: This object, to allow calls to be chained.
        """
        model_name = name
        self.createOrSet(model_name=model_name)
        return self

    def enableExecutableCaching(self, path: str) -> "poptorch.Options":
        """Use ``path`` as a cache Poplar executables are loaded from / saved
        to, in order to avoid recompiling identical graphs.

        :param path:
            File path for Poplar executable cache store; setting ``path`` to
            ``None`` disables executable caching.
        :return: This object, to allow calls to be chained.
        """
        caching_enabled = path is not None
        if caching_enabled:
            self._Popart.set("cachePath", path)
        self._Popart.set("enableEngineCaching", caching_enabled)
        return self

    def useIpuModel(self, use_model: bool) -> "poptorch.Options":
        """Choose between the IPU Model and physical hardware (default).

        The IPU model simulates the behaviour of IPU hardware but does not
        offer all the functionality of an IPU. Please see the Poplar and
        PopLibs User Guide for further information.

        This setting takes precedence over the ``POPTORCH_IPU_MODEL``
        environment variable.

        :param use_model:
            * True: Use the IPU Model.
            * False: Use IPU hardware.
        :return: This object, to allow calls to be chained.
        """
        model_requested = use_model
        self.createOrSet(use_model=model_requested)
        return self

    def connectionType(self, connection_type: "poptorch.ConnectionType"
                       ) -> "poptorch.Options":
        """Choose when to connect to the IPU (if at all).

        :param connection_type:
            * ``Always``: Attach to the IPU from the start (default).
            * ``OnDemand``: Wait until the compilation is complete and the
              executable is ready to be run to attach to the IPU.
            * ``Never``: Never try to attach to an IPU: this is useful for
              offline compilation, but trying to run an executable will raise
              an exception.
        :return: This object, to allow calls to be chained.

        For example:

        >>> opts = poptorch.Options()
        >>> opts.connectionType(poptorch.ConnectionType.OnDemand)
        """
        assert isinstance(connection_type, enums.ConnectionType)
        # The underlying value of the enum is what gets stored.
        raw_value = connection_type.value
        self.set(connection_type=raw_value)
        return self

    def syncPattern(self, sync_pattern: "poptorch.SyncPattern"
                    ) -> "poptorch.Options":
        """Choose how IPUs synchronise in multi-IPU systems.

        This option makes it possible for subsets of IPUs to overlap their
        work: for example, one set of IPUs could be communicating with the
        host while other IPUs are processing data.

        It is typically used together with replicated execution, in which
        case it takes effect on a per-replica basis. If replication is not
        used, it will apply to all IPUs.

        :param sync_pattern:
            * ``Full``: Require all IPUs to synchronise on every communication
              between IPUs or between IPUs and host. This is the default.
            * ``SinglePipeline``: Allow IPUs to synchronise with the host
              independently, without having to synchronise with each other.
              This permits any one IPU to perform host IO while other IPUs are
              processing data.
            * ``ReplicaAndLadder``: Allow an IPU group to communicate with the
              host without requiring synchronisation between groups. This
              permits multiple IPU groups to alternate between performing host
              IO and computation.
        :return: This object, to allow calls to be chained.
        """
        assert isinstance(sync_pattern, enums.SyncPattern)
        # The underlying value of the enum is what gets stored.
        raw_value = sync_pattern.value
        self.set(sync_pattern=raw_value)
        return self

    def useIpuId(self, ipu_id: int) -> "poptorch.Options":
        """ Run on the IPU device identified by the given ID (as provided by
        `gc-info <https://docs.graphcore.ai/projects/command-line-tools/en/latest/gc-info_main.html>`__).

        A device ID may refer to a single IPU or to a group of IPUs (a
        multi-IPU device). The number of IPUs associated with the ID must be
        equal to the number of IPUs used by your annotated model multiplied
        by the replication factor.

        For example, a model using 1 IPU with a replication factor of 2 needs
        a device ID with 2 IPUs; a model pipelined across 4 IPUs with a
        replication factor of 4 needs a device ID which represents a
        multi-IPU device of 16 IPUs.

        The command-line tool `gc-info` can be used to find the IDs: running
        `gc-info -l` shows each device ID and a list of IPUs associated with
        the ID.

        :param int ipu_id: IPU device ID of a single-IPU or multi-IPU device
        :return: This object, to allow calls to be chained.
        """
        assert isinstance(ipu_id, int)
        device_id = ipu_id
        self.createOrSet(ipu_id=device_id)
        return self

    def useOfflineIpuTarget(self, ipu_version: int = 2) -> "poptorch.Options":
        """Create an offline IPU target, usable for offline compilation only.

        The connection type is switched to ``ConnectionType.Never`` as part
        of this call.

        .. note:: the offline IPU target cannot be used if the IPU model is
            enabled.

        :param ipu_version: IPU version to target (1 for Mk1, 2 for Mk2,
                            21 for Mk2 with FP8 support). Default: 2.
        :return: This object, to allow calls to be chained.
        """
        never_connect = enums.ConnectionType.Never
        self.connectionType(never_connect)
        self.createOrSet(ipu_version=ipu_version)
        return self

    def anchorTensor(self,
                     short_name: str,
                     long_name: str,
                     output_mode: Optional["poptorch.OutputMode"] = None,
                     output_return_period: Optional[int] = 1
                     ) -> "poptorch.Options":
        """Anchor a tensor such that it may be retrieved after a model run.

        :param str short_name: User defined name to be used for retrieval
        :param str long_name: The PopART name of the tensor to be anchored
        :param poptorch.OutputMode output_mode: Specifies when data should
          be returned. Default to None, in which case the tensor will use
          the same output mode used for model outputs.
        :param int output_return_period: Return period if output mode is
          ``EveryN``. Defaults to 1.
        :return: This object, to allow calls to be chained like the other
          option setters.
        """

        # The return period is only meaningful for ``EveryN``: any other
        # mode (including None) always records a period of 1.
        if output_mode != enums.OutputMode.EveryN:
            output_return_period = 1

        # Entry layout: [PopART name, whether the model's output mode is
        # used, output mode, return period].
        self.anchored_tensors[short_name] = [
            long_name, output_mode is None, output_mode, output_return_period
        ]
        return self

    def outputMode(self,
                   output_mode: "poptorch.OutputMode",
                   output_return_period: Optional[int] = None
                   ) -> "poptorch.Options":
        """ Choose which data a model returns.

        :param poptorch.OutputMode output_mode:
            * ``All``: Return a result for each batch.
            * ``Sum``: Return the sum of all the batches.
            * ``Final``: Return the last batch.
            * ``EveryN``: Return every N batches: N is passed in
              as ``output_return_period``.
            * Default: `All` for inference, `Final` for training.
        :return: This object, to allow calls to be chained.

        For example:

        >>> opts = poptorch.Options()
        >>> opts.outputMode(poptorch.OutputMode.All)
        ... # or
        >>> opts.outputMode(poptorch.OutputMode.EveryN, 10)
        """
        assert isinstance(output_mode, enums.OutputMode)

        # Check the anchor return period makes sense.
        every_n = output_mode == enums.OutputMode.EveryN
        if not every_n:
            if output_return_period:
                logger.info(
                    "Anchor return period argument ignored with output_mode"
                    " set to %s", output_mode)
        else:
            assert output_return_period and output_return_period > 0, (
                "EveryN"
                " anchor must have output_return_period set to valid"
                " positive integer")

        # A missing (or zero) period is stored as 1.
        period = output_return_period or 1
        self.set(output_mode=output_mode.value, output_return_period=period)
        return self

    def defaultOutputMode(self) -> bool:
        """Tell whether the output mode is still the default one.

        :return:
            * True: :py:func:`~poptorch.Options.outputMode` is currently set to
                default.
            * False: :py:func:`~poptorch.Options.outputMode` is not set to
                default.
        """
        current_mode = self.output_mode
        return current_mode == enums.OutputMode.Default

    def randomSeed(self, random_seed: int) -> "poptorch.Options":
        """Seed the random number generator used on the IPU.

        The same seed is also passed to ``torch.manual_seed``.

        :param random_seed:
            Random seed integer.
        :return: This object, to allow calls to be chained.
        """
        assert isinstance(random_seed, int)
        seed = random_seed
        torch.manual_seed(seed)
        self.createOrSet(random_seed=seed)
        return self

    def enableStableNorm(self, enabled: bool) -> "poptorch.Options":
        """Choose whether a stable version of norm operators is used.

        The stable version is slower, but more accurate than its unstable
        counterpart.

        :param enabled:
            * True: Use stable norm calculation.
            * False: Do not use stable norm calculation.
        :return: This object, to allow calls to be chained.
        """
        popart_options = self._Popart
        popart_options.set("enableStableNorm", enabled)
        return self

    def enableSyntheticData(self, enabled: bool) -> "poptorch.Options":
        """Switch between real host I/O and synthetic on-device data.

        With synthetic data, host I/O is disabled and the inputs are
        generated on the IPU instead, which lets a model be benchmarked
        as if I/O were perfect.

        :param enabled:
            * True: Use data generated from a random normal distribution
              on the IPU. Host I/O is disabled.
            * False: Host I/O is enabled and real data is used.
        :return: This options object, so calls can be chained.
        """
        # Values of popart.SyntheticDataMode:
        #   0 = Off, 1 = Zeros, 2 = RandomNormal
        synthetic_off = 0
        synthetic_random_normal = 2
        if enabled:
            selected_mode = synthetic_random_normal
        else:
            selected_mode = synthetic_off
        self._Popart.set("syntheticDataMode", selected_mode)
        return self

    def logCycleCount(self, log_cycle_count: bool) -> "poptorch.Options":
        """Turn logging of the main graph's IPU cycle count on or off.

        When enabled, the cycle count is printed if the environment
        variable ``POPTORCH_LOG_LEVEL=DEBUG`` is set.
        This option requires IPU hardware to run.

        Note: This will have a small detrimental impact on performance.

        :param log_cycle_count:
            * True: Enable logging the IPU cycle count.
            * False: Do not enable IPU cycle count logging.
        :return: This options object, so calls can be chained.
        """
        popart_options = self._Popart
        popart_options.set("instrumentWithHardwareCycleCounter",
                           log_cycle_count)
        return self

    def enableProfiling(self, profile_dir: Optional[str] = None
                        ) -> "poptorch.Options":
        """Turn on generation of a profiling report.

        To also generate the debug information associated with the
        profiling data, specify ``autoReport.directory`` together with
        either ``autoReport.all`` or ``autoReport.outputDebugInfo`` in the
        ``POPLAR_ENGINE_OPTIONS`` environment variable, for example:

        .. code-block:: bash

            POPLAR_ENGINE_OPTIONS={"autoReport.directory":"/profile/output",\\
            "autoReport.all":"true"}

        or:

        .. code-block:: bash

            POPLAR_ENGINE_OPTIONS={"autoReport.directory":"/profile/output",\\
            "autoReport.outputDebugInfo":"true"}

        Debug information and the rest of the profiling data will then be
        stored in the ``/profile/output`` directory. Values specified in the
        environment variable take precedence over ``profile_dir`` when both
        are given.

        :param str profile_dir: path to directory where report will be created.
            Defaults to current directory.
        :return: This options object, so calls can be chained.
        """
        # Engine option keys that this method sets; if any of them already
        # appears in the environment variable a warning is logged.
        managed_keys = ('debug.allowOutOfMemory', 'autoReport.directory',
                        'autoReport.all')
        from_environment = os.getenv('POPLAR_ENGINE_OPTIONS', default='')

        if any(key in from_environment for key in managed_keys):
            logger.warning(
                'Profiling setting overridden by environment variable. '
                'Check content of POPLAR_ENGINE_OPTIONS.')

        # Start from the engine options already recorded (if any) and add
        # the profiling ones on top.
        engine_options = self._popart.options.get('engineOptions', {})
        engine_options.update({
            'debug.allowOutOfMemory': 'true',
            'autoReport.directory': profile_dir or '.',
            'autoReport.all': 'true',
        })
        self._popart.setEngineOptions(engine_options)
        return self

    def disableModuleNamescope(self) -> "poptorch.Options":
        """Turn off the per-operator name scope derived from the module.

        Name scoping is enabled by default. When enabled, the operator
        name scope is based on the names appearing in the
        ``named_modules`` function from ``torch.nn.Module``.

        For example:

        >>> class Model(torch.nn.Module):
        >>>     def __init__(self, num_groups, num_channels):
        >>>         super().__init__()
        >>>         self.gn = torch.nn.GroupNorm(num_groups, num_channels)
        >>>     def forward(self, x):
        >>>         return self.gn(x)

        With namescope enabled the name will be gn/GroupNormalization,
        with it disabled the name will be GroupNormalization.

        :return: This options object, so calls can be chained.
        """
        namescope_enabled = False
        self._module_namescope_enabled = namescope_enabled
        return self

    def updatableNamedBuffers(self, buffers: List[str]) -> "poptorch.Options":
        """Declare which named buffers of the model may be updated with
        a call to buffersFromHost().

        This makes it possible to update only a subset of the model weights
        rather than all of them, as happens with a weightsFromHost() call.

        :param buffers: Names of the buffers that can be updated.
        :return: This options object, so calls can be chained.
        """
        popart_options = self._Popart
        popart_options.set('updatableNamedBuffers', buffers)
        return self

    def toDict(self) -> Dict[str, Any]:
        """ Flatten every group of options, except for the JIT and
        Precision ones, into one dictionary that can be serialised and
        handed to the C++ backend.

        Any warnings that depend on the options set (for example a default
        option changing) are printed at this stage.

        :meta private:
        """
        assert not self.defaultOutputMode(
        ), "An output mode must be picked before serialisation"

        # Seed the result with the execution strategy's backend options and
        # the raw PopART options.
        merged = self._execution_strategy.backendOptions()
        merged.update(self._popart.options)

        # Each of these option groups folds its own values into the
        # dictionary and hands back the result.
        contributors = (self, self._training, self._distributed,
                        self._tensor_locations)
        for group in contributors:
            merged = group.update(merged)

        if self._show_compilation_progress_bar:
            merged["compilation_progress_bar_fn"] = self._progress_bar

        return merged

    def clone(self) -> "poptorch.Options":
        """Return an unfrozen deep copy of these options.

        The copy is produced with ``copy.deepcopy``.
        """
        duplicate = copy.deepcopy(self)
        return duplicate

    def __repr__(self):
        """Repr which lists the stored values followed by the "properties"
        of the class.

        Stored values are rendered with ``repr`` so that strings display
        with quotes; properties are rendered with ``str``. All entries are
        joined with a single ", " so that no dangling separator is emitted
        when there are no stored values or no properties.
        """
        cls = type(self)
        property_names = [
            p for p in dir(cls) if isinstance(getattr(cls, p), property)
        ]
        entries = [f"{k}={v!r}" for k, v in self._values.items()]
        entries.extend(f"{prop}={getattr(self, prop)}"
                       for prop in property_names)
        return f"{cls.__name__}({', '.join(entries)})"


================================================
FILE: python/poptorch.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/passes/lower_graph.h>
#include <torch/csrc/jit/python/pybind_utils.h>

#include <ATen/ATen.h>

#include <pybind11/functional.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/stl_bind.h>

#include <algorithm>
#include <iterator>
#include <limits>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

#include "poptorch_err/ExceptionHandling.hpp"
#include "poptorch_err/ExceptionInfo.hpp"

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/LoggingLight.hpp"
#include "poptorch_logging/Tracepoint.hpp"

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/LowerToPopart.hpp"
#include "poptorch/LowerToPopartFactories.hpp"
#include "poptorch/SessionOptionsParser.hpp"
#include "poptorch/Utils.hpp"

#include "popart_compiler/CodeletsCompilation.hpp"
#include "popart_compiler/Compiler.hpp"
#include "popart_compiler/Utils.hpp"

#include "pytorch_bridge/CompilerOptions.hpp"

// All the functions here are called directly from python, therefore it's ok for
// us to catch all exceptions and convert them to PoptorchError.
//
// PTC(f) expands to the static `wrap` member of the PoptorchCatchWrapperImpl
// specialisation built for the function `f`, with poptorch::throwPoptorchError
// as the first template argument and catch_all set to true.
#define PTC(f)                                                                 \
  PoptorchCatchWrapperImpl<poptorch::throwPoptorchError, /*catch_all=*/true,   \
                           decltype(&(f)), f>::wrap

namespace poptorch {
namespace {

// Everything in this namespace is a workaround because
// torch::jit::toTraceableStack() is broken: torch::jit::as_module() fails to
// initialise its static local variable ScriptModule and segfaults as a
// result.
namespace jit {

using namespace torch::jit;

// Recursively deduce the TorchScript type of a python value.
// Handles tensors, None, tuples, dictionaries and lists; dictionaries and
// lists must be non-empty and their entries must unify to a single type.
// Any other python value raises an error.
TypePtr inferType(py::handle input) {
  PyObject *const raw = input.ptr();

  // Try tensor types
  if (THPVariable_Check(raw)) {
    return TensorType::get();
  }

  if (input.is(py::none())) {
    return NoneType::get();
  }

  if (six::isTuple(input)) {
    // A tuple's type is the ordered list of its members' types.
    const auto as_tuple = py::cast<py::tuple>(input);
    std::vector<TypePtr> member_types;
    member_types.reserve(as_tuple.size());

    for (py::handle member : as_tuple) {
      member_types.push_back(inferType(member));
    }
    return TupleType::create(member_types);
  }

  if (PyDict_Check(raw)) {
    // Check to make sure we can generate useful input/output types
    auto as_dict = py::cast<py::dict>(input);
    const size_t num_entries = py::len(as_dict);
    ERROR_ON_MSG(num_entries == 0, "Dictionary inputs must have entries");
    TypePtr key_type = nullptr;
    TypePtr value_type = nullptr;

    for (auto item : as_dict) {
      // Try to infer the key type and unify it with the existing one
      auto this_key_type = inferType(item.first);
      auto merged_key = unifyOrInitializeType(key_type, this_key_type);
      ERROR_ON_MSG(!merged_key,
                   c10::str("Dictionary inputs to traced functions must have "
                            "consistent type. Found ",
                            key_type->repr_str(), " and ",
                            this_key_type->repr_str()));

      // Try to infer the value type and unify it with the existing one
      auto this_value_type = inferType(item.second);
      auto merged_value = unifyOrInitializeType(value_type, this_value_type);
      ERROR_ON_MSG(!merged_value,
                   c10::str("Dictionary inputs to traced functions must have "
                            "consistent type. Found ",
                            value_type->repr_str(), " and ",
                            this_value_type->repr_str()));

      key_type = *merged_key;
      value_type = *merged_value;
    }
    return DictType::create(key_type, value_type);
  }

  if (PyList_Check(raw)) {
    auto as_list = py::cast<py::list>(input);
    const size_t num_elements = py::len(as_list);
    ERROR_ON_MSG(num_elements == 0, "List trace inputs must have elements");

    // All the elements must unify to one common element type.
    TypePtr element_type = nullptr;
    for (auto item : as_list) {
      auto item_type = inferType(item);
      auto merged_type = unifyOrInitializeType(element_type, item_type);
      ERROR_ON_MSG(!merged_type,
                   c10::str("List inputs to traced functions must have "
                            "consistent element type. Found ",
                            element_type->repr_str(), " and ",
                            item_type->repr_str()));
      element_type = *merged_type;
    }
    return ListType::create(element_type);
  }

  ERROR("Only nested lists and tuples of tensors are supported");
}

// Cut down version of torch::jit::toTraceableStack which only supports the
// python values handled by inferType() above.
// The tuple of inputs is converted to an IValue of the inferred type and its
// elements are returned as a Stack.
Stack toTraceableStack(const py::tuple &inputs) {
  const TypePtr inputs_type = inferType(inputs);
  const IValue packed_inputs = toIValue(inputs, inputs_type);
  return packed_inputs.toTupleRef().elements().vec();
}

} // namespace jit

// Scope guard: invokes the wrapped callable when the guard is destroyed.
// The guard is neither copyable nor movable so the callable is guaranteed to
// run exactly once per constructed guard (a copy would otherwise run it a
// second time). It can still be initialised from a temporary, e.g.
// `auto guard = CallOnExit([] { ... });`.
template <typename Func> class CallOnExit : Func {
public:
  explicit CallOnExit(Func f) : Func(std::move(f)) {}
  CallOnExit(const CallOnExit &) = delete;
  CallOnExit &operator=(const CallOnExit &) = delete;
  ~CallOnExit() { std::invoke(*static_cast<Func *>(this)); }
};

// Keep a static map to gather up all the cpu calls.
// Keyed by the callback ID string; filled in by registerCPUCallBack() and
// cleared by lowerToPopartFromDispatch() once it returns.
CPUCallbackMap callbacks;

// Returns true if a CPU callback is already stored under the given ID.
bool alreadyRegistered(const std::string &ID) {
  const bool known = callbacks.count(ID) != 0;
  return known;
}

// Record the data pointers of the input and output tensors in the metadata
// of the callback registered under ID.
// Raises an error if no callback has been registered under that ID.
void registerBuffersWithCallback(
    const std::string &ID,
    std::vector<at::Tensor> &input_tensors, // NOLINT
    std::vector<at::Tensor> &output_tensors // NOLINT
) {
  const auto entry = callbacks.find(ID);

  ERROR_ON_MSG(entry == callbacks.end(), "Callback has not been registered.");

  auto &callback_metadata = entry->second;

  // Track the input tensors. Our python creates a persistent storage location
  // for the inputs and outputs.
  for (at::Tensor &input : input_tensors) {
    callback_metadata.input_pointers.push_back(input.data_ptr());
  }

  // Same for the outputs, except that each one is replaced by its contiguous
  // version before its pointer is recorded.
  // NOTE(review): the inputs are not made contiguous - confirm this asymmetry
  // is intended.
  for (at::Tensor &output : output_tensors) {
    output = output.contiguous();
    callback_metadata.output_pointers.push_back(output.data_ptr());
  }
}

// Python interface to map a given CPU op with the IR calls.
// Creates the metadata entry for ID (if there isn't one yet) and stores two
// closures in it which call back into the python object.
void registerCPUCallBack(const py::object &obj, const std::string &ID) {
  // Map the string identifier to the metadata.
  const auto [slot, is_new] = callbacks.try_emplace(ID);

  // Nothing to do if a callback was already added for this function.
  if (!is_new) {
    return;
  }

  // Structure to store the information given by python to be forwarded to the
  // backend.
  auto &callback_metadata = slot->second;

  // Wrap that in a lambda so we don't have to expose the naked pytorch function
  // pointer thing. The user call is wrapped in a python method called
  // "execute".
  callback_metadata.the_callback = [obj]() { obj.attr("execute")(); };

  callback_metadata.buffer_registration_callback = [obj]() {
    obj.attr("registerPersistentData")();
  };
}

// Run the buffer registration callback of every registered CPU callback.
void initCallbackBuffers() {
  for (auto &[id, callback_metadata] : callbacks) {
    callback_metadata.buffer_registration_callback();
  }
}

// IPyValue implementation backed by a pybind11 handle: lets code which only
// knows about the IPyValue interface query the type of a python value and
// convert it to C++ values.
class PybindValue : public IPyValue {
public:
  // Constructor used for py::object (and derived types): a copy of the object
  // is stored so that a reference to the python value is kept alive.
  template <typename T,
            std::enable_if_t<std::is_base_of<py::object, T>::value, int> = 0>
  explicit PybindValue(T obj) {
    _maybe_obj = obj;
    _value = _maybe_obj;
  }

  // Constructor used for everything else (for example a plain py::handle):
  // only the handle is stored, no reference is kept.
  template <typename T,
            std::enable_if_t<!std::is_base_of<py::object, T>::value, int> = 0>
  explicit PybindValue(T handle) : _value(handle) {}

  // Wrap the python callable in a std::function taking two ints. The GIL is
  // acquired for the duration of each call.
  std::function<void(int, int)> toFunction() const override {
    py::function py_func = _value.cast<py::function>();
    return [py_func](int x, int y) {
      py::gil_scoped_acquire acquire;
      py_func(x, y);
    };
  }

  bool isBoolean() const override { return py::isinstance<py::bool_>(_value); }

  bool toBoolean() const override { return _value.cast<bool>(); }

  bool isDouble() const override {
    // Python's float type is actually double
    // precision.
    return py::isinstance<py::float_>(_value);
  }

  double toDouble() const override { return _value.cast<double>(); }

  bool isInt() const override { return py::isinstance<py::int_>(_value); }

  std::uint64_t toUInt64() const override {
    return _value.cast<std::uint64_t>();
  }

  std::int64_t toInt64() const override { return _value.cast<std::int64_t>(); }

  bool isString() const override { return py::isinstance<py::str>(_value); }

  // Strings are returned as they are, integers are converted to their decimal
  // representation (going through std::uint64_t). Any other type is an error.
  std::string toString() const override {
    if (isString()) {
      return _value.cast<std::string>();
    }
    if (isInt()) {
      return std::to_string(_value.cast<std::uint64_t>());
    }
    ERROR("Don't know how to convert type " << _value.get_type()
                                            << " to string");
  }

  bool isSetListOrTuple() const override {
    return py::isinstance<py::set>(_value) ||
           py::isinstance<py::list>(_value) ||
           py::isinstance<py::tuple>(_value);
  }

  // Call fn on each element of the value, which is cast to a py::list first.
  void forEachInList(std::function<void(const IPyValue &)> fn) const override {
    for (auto option : _value.cast<py::list>()) {
      fn(PybindValue(option));
    }
  }

  bool isDict() const override { return py::isinstance<py::dict>(_value); }

  // Call fn(key, value) on each entry of the value, which is cast to a
  // py::dict first.
  void forEachInDict(std::function<void(const IPyValue &, const IPyValue &)> fn)
      const override {
    for (auto option : _value.cast<py::dict>()) {
      fn(PybindValue(option.first), PybindValue(option.second));
    }
  }

  // Returns the dictionary entry stored under key, or nullptr if the
  // dictionary doesn't contain that key.
  std::unique_ptr<IPyValue> getFromDict(const std::string &key) const override {
    auto dict = _value.cast<py::dict>();
    if (!dict.contains(key)) {
      return nullptr;
    }
    return std::make_unique<PybindValue>(dict[key.c_str()]);
  }
  std::uint64_t getListSize() const override {
    return _value.cast<py::list>().size();
  }
  // Returns the list element at the given index, or nullptr if the index is
  // past the end of the list.
  std::unique_ptr<IPyValue>
  getFromList(const std::uint64_t index) const override {
    auto list = _value.cast<py::list>();
    if (index >= list.size()) {
      return nullptr;
    }
    return std::make_unique<PybindValue>(list[index]);
  }

  // String representation of the python type of the value.
  std::string type() const override { return py::str(_value.get_type()); }

private:
  // pybind11 handles do not keep a reference to the python object so it might
  // disappear if the parent doesn't hold a reference to it, so just to be safe
  // keep a reference if possible.
  py::object _maybe_obj;
  // Handle used by all the accessors above.
  py::handle _value;
};

// Fetch the entry stored under `key` in the optimizer dictionary `d` and
// convert it to T. Raises an error if the key is missing.
template <typename T>
T getOptimizerValue(const py::dict &d, const std::string &key) {
  const bool has_key = d.contains(key);
  ERROR_ON_MSG(!has_key, "Missing optimizer value for '"
                             << key << "' in "
                             << py::str(d.cast<py::object>()));
  auto entry = d[key.c_str()];
  return entry.template cast<T>();
}

// Same as above but stores the result in `value`, which lets T be deduced
// from the destination variable.
template <typename T>
void getOptimizerValue(T &value, const py::dict &d, const std::string &key) {
  T fetched = getOptimizerValue<T>(d, key);
  value = fetched;
}

// Copy every (name -> (value, is_const)) entry of the python dictionary `in`
// into out->parameters, which is resized to hold exactly one element per
// entry. Raises an error if a name doesn't fit in the fixed size name buffer.
void copyParametersDict(popart_compiler::Optimizer *out, const py::dict &in) {
  logging::LogContext ctx_func("copyParametersDict");
  out->parameters.resize(in.size());
  std::uint64_t next_slot = 0;
  for (auto field : in) {
    auto &slot = out->parameters[next_slot++];

    const auto field_name = field.first.cast<std::string>();
    logging::LogContext ctx("attr: " + field_name);
    const auto value_and_const = field.second.cast<std::pair<float, bool>>();

    // The name (plus its terminating NUL) must fit in the destination.
    ERROR_ON(field_name.size() >= sizeof(slot.name));
    // We need to use a C-style string here to avoid ABI issues.
    snprintf(reinterpret_cast<char *>(slot.name), sizeof(slot.name), "%s",
             field_name.c_str());
    slot.value = value_and_const.first;
    slot.is_const = value_and_const.second;
  }
}

// Process the user provided dictionary and extract the relevant optimizer
// information.
// Returns an empty vector for an empty dictionary; otherwise one Optimizer
// for the defaults (index 0) followed by one per parameter group.
std::vector<popart_compiler::Optimizer> parseOptimizers(const py::dict &opt) {
  using popart_compiler::OptimizerType;

  if (opt.empty()) {
    return {};
  }

  const auto type = static_cast<OptimizerType>(
      getOptimizerValue<std::uint64_t>(opt, "optimizer_type"));
  const auto defaults = getOptimizerValue<py::dict>(opt, "defaults");
  const auto groups = getOptimizerValue<py::list>(opt, "groups");
  // One Optimizer per parameter group + 1 for defaults
  const std::uint64_t num_optimizers = groups.size() + 1;

  // Note: all the group variables and optimizer variables are
  // automatically forwarded to the Compiler backend however
  // the optimizer attributes are extracted here.
  bool use_tf_variant = false;
  const bool is_rmsprop = type == OptimizerType::RMSPROP ||
                          type == OptimizerType::RMSPROP_CENTERED;
  if (is_rmsprop) {
    getOptimizerValue(use_tf_variant, opt, "useTfVariant");
  }

  // No gradient clipping unless the dictionary says otherwise.
  float max_grad_norm = std::numeric_limits<float>::infinity();
  if (opt.contains("maxGradNorm")) {
    getOptimizerValue(max_grad_norm, opt, "maxGradNorm");
  }

  std::vector<popart_compiler::Optimizer> optimizers;
  if (opt.contains("accumType")) {
    bool accum_type = false;
    bool first_order_momentum_accum_type = false;
    bool second_order_momentum_accum_type = false;

    // Indicate whether the optimizer should use float16 types
    getOptimizerValue(accum_type, opt, "accumType");

    const bool is_sgd =
        type == OptimizerType::SGD1 || type == OptimizerType::SGD2;
    if (is_sgd) {
      getOptimizerValue(first_order_momentum_accum_type, opt,
                        "velocityAccumType");
    } else {
      getOptimizerValue(first_order_momentum_accum_type, opt,
                        "firstOrderMomentumAccumType");
      getOptimizerValue(second_order_momentum_accum_type, opt,
                        "secondOrderMomentumAccumType");
    }
    for (std::uint64_t n = 0; n < num_optimizers; ++n) {
      optimizers.emplace_back(type, accum_type, first_order_momentum_accum_type,
                              second_order_momentum_accum_type, use_tf_variant,
                              max_grad_norm);
    }
  } else {
    for (std::uint64_t n = 0; n < num_optimizers; ++n) {
      optimizers.emplace_back(type, use_tf_variant, max_grad_norm);
    }
  }

  // Index 0 receives the 'defaults' attributes...
  copyParametersDict(optimizers.data(), defaults);
  // ... and each group's attributes are copied to the following indices.
  std::uint64_t group_index = 1;
  for (auto group_attr : groups) {
    copyParametersDict(&optimizers[group_index], group_attr.cast<py::dict>());
    ++group_index;
  }
  return optimizers;
}

// Build a map from each parameter name to the data pointer of the tensor
// found at the same position in `tensors`.
// Raises an error if the two tuples don't have the same size.
std::map<std::string, void *>
getParameterBuffers(const pybind11::tuple &names,
                    const pybind11::tuple &tensors) {
  const auto num_parameters = names.size();
  ERROR_ON(num_parameters != tensors.size());
  std::map<std::string, void *> buffers;
  torch::jit::Stack stack = jit::toTraceableStack(tensors);
  for (std::uint64_t idx = 0; idx < num_parameters; ++idx) {
    // If a name appears more than once the first pointer is the one kept.
    buffers.emplace(names[idx].cast<std::string>(),
                    stack[idx].toTensor().data_ptr());
  }
  return buffers;
}

// We have three sets of tensors.
// 1. Tensors in the graph from jit::trace.
// 2. Tensors in the original user model.
// 3. Tensors in the graph from jit::trace which lowerGraph has removed unused
// tensors from. We remap them by mapping the indices of 1. to the tensors of 3.
// and then creating a new vector using 3 with that map as a guide to tell us
// which tensors have been culled.
//
// The returned vector has one slot per element of traced_tensors: slot i holds
// the model parameter whose name maps (through python_tensors) to a tensor
// sharing its data pointer with traced_tensors[i]. Slots without a match are
// left as default constructed tensors.
std::vector<at::Tensor>
remapTensors(const pybind11::dict &python_tensors,
             const pybind11::dict &model_parameters,
             const std::vector<at::Tensor> &traced_tensors) {
  // Map the data pointers actually in use to their index in traced_tensors.
  std::unordered_map<void *, std::size_t> tensor_pointers;

  for (std::size_t i = 0; i < traced_tensors.size(); ++i) {
    tensor_pointers.insert({traced_tensors[i].data_ptr(), i});
  }

  std::vector<at::Tensor> returnee;
  returnee.resize(traced_tensors.size());

  for (auto element : model_parameters) {
    // Get the tensor from the python trace which has the same name as this
    // model parameter.
    auto dict_itr = python_tensors[element.first];
    at::Tensor traced_tensor = dict_itr.cast<at::Tensor>();

    auto itr = tensor_pointers.find(traced_tensor.data_ptr());
    if (itr != tensor_pointers.end()) {
      // Still in use: put the model's tensor at the traced tensor's index.
      at::Tensor tensor = element.second.cast<at::Tensor>();
      returnee[itr->second] = tensor;
    }
  }

  return returnee;
}

// python_tensors maps the names of the parameters from the python trace to
// their tensors, and traced_tensors is a subset of those tensors (the unused
// parameters have been removed). So we build a map[tensor pointer] = index
// from traced_tensors which we then use to build the list of the names of the
// parameters in traced_tensors.
//
// The returned vector has one slot per element of traced_tensors; a slot is
// left empty if no tensor of python_tensors shares its data pointer.
std::vector<std::string>
getParameterNames(const pybind11::dict &python_tensors,
                  const std::vector<at::Tensor> &traced_tensors) {
  // Map the data pointers actually in use to their index in traced_tensors.
  std::unordered_map<void *, std::size_t> tensor_pointers;

  for (std::size_t i = 0; i < traced_tensors.size(); ++i) {
    tensor_pointers.insert({traced_tensors[i].data_ptr(), i});
  }

  // Get the names of each tensor which hasn't been removed as unused.
  // The indices stored in the map are positions in traced_tensors, so the
  // vector must be sized from traced_tensors: if two traced tensors share a
  // data pointer the map has fewer entries than there are positions, and
  // sizing from the map would let the writes below go out of bounds.
  std::vector<std::string> names;
  names.resize(traced_tensors.size());

  // Convert the python names to C++ strings.
  for (auto element : python_tensors) {
    at::Tensor tensor = element.second.cast<at::Tensor>();

    auto itr = tensor_pointers.find(tensor.data_ptr());

    if (itr != tensor_pointers.end()) {
      names[itr->second] = element.first.cast<std::string>();
    }
  }

  return names;
}

// Convert a python list of anchors (each one itself a list) into an
// AnchorList. Elements 0, 2 and 3 of each anchor are passed to the Anchor
// constructor; element 1 is not read here.
// NOTE(review): presumably element 2 is the output mode and element 3 the
// return period - confirm against the python caller.
AnchorList parseAnchors(const py::list &list) {
  AnchorList parsed;
  for (auto entry : list) {
    const auto fields = entry.cast<py::list>();
    auto anchor_name = fields[0].cast<std::string>();
    const auto third_field = fields[2].cast<std::uint64_t>();
    const auto fourth_field = fields[3].cast<std::uint64_t>();
    parsed.push_back(Anchor(std::move(anchor_name), third_field, fourth_field));
  }
  return parsed;
}

// Run the session options parser over the python dictionary and discard the
// result: the parser is a temporary destroyed at the end of the statement.
// NOTE(review): presumably used to validate the options (errors raised by
// the parser propagate to the caller) - confirm against the callers.
void parseSessionOptionsVoid(const py::dict &opts) {
  SessionOptionsParser{PybindValue(opts)};
}

// Flatten `value` into `tensors`: a tensor is appended as it is, tuples and
// lists are traversed recursively. Any other kind of value is an error.
// If allow_tensor_only is set then the top level value is not allowed to be
// a tuple or a list (the flag is not forwarded to the recursive calls).
void buildTensorList(const torch::jit::IValue &value,
                     std::vector<at::Tensor> *tensors,
                     bool allow_tensor_only = false) {
  if (value.isTensor()) {
    tensors->push_back(value.toTensor());
    return;
  }

  if (value.isTuple()) {
    ERROR_ON(allow_tensor_only);
    for (auto &member : value.toTuple()->elements()) {
      buildTensorList(member, tensors);
    }
    return;
  }

  if (value.isList()) {
    ERROR_ON(allow_tensor_only);
    for (const auto member : value.toList()) {
      buildTensorList(member, tensors);
    }
    return;
  }

  ERROR("Unsupported value " << value.tagKind());
}

// Reinterpret the python object behind `h` as a pybind11 instance and return
// the value pointer of its first value/holder entry as a script Module.
// No type check is performed here.
// NOTE(review): assumes `h` really wraps a torch::jit::script::Module bound
// through pybind11 - callers must guarantee this.
torch::jit::script::Module *asModule(py::handle h) {
  return reinterpret_cast<torch::jit::script::Module *>(
      pybind11::detail::values_and_holders(
          reinterpret_cast<pybind11::detail::instance *>(h.ptr()))
          .begin()
          ->value_ptr());
}

// Raise an error, which includes the offending shape, for the first tensor
// which has at least one dimension of size 0.
void identifyZeroSizedTensors(const std::vector<at::Tensor> &tensors) {
  for (const at::Tensor &tensor : tensors) {
    const auto shape = tensor.sizes();
    const bool has_zero_dim =
        std::find(shape.begin(), shape.end(), 0) != shape.end();
    if (!has_zero_dim) {
      continue;
    }

    std::stringstream err;
    err << "Zero-sized tensors are unsupported (Got shape [";
    // Comma separated list of the dimensions.
    const char *separator = "";
    for (const auto dim : shape) {
      err << separator << dim;
      separator = ", ";
    }
    err << "]).";
    ERROR(err.str());
  }
}

// Python facing entry point: converts the python arguments (session options,
// optimizer dictionary, anchors) to their C++ counterparts and forwards them
// to the lowerToPopartFromDispatch() overload which takes the parsed values.
// The registered CPU callbacks are cleared when this function exits, whether
// it returns normally or through an exception.
poptorch::LowerToPopart
lowerToPopartFromDispatch(const pybind11::dict &options,
                          const py::function &attribute_accessor, bool training,
                          const py::dict &opt_dict, const py::list &anchors) {
  auto cleanup = CallOnExit([] {
    // Clear the callbacks after compilation.
    callbacks.clear();
  });

  SessionOptionsParser options_parser{PybindValue(options)};

  AnchorList anchors_list = parseAnchors(anchors);
  std::vector<popart_compiler::Optimizer> optimizers =
      parseOptimizers(opt_dict);

  // NOTE(review): the attribute accessor lambda captures attribute_accessor by
  // reference - confirm it is not stored and called after this function has
  // returned.
  return lowerToPopartFromDispatch(
      options_parser, training, std::move(anchors_list),
      []() { initCallbackBuffers(); }, std::move(optimizers),
      [&attribute_accessor](const std::string &attributes_id_str) {
        return std::make_unique<PybindValue>(
            attribute_accessor(attributes_id_str));
      },
      callbacks);
}

// Call setParameterName() for each (tensor, name) pair found at the same
// position in the two tuples.
// Raises an error if the two tuples don't have the same size.
void mapParamsToNames(const pybind11::tuple &names,
                      const pybind11::tuple &tensors) {
  const auto num_params = names.size();
  ERROR_ON(num_params != tensors.size());
  torch::jit::Stack stack = jit::toTraceableStack(tensors);
  for (uint64_t idx = 0; idx < num_params; ++idx) {
    const auto param_name = names[idx].cast<std::string>();
    const auto param_tensor = stack[idx].toTensor();
    setParameterName(param_tensor, param_name);
  }
}

void setPerReplica(const std::string &param_name, py::handle tensor,
                   int comm_group_type, int shards,
                   int variable_retrieval_mode) {
  at::Tensor t = torch::jit::toTypeInferredIValue(tensor).toTensor();
  setParameterPerReplica(param_name, t, comm_group_type, shards,
                         variable_retrieval_mode);
}

// Build a string from all the characters of the vector (embedded NUL
// characters included).
std::string convertToString(const std::vector<char> &str) {
  std::string text;
  text.assign(str.begin(), str.end());
  return text;
}

// Build a vector holding all the characters of the string (no terminating NUL
// character is appended).
std::vector<char> convertToCharVec(const std::string &str) {
  std::vector<char> chars;
  chars.assign(str.begin(), str.end());
  return chars;
}

// Convert the tensors to python objects and return them, in the same order,
// as a python list. The tensors are moved out of `outputs`.
pybind11::list toPythonList(std::vector<at::Tensor> &&outputs) {
  const std::size_t num_outputs = outputs.size();
  pybind11::list result(num_outputs);
  for (std::size_t idx = 0; idx < num_outputs; ++idx) {
    at::Tensor &output = outputs[idx];
    result[idx] = torch::jit::toPyObject(std::move(output));
  }
  return result;
}

class Error : public py::object {
public:
  Error() = default;
  Error(handle scope, const char *name, handle base = PyExc_Exception) {
    std::string full_name =
        scope.attr("__name__").cast<std::string>() + std::string(".") + name;
    m_ptr = PyErr_NewException(full_name.c_str(), base.ptr(), nullptr);
    if (hasattr(scope, "__dict__") && scope.attr("__dict__").contains(name)) {
      pybind11::pybind11_fail(
          "Error during initialization: multiple incompatible "
          "definitions with name \"" +
          std::string(name) + "\"");
    }
    scope.attr(name) = *this;
  }

  // Sets the current python myexception to this exception object with the given
  // message
  void setWhat(const std::string &message) { _what = message; }

  const std::string &getWhat() { return _what; }

  void setErrorIndicator() { PyErr_SetString(m_ptr, _what.c_str()); }

  void setMessage(const std::string &message) {
    py::object x = py::cast(message);
    PyObject_SetAttrString(m_ptr, "message", x.ptr());
  }

  void setType(const std::string &type) {
    py::object x = py::cast(type);
    PyObject_SetAttrString(m_ptr, "type", x.ptr());
  }
  void setLocation(const std::string &location) {
    py::object x = py::cast(location);
    PyObject_SetAttrString(m_ptr, "location", x.ptr());
  }

private:
  std::string _what;
};

class RecoverableError : public Error {
public:
  using Error::Error;

  void setRecoveryAction(const std::string &recoveryAction) {
    py::object x = py::cast(recoveryAction);
    PyObject_SetAttrString(m_ptr, "recovery_action", x.ptr());
  }
};
// The python exception types exposed by the module. They are null until
// bindings::initialiseExceptionHandling() creates them, and are then picked
// by PoptorchError::setupPyError() based on the category of the error.
std::unique_ptr<Error> error;
std::unique_ptr<RecoverableError> recoverable_error;
std::unique_ptr<Error> unrecoverable_error;

/*
 * This structure enables poptorch.Error objects to be thrown python-side from
 * both our pybind11 interface and torch's own. Our pybind11 exception handler
 * catches this class specifically, whilst torch's catches any PyTorchError
 * subclass and uses it to deduce the python type using the overridden
 * python_type() method.
 * The function convertToPoptorchExceptionOrRethrow() processes all the
 * exception types we're interested in, extracts detail, and marshals them as
 * instances of PoptorchErrorInfo which is then used to create instances of this
 * class. We put try..catch wrappers round every pybind11 entry point using the
 * macro CATCH_AND_RETHROW_AS_POPTORCH_EXCEPTION and pass them to
 * convertToPoptorchExceptionOrRethrow().
 */
struct PoptorchError : public torch::PyTorchError {
public:
  explicit PoptorchError(const PoptorchErrorInfo &info)
      : torch::PyTorchError(info.long_message), _info(info) {}
  PyObject *python_type() override { return setupPyError(false); }
  void setErrorIndicator() const { setupPyError(true); }

private:
  PyObject *setupPyError(bool set_indicator) const;

public:
  const PoptorchErrorInfo _info;
};

// Push the stored stack onto the log context (last entry first), pick the
// python exception type matching the category of the error, fill in its
// attributes and optionally set the python error indicator.
// Returns the python exception type.
PyObject *PoptorchError::setupPyError(bool set_indicator) const {
  for (auto remaining = _info.stack.size(); remaining > 0; --remaining) {
    poptorch::logging::LogContext::push(_info.stack[remaining - 1].c_str());
  }

  // Anything which isn't a runtime (un)recoverable error is a plain Error.
  Error *target = error.get();
  if (_info.category == ErrorCategory::RuntimeRecoverable) {
    recoverable_error->setRecoveryAction(_info.recovery_action);
    target = recoverable_error.get();
  } else if (_info.category == ErrorCategory::RuntimeUnrecoverable) {
    target = unrecoverable_error.get();
  }

  target->setType(_info.type);
  target->setMessage(_info.message);
  target->setLocation(_info.location);
  // Note: on Ubuntu 20.04 PyErr_SetString(), which is called by
  // setErrorIndicator() with the message stored by setWhat(),
  // needs to be the last call in register_exception_translator()
  target->setWhat(_info.long_message);
  if (set_indicator) {
    target->setErrorIndicator();
  }
  return target->ptr();
}

// Throw a PoptorchError built from `info`. Installed as the poptorch error
// thrower by bindings::initialiseExceptionHandling().
void doThrowPoptorchError(const PoptorchErrorInfo &info) {
  throw PoptorchError(info);
}
} // namespace

namespace bindings {

// Create the python exception types "Error", "RecoverableError" and
// "UnrecoverableError" in `m` (the last two derive from the first one) and
// install doThrowPoptorchError() as the poptorch error thrower.
void initialiseExceptionHandling(pybind11::handle m) {
  error = std::make_unique<Error>(m, "Error");

  // Both specialised errors use the newly created "Error" type as their
  // python base class.
  const pybind11::handle base_type = *error;
  recoverable_error =
      std::make_unique<RecoverableError>(m, "RecoverableError", base_type);
  unrecoverable_error =
      std::make_unique<Error>(m, "UnrecoverableError", base_type);

  poptorch::setPoptorchErrorThrower(doThrowPoptorchError);
}

// Forward to PoplarExecutable::copyWeightsToHost() with the buffers resolved
// from the given parameter names / tensors.
//
// When no executable has been built yet the call is ignored and a warning is
// logged instead.
void copyWeightsToHostImpl(
    const std::shared_ptr<poptorch::PoplarExecutable> &executable,
    const pybind11::tuple &parameter_names,
    const pybind11::tuple &parameter_tensors) {
  poptorch::logging::Tracepoint tracepoint{"copyWeightsToHost"};
  if (executable) {
    executable->copyWeightsToHost(
        getParameterBuffers(parameter_names, parameter_tensors));
    return;
  }
  // Called before the first compilation: nothing to copy, only warn.
  logging::log(
      logging::Level::Warn,
      "Call to copyWeightsToHost ignored as model has not been compiled "
      "(PopTorch will compile models on first invocation).");
}

// Forward to PoplarExecutable::copyWeightsToDevice() with the buffers resolved
// from the given parameter names / tensors.
//
// When no executable has been built yet the call is ignored and a warning is
// logged instead.
void copyWeightsToDeviceImpl(
    const std::shared_ptr<poptorch::PoplarExecutable> &executable,
    const pybind11::tuple &parameter_names,
    const pybind11::tuple &parameter_tensors) {
  poptorch::logging::Tracepoint tracepoint{"copyWeightsToDevice"};
  if (executable) {
    executable->copyWeightsToDevice(
        getParameterBuffers(parameter_names, parameter_tensors));
    return;
  }
  // Called before the first compilation: nothing to copy, only warn.
  logging::log(
      logging::Level::Warn,
      "Call to copyWeightsToDevice ignored as model has not been compiled "
      "(PopTorch will compile models on first invocation).");
}

// Forward to PoplarExecutable::copyNamedBuffersToDevice() with the buffers
// resolved from the given buffer names / tensors.
//
// When no executable has been built yet the call is ignored and a warning is
// logged instead.
void copyNamedBuffersToDeviceImpl(
    const std::shared_ptr<poptorch::PoplarExecutable> &executable,
    const pybind11::tuple &buffer_names,
    const pybind11::tuple &buffer_tensors) {
  poptorch::logging::Tracepoint tracepoint{"copyNamedBuffersToDevice"};
  if (executable) {
    executable->copyNamedBuffersToDevice(
        getParameterBuffers(buffer_names, buffer_tensors));
    return;
  }
  // Called before the first compilation: nothing to copy, only warn.
  logging::log(
      logging::Level::Warn,
      "Call to copyNamedBuffersToDevice ignored as model has not been "
      "compiled (PopTorch will compile models on first invocation).");
}

// Return the result of PoplarExecutable::getPopartIR() as a string.
// A null executable is rejected with "No built executable".
std::string
getPopartIR(const std::shared_ptr<poptorch::PoplarExecutable> &executable) {
  ERROR_ON_MSG(!executable, "No built executable");
  std::string popart_ir = executable->getPopartIR();
  return popart_ir;
}

// Return the executable's tensor names converted to a Python set.
// A null executable is rejected with "No built executable".
py::set
getTensorNames(const std::shared_ptr<poptorch::PoplarExecutable> &executable) {
  ERROR_ON_MSG(!executable, "No built executable");
  py::set tensor_names = py::cast(executable->getTensorNames());
  return tensor_names;
}

// Forward to PoplarExecutable::detachFromDevice().
// A null executable is rejected with "No built executable".
void detachFromDevice(
    const std::shared_ptr<poptorch::PoplarExecutable> &executable) {
  poptorch::logging::Tracepoint tracepoint{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");
  poptorch::PoplarExecutable &built = *executable;
  built.detachFromDevice();
}

// Forward to PoplarExecutable::attachToDevice().
// A null executable is rejected with "No built executable".
void attachToDevice(
    const std::shared_ptr<poptorch::PoplarExecutable> &executable) {
  poptorch::logging::Tracepoint tracepoint{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");
  poptorch::PoplarExecutable &built = *executable;
  built.attachToDevice();
}

// Return what PoplarExecutable::isAttachedToDevice() reports.
// A null executable is rejected with "No built executable".
bool isAttachedToDevice(
    const std::shared_ptr<poptorch::PoplarExecutable> &executable) {
  ERROR_ON_MSG(!executable, "No built executable");
  const bool attached = executable->isAttachedToDevice();
  return attached;
}

void setLogLevel(std::uint64_t level) {
  ERROR_ON(level > static_cast<std::uint64_t>(logging::Level::Off) ||
           level == 5);
  logging::setLogLevel(static_cast<logging::Level>(level));
}

// Forward to PoplarExecutable::loadEngineAndConnectStreams().
// A null executable is rejected with "No built executable".
void loadEngineAndConnectStreams(
    const std::shared_ptr<poptorch::PoplarExecutable> &executable) {
  poptorch::logging::Tracepoint tracepoint{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");
  poptorch::PoplarExecutable &built = *executable;
  built.loadEngineAndConnectStreams();
}

// Parse the Python optimizer dictionary and hand the resulting optimizers to
// PoplarExecutable::updateOptimizers().
// A null executable is rejected with "No built executable".
void updateOptimizers(
    const std::shared_ptr<poptorch::PoplarExecutable> &executable,
    const py::dict &optimizer_dict) {
  poptorch::logging::Tracepoint tracepoint{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");

  std::vector<popart_compiler::Optimizer> parsed_optimizers =
      parseOptimizers(optimizer_dict);
  executable->updateOptimizers(parsed_optimizers);
}

// Run the executable on the given Python inputs and return its outputs as
// Python objects.
//
// The inputs are flattened into a list of tensors before being passed to
// PoplarExecutable::run(). The flat list of results is then folded back into
// nested tuples / lists by walking executable->outputTypes().
// A null executable is rejected with "No built executable".
std::vector<pybind11::object>
execute(const std::shared_ptr<poptorch::PoplarExecutable> &executable,
        const pybind11::tuple &inputs) {
  poptorch::logging::Tracepoint tp{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");
  // Create a jit stack from the incoming pytorch tensors.
  torch::jit::Stack input_stack = jit::toTraceableStack(inputs);

  // Then convert them into at::Tensors which we can then resolve the
  // address of.
  std::vector<at::Tensor> input_tensors;
  for (const torch::jit::IValue &value : input_stack) {
    buildTensorList(value, &input_tensors);
  }

  std::vector<at::IValue> output_tensors = executable->run(input_tensors);

  std::vector<pybind11::object> returnee;

  // Reshape the output tensors in the structure expected by the user
  // tensor_it and type_it are shared (captured by reference) with the
  // recursive lambda below: each call consumes the entries it needs.
  auto tensor_it = output_tensors.begin();
  const auto &output_types = executable->outputTypes();
  auto type_it = output_types.begin();
  ERROR_ON(type_it == output_types.end());

  // First tuple encodes the number of (actual) outputs
  std::uint64_t num_outputs = type_it->num_elements;
  // Declared as a std::function before being assigned so that the lambda can
  // call itself for nested tuples / lists.
  std::function<pybind11::object()> process_output;
  process_output = [&]() -> pybind11::object { // NOLINT
    ERROR_ON_MSG(type_it == output_types.end(), "Invalid OutputTypes object");
    switch (type_it->type) {
    case popart_compiler::OutputElemType::Tensor: {
      // Leaf: consume the next output tensor.
      ERROR_ON_MSG(tensor_it == output_tensors.end(),
                   "Not enough tensors to unpack");
      auto object = torch::jit::toPyObject(*tensor_it);
      tensor_it++;
      return object;
    }
    case popart_compiler::OutputElemType::Tuple: {
      // num_elements is read before the loop because the loop advances
      // type_it.
      std::int64_t num_elements = type_it->num_elements;
      pybind11::tuple pytuple(num_elements);
      for (std::int64_t i = 0; i < num_elements; ++i) {
        type_it++;
        pytuple[i] = process_output();
      }
      return std::move(pytuple);
    }
    case popart_compiler::OutputElemType::List: {
      // Same as the tuple case but building a Python list.
      std::int64_t num_elements = type_it->num_elements;
      pybind11::list pylist(num_elements);
      for (std::int64_t i = 0; i < num_elements; ++i) {
        type_it++;
        pylist[i] = process_output();
      }
      return std::move(pylist);
    }
    default:
      ERROR("Unsupported OutputType");
    }
  };

  // Advance to the type entry of each top-level output, then unpack it.
  for (std::uint64_t i = 0; i < num_outputs; ++i) {
    type_it++;
    returnee.push_back(process_output());
  }
  ERROR_ON_MSG(tensor_it != output_tensors.end(),
               "Not all the output tensors were unpacked");

  return returnee;
}

// Forward the seed and RNG state to the executable's compiler.
// A null executable is rejected with "No built executable".
void setRngState(std::shared_ptr<poptorch::PoplarExecutable> &executable,
                 std::uint64_t seed,
                 const std::vector<std::uint32_t> &rng_state) {
  poptorch::logging::Tracepoint tracepoint{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");
  executable->getCompiler().setRngState(seed, rng_state);
}

// Return the random seed reported by the executable's compiler.
// A null executable is rejected with "No built executable".
std::uint64_t
getRandomSeed(const std::shared_ptr<poptorch::PoplarExecutable> &executable) {
  poptorch::logging::Tracepoint tracepoint{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");
  const auto &compiler = executable->getCompiler();
  const std::uint64_t seed = compiler.getRandomSeed();
  return seed;
}

// Return the RNG state reported by the executable's compiler.
// A null executable is rejected with "No built executable".
std::vector<std::uint32_t>
getRngState(const std::shared_ptr<poptorch::PoplarExecutable> &executable) {
  poptorch::logging::Tracepoint tracepoint{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");
  const auto &compiler = executable->getCompiler();
  std::vector<std::uint32_t> rng_state = compiler.getRngState();
  return rng_state;
}

// Read the optimizer tensors from the executable's compiler into a Python
// dictionary.
//
// The returned dictionary has two entries, each a dictionary keyed by tensor
// id:
//   "ipu_state": tensors whose metadata has num_bytes == -1; their buffers are
//                filled by Compiler::fillHostOptimizerStateTensorData().
//   "ipu_param": all the other tensors; their content is copied straight from
//                the metadata's data pointer.
// A null executable is rejected with "No built executable".
py::dict readOptimizerState(
    const std::shared_ptr<poptorch::PoplarExecutable> &executable) {
  poptorch::logging::Tracepoint tracepoint{__FUNCTION__};
  py::dict result;
  py::dict ipu_state;
  py::dict ipu_param;
  ERROR_ON_MSG(!executable, "No built executable");
  auto &compiler = executable->getCompiler();
  const std::vector<popart_compiler::TensorMetadata> all_metadata =
      compiler.optimizerTensorMetadataList();

  // One entry per state tensor, in metadata order.
  std::vector<void *> state_buffers;
  for (const popart_compiler::TensorMetadata &meta : all_metadata) {
    at::Tensor tensor =
        at::empty({meta.shape}, onnxStrToScalarType(meta.dtype)).contiguous();

    const bool is_state_tensor = meta.num_bytes == -1;
    if (!is_state_tensor) {
      // Stream / constant optimiser parameter: the data is already available
      // on the host so it can be copied immediately.
      std::memcpy(tensor.data_ptr(), meta.data, meta.num_bytes);
      ipu_param[py::cast(meta.id)] = py::cast(tensor);
      continue;
    }

    // Optimiser state tensor (variable): only record where to write it, the
    // actual content is fetched after the loop.
    state_buffers.push_back(tensor.data_ptr());
    ipu_state[py::cast(meta.id)] = py::cast(tensor);
  }
  compiler.fillHostOptimizerStateTensorData(state_buffers);

  result["ipu_state"] = std::move(ipu_state);
  result["ipu_param"] = std::move(ipu_param);
  return result;
}

// Write the optimizer tensors contained in optim_state back through the
// executable's compiler.
//
// optim_state is indexed with the keys "ipu_state" and "ipu_param"; both
// entries are looked up by tensor id.
//   - State tensors (metadata num_bytes == -1) are collected and passed to
//     Compiler::writeDeviceOptimizerStateTensorData(). A missing one is logged
//     and replaced by a null pointer so the list stays aligned with the
//     metadata.
//   - Other tensors are copied straight into the metadata's data pointer. A
//     missing one is logged and skipped.
// NOTE(review): the memcpy below copies meta.num_bytes bytes without checking
// the size or contiguity of the user provided tensor - confirm callers
// guarantee a matching layout.
// A null executable is rejected with "No built executable".
void writeOptimizerState(
    const std::shared_ptr<poptorch::PoplarExecutable> &executable,
    const py::dict &optim_state) {
  poptorch::logging::Tracepoint tracepoint{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");
  auto &compiler = executable->getCompiler();
  const std::vector<popart_compiler::TensorMetadata> all_metadata =
      compiler.optimizerTensorMetadataList();

  std::vector<void *> state_buffers;
  auto ipu_state = optim_state["ipu_state"];
  auto ipu_param = optim_state["ipu_param"];

  for (const popart_compiler::TensorMetadata &meta : all_metadata) {
    const bool is_state_tensor = meta.num_bytes == -1;

    if (is_state_tensor) {
      // Optimiser state tensor (variable).
      if (ipu_state.contains(py::cast(meta.id))) {
        at::Tensor tensor = ipu_state[py::cast(meta.id)].cast<at::Tensor>();
        state_buffers.push_back(tensor.data_ptr());
      } else {
        const std::string warning =
            "writeOptimizerState: ignoring missing state " +
            std::string(meta.id);
        logging::log(logging::Level::Warn, warning.c_str());
        state_buffers.push_back(nullptr);
      }
      continue;
    }

    // Stream / constant optimiser parameter: copied immediately.
    if (!ipu_param.contains(py::cast(meta.id))) {
      const std::string warning =
          "writeOptimizerState: ignoring missing parameter " +
          std::string(meta.id);
      logging::log(logging::Level::Warn, warning.c_str());
      continue;
    }
    at::Tensor tensor = ipu_param[py::cast(meta.id)].cast<at::Tensor>();
    std::memcpy(meta.data, tensor.data_ptr(), meta.num_bytes);
  }
  compiler.writeDeviceOptimizerStateTensorData(state_buffers);
}

// Return the compiler's timestamps as four Python lists, in this order:
// input, input_complete, output, output_complete.
// A null executable is rejected with "No built executable".
std::vector<pybind11::object>
getTimestamps(const std::shared_ptr<poptorch::PoplarExecutable> &executable) {
  poptorch::logging::Tracepoint tracepoint{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");
  const auto &compiler = executable->getCompiler();
  popart_compiler::Timestamps timestamps = compiler.getTimestamps();

  // Convert one of the timestamp containers to a Python list.
  const auto to_py_list = [](const auto &values) {
    py::list converted;
    for (const auto &value : values) {
      converted.append(py::cast(value));
    }
    return converted;
  };

  return {to_py_list(timestamps.input), to_py_list(timestamps.input_complete),
          to_py_list(timestamps.output),
          to_py_list(timestamps.output_complete)};
}

// Return true if at least one node of the module's lowered "forward" graph is
// reported as nondeterministic by poptorch::isNondeterministic().
bool pyIsGraphNondeterministic(py::handle h) {
  auto *module = asModule(h);
  auto forward_method = module->get_method("forward");
  auto lowered =
      torch::jit::LowerGraph(*forward_method.graph(), module->_ivalue());
  auto lowered_graph = lowered.first;

  const auto &nodes = lowered_graph->nodes();
  for (const torch::jit::Node *node : nodes) {
    if (poptorch::isNondeterministic(*node)) {
      return true;
    }
  }
  return false;
}

// Ask the executable's compiler to save the executable to export_filename.
//
// A null executable is rejected with "No built executable", like in the other
// bindings taking an executable, instead of being dereferenced.
void saveExecutableToFile(
    const std::shared_ptr<poptorch::PoplarExecutable> &executable,
    const std::string &export_filename) {
  poptorch::logging::Tracepoint tp{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");
  executable->getCompiler().saveExecutableToFile(export_filename.c_str());
}

// Forward the serialized PopTorch metadata (pointer and size) and the target
// filename to Compiler::appendPoptorchMetadataToFile().
void appendPoptorchMetadataToFile(const std::string &serialized_poptorch_data,
                                  const std::string &export_filename) {
  poptorch::logging::Tracepoint tracepoint{__FUNCTION__};
  const char *metadata = serialized_poptorch_data.c_str();
  const auto metadata_size = serialized_poptorch_data.size();
  popart_compiler::Compiler::appendPoptorchMetadataToFile(
      metadata, metadata_size, export_filename.c_str());
}

// Return the cycle count reported by the executable's compiler.
// A null executable is rejected with "No built executable".
uint64_t
cycleCount(const std::shared_ptr<poptorch::PoplarExecutable> &executable) {
  ERROR_ON_MSG(!executable, "No built executable");
  auto &compiler = executable->getCompiler();
  return compiler.getCycleCount();
}

// Read the PopTorch metadata stored in import_filename and return it as a
// Python bytes object.
py::bytes importPoptorchMetadataFromFile(const std::string &import_filename) {
  poptorch::logging::Tracepoint tracepoint{__FUNCTION__};
  const char *filename = import_filename.c_str();
  std::vector<char> raw_metadata =
      popart_compiler::Compiler::importPoptorchMetadataFromFile(filename);
  return py::bytes(raw_metadata.data(), raw_metadata.size());
}

// Lower the dispatched graph to PopART, then load the executable from
// import_filename instead of compiling it.
std::shared_ptr<poptorch::PoplarExecutable> processDispatchAndImportExecutable(
    const pybind11::dict &options, const py::function &attribute_accessor,
    bool is_training, const py::dict &opt_dict, const py::list &anchors,
    const std::string &import_filename) {
  auto lowered = lowerToPopartFromDispatch(options, attribute_accessor,
                                           is_training, opt_dict, anchors);
  std::shared_ptr<poptorch::PoplarExecutable> loaded =
      lowered.loadExecutableFromFile(import_filename);
  return loaded;
}
// Lower the dispatched graph to PopART and compile it.
//
// The lowering runs with the GIL held; the GIL is only released for the
// compile() call.
std::shared_ptr<poptorch::PoplarExecutable> compileWithManualTracing(
    const pybind11::dict &options, const py::function &attribute_accessor,
    bool is_training, const py::dict &opt_dict, const py::list &anchors) {
  poptorch::logging::Tracepoint tracepoint{__FUNCTION__};
  logging::log(logging::Level::Debug, "Compile with manual tracing");
  auto lowered = lowerToPopartFromDispatch(options, attribute_accessor,
                                           is_training, opt_dict, anchors);
  py::gil_scoped_release gil_released;
  return lowered.compile();
}

void setPopartLogLevelUInt(std::uint64_t level) {
  ERROR_ON(level > static_cast<std::uint64_t>(logging::Level::Off) ||
           level == 5);
  popart_compiler::setPopartLogLevel(static_cast<logging::Level>(level));
}

} // namespace bindings

} // namespace poptorch

// Definition of the poptorch_core Python module.
//
// Registers the classes, free functions, enums and exception handling exposed
// to Python. All the functions are wrapped with the PTC() macro.
PYBIND11_MODULE(poptorch_core, m) { // NOLINT
  // Opaque handle type: no method is exposed on the Python side, instances are
  // only passed back to the functions registered below.
  py::class_<poptorch::PoplarExecutable,
             std::shared_ptr<poptorch::PoplarExecutable>>
      give_me_a_name(m, "InternalPoplarExecutable");
  py::class_<poptorch::CompilerOptions>(m, "CompilerOptions")
      .def(py::init<>())
      .def_property(
          "source_location_excludes",
          // Getter: convert the stored entries to a list of strings.
          [](const poptorch::CompilerOptions &options) {
            std::vector<std::string> excludes;
            std::transform(options.dispatcher.source_location_excludes.begin(),
                           options.dispatcher.source_location_excludes.end(),
                           std::back_inserter(excludes),
                           &poptorch::convertToString);
            return excludes;
          },
          // Setter: replace the stored entries with the given strings.
          [](poptorch::CompilerOptions &options,
             const std::vector<std::string> &val) {
            options.dispatcher.source_location_excludes.clear();
            std::transform(
                val.begin(), val.end(),
                std::back_inserter(options.dispatcher.source_location_excludes),
                &poptorch::convertToCharVec);
          },
          // Adjacent string literals are concatenated as-is: each fragment
          // must carry its own trailing space.
          "When printing the IR all the frames containing one of the excluded "
          "strings will be ignored.\n\n"
          "This is helpful to get the IR to trace back to user code rather "
          "than some function inside a framework.");

  // Functions operating on a compiled executable, logging and options.
  m.def("isGraphNondeterministic",
        PTC(poptorch::bindings::pyIsGraphNondeterministic));
  m.def("saveExecutableToFile", PTC(poptorch::bindings::saveExecutableToFile));
  m.def("appendPoptorchMetadataToFile",
        PTC(poptorch::bindings::appendPoptorchMetadataToFile));
  m.def("cycleCount", PTC(poptorch::bindings::cycleCount));
  m.def("importPoptorchMetadataFromFile",
        PTC(poptorch::bindings::importPoptorchMetadataFromFile));
  m.def("execute", PTC(poptorch::bindings::execute));
  m.def("updateOptimizers", PTC(poptorch::bindings::updateOptimizers));
  m.def("getTimestamps", PTC(poptorch::bindings::getTimestamps));
  m.def("readOptimizerState", PTC(poptorch::bindings::readOptimizerState));
  m.def("setRngState", PTC(poptorch::bindings::setRngState));
  m.def("getRngState", PTC(poptorch::bindings::getRngState));
  m.def("getRandomSeed", PTC(poptorch::bindings::getRandomSeed));
  m.def("writeOptimizerState", PTC(poptorch::bindings::writeOptimizerState));
  m.def("loadEngineAndConnectStreams",
        PTC(poptorch::bindings::loadEngineAndConnectStreams));
  m.def("copyWeightsToDevice_impl",
        PTC(poptorch::bindings::copyWeightsToDeviceImpl));
  m.def("copyNamedBuffersToDevice_impl",
        PTC(poptorch::bindings::copyNamedBuffersToDeviceImpl));
  m.def("copyWeightsToHost_impl",
        PTC(poptorch::bindings::copyWeightsToHostImpl));
  m.def("ipuHardwareVersion",
        PTC(poptorch::popart_compiler::ipuHardwareVersion),
        py::arg("numIpus") = 1);
  m.def("setCustomCodeletsPath",
        PTC(poptorch::popart_compiler::setCustomCodeletsPath));
  m.def("setLogLevel", PTC(poptorch::bindings::setLogLevel),
        py::arg("level") = 2);
  m.def("setPopartLogLevel", PTC(poptorch::bindings::setPopartLogLevelUInt));
  m.def("_getPopartIR", PTC(poptorch::bindings::getPopartIR));
  m.def("_getTensorNames", PTC(poptorch::bindings::getTensorNames));
  m.def("detachFromDevice", PTC(poptorch::bindings::detachFromDevice));
  m.def("attachToDevice", PTC(poptorch::bindings::attachToDevice));
  m.def("isAttachedToDevice", PTC(poptorch::bindings::isAttachedToDevice));
  m.def("registerCPUCallBack", PTC(poptorch::registerCPUCallBack));
  m.def("isAlreadyRegistered", PTC(poptorch::alreadyRegistered));
  m.def("registerBuffersWithCallback",
        PTC(poptorch::registerBuffersWithCallback));
  m.def("_validateOptions", PTC(poptorch::parseSessionOptionsVoid));

  py::enum_<poptorch::TracingMode>(m, "TracingMode")
      .value("PopART", poptorch::TracingMode::POPART)
      .export_values();

  // Dispatcher related functions.
  m.def("poptorchAtExit", PTC(poptorch::poptorchAtExit));
  m.def("destroyDispatcher", PTC(poptorch::destroyDispatcher));
  m.def("startDispatch", PTC(poptorch::startDispatch));
  m.def("isCompilingWithDispatcher", PTC(poptorch::isCompilingWithDispatcher));
  m.def("endDispatch", PTC(poptorch::endDispatch));
  m.def("startParametersMove", PTC(poptorch::startParametersMove));
  m.def("endParametersMove", PTC(poptorch::endParametersMove));
  m.def("startOutputsMove", PTC(poptorch::startOutputsMove));
  m.def("endOutputsMove", PTC(poptorch::endOutputsMove));
  m.def("createGraph", PTC(poptorch::createGraph));
  m.def("mapParamsToNames", PTC(poptorch::mapParamsToNames));
  m.def("setPerReplica", PTC(poptorch::setPerReplica));
  m.def("finalizeGraph", PTC(poptorch::finalizeGraph));
  m.def("compileWithManualTracing",
        PTC(poptorch::bindings::compileWithManualTracing));
  m.def("processDispatchAndImportExecutable",
        PTC(poptorch::bindings::processDispatchAndImportExecutable));
  m.def("_throwTestError", PTC(poptorch::popart_compiler::throwTestError));
  m.def("getIpuTensorId", PTC(poptorch::getIpuTensorId));

  // Create the Python exception types and register the error thrower.
  poptorch::bindings::initialiseExceptionHandling(m);

  py::enum_<poptorch::popart_compiler::TestErrorType>(m, "TestErrorType")
      .value("Poptorch", poptorch::popart_compiler::TestErrorType::Poptorch)
      .value("Popart", poptorch::popart_compiler::TestErrorType::Popart)
      .value("PopartInternal",
             poptorch::popart_compiler::TestErrorType::PopartInternal)
      .value("Poplibs", poptorch::popart_compiler::TestErrorType::Poplibs)
      .value("PoplarUnrecoverable",
             poptorch::popart_compiler::TestErrorType::PoplarUnrecoverable)
      .value("PoplarUnknown",
             poptorch::popart_compiler::TestErrorType::PoplarUnknown)
      .value(
          "PoplarRecoverableFullReset",
          poptorch::popart_compiler::TestErrorType::PoplarRecoverableFullReset)
      .value("PoplarLinkError",
             poptorch::popart_compiler::TestErrorType::PoplarLinkError);

  // Translate PoptorchError exceptions escaping the bindings into the matching
  // Python exception. Any other exception type is not caught here.
  py::register_exception_translator(
      [](std::exception_ptr p) { // NOLINT: Don't change 'p' to a const&
        try {
          if (p) {
            std::rethrow_exception(p);
          }
        } catch (const poptorch::PoptorchError &e) {
          e.setErrorIndicator();
        }
      });
}


================================================
FILE: python/profiling.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import os
from typing import Dict
from ._logging import logger

# libpvti is only imported when PVTI_OPTIONS is set in the environment.
# _pvti_available records whether the import was attempted and succeeded.
if os.environ.get("PVTI_OPTIONS") is None:
    _pvti_available = False
else:
    try:
        import libpvti as pvti  # type: ignore
        _pvti_available = True
    except ImportError as e:
        # Pass the exception as a logging argument, otherwise the "%s"
        # placeholder is printed verbatim.
        logger.info("Tracepoints disabled (Couldn't import libpvti: %s)", e)
        _pvti_available = False


class Channel:
    """Profiling channel.

    .. note:: If the ``libpvti`` profiling library is not available at runtime
        this class becomes a no-op.

    Example:

    >>> channel = poptorch.profiling.Channel("MyApp")
    >>> with channel.tracepoint("TimeThis"):
    ...     functionToTime()
    >>> channel.instrument(myobj, "methodName", "otherMethod")
    """

    def __init__(self, name):
        # Without pvti there is nothing to register: no attribute is set.
        if not _pvti_available:
            return
        self._tracepoint_prefix = name
        self._channel = _Channels.getOrCreate(name)

    def instrument(self, obj, *methods):
        """Wrap some methods of an object in profiling trace points.

        :param obj: Object to instrument
        :param methods: Names of the one or more methods to instrument.
        :return: ``obj`` itself, whether or not profiling is available.
        """
        if not _pvti_available:
            return obj
        pvti.instrument(obj, methods, self._channel)
        return obj

    def tracepoint(self, name):
        """Create a tracepoint to use as a context manager.

        >>> with channel.tracepoint("DoingSomething"):
        ...     expensiveCall()

        :param name: Name associated to this tracepoint. It is prefixed with
            the name of the channel and a dot.
        """
        if not _pvti_available:
            return _DummyTracepoint()
        qualified_name = self._tracepoint_prefix + "." + name
        return pvti.Tracepoint(self._channel, qualified_name)


class _DummyTracepoint:
    """No-op context manager returned when pvti is not available."""

    def __enter__(self):
        """Do nothing; the ``with`` target is bound to ``None``."""
        return None

    def __exit__(self, type, value, traceback):
        """Do nothing; exceptions raised in the body are not suppressed."""
        return None


class _Channels:
    """Class-level registry of the pvti trace channels, keyed by name."""
    _channels: Dict[str, 'pvti.Channel'] = {}

    @staticmethod
    def getOrCreate(name):
        """Return the channel called ``name``, creating it on first use."""
        registry = _Channels._channels
        if name in registry:
            return registry[name]
        registry[name] = pvti.createTraceChannel(name)
        return registry[name]


================================================
FILE: python/py.typed
================================================
# Marker file for PEP 561.


================================================
FILE: python/testing.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import torch


def allclose(ref, other):
    """Return True if both the structure and the content of ref and other match.

    :param ref: Reference value: a tensor, or an arbitrarily nested tuple /
        list of tensors.
    :param other: Value to compare against the reference.
    :return: True if ``other`` has the same nesting as ``ref`` (tuples only
        match tuples, lists only match lists, lengths must be equal) and all
        the tensors are close according to ``torch.allclose``.
    :raises TypeError: if ``ref`` (or one of its elements) is not a tensor,
        a tuple or a list.
    """
    if isinstance(ref, torch.Tensor):
        return torch.allclose(other, ref)
    if isinstance(ref, tuple):
        container_type = tuple
    elif isinstance(ref, list):
        container_type = list
    else:
        # Asserting on a non-empty string is always true: raise explicitly so
        # unsupported types are actually rejected.
        raise TypeError("%s not supported" % type(ref))
    if not isinstance(other, container_type) or len(ref) != len(other):
        return False
    return all(allclose(r, o) for r, o in zip(ref, other))


================================================
FILE: requirements.txt
================================================
# IMPORTANT: Keep requirements in sync with ./config.buildenv.py

--extra-index-url https://download.pytorch.org/whl/cpu

torch==2.0.1
torchaudio==2.0.2
torchvision==0.15.2

expecttest==0.1.3
lit==0.11.1
pytest==6.2.5
setuptools==58.0.4
tqdm==4.46.1
transformers==4.12.2
typing-extensions==4.1.1
# Use old version for wheel.pep425tags support (new versions removed it).
wheel<0.35

-r poptorch_geometric/requirements.txt


================================================
FILE: scripts/PopAtenHandlers.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import math
import os
from popgen.api import convert, expand, forward, generate, simplify
from popgen.helpers import alpha, cfloat, cint, clong, clong_list, \
                           cstr, dimension, empty_initializer, output_shape, \
                           output_type, reduction, tensor_list, tensor_long, \
                           tensor_shape, tensor_type
from popgen.operatorfactory import op
from utils import _utils

# Name of this script and directory of the generated file: both are passed to
# generate() at the end of the file.
script = "PopAtenHandlers.py"
output_dir = os.path.join(_utils.sources_dir(),
                          "poptorch/source/popart_canonicalization")

# Constants forwarded to op.selu() by the "selu" expansion below.
selu_alpha = 1.6732632423543772848170429916717
selu_lambda = 1.0507009873554804934193349852946

# simplification rules
simplify("expm1", lambda x: op.exp(x) - 1.)
simplify("log1p", lambda x: op.log(1. + x))
simplify("reciprocal", lambda x: 1. / x)
simplify("div", lambda x, y: 1. / y * x)

# unary operators
# NOTE(review): the second argument of convert() looks like the number of
# inputs of the operator - confirm against popgen.api.convert.
opers = [
    "abs", "acos", "acosh", "asin", "asinh", "atan", "atanh", "ceil", "cos",
    "cosh", "detach", "erf", "exp", "expm1", "floor", "isnan", "log", "log1p",
    "logical_not", "neg", "reciprocal", "relu", "round", "sigmoid", "sin",
    "sinh", "sign", "sqrt", "tan", "tanh"
]

for oper in opers:
    convert(oper, 1)

# Operators registered under a different target name (third argument).
convert("t", 1, "transpose")
convert("silu", 1, "swish")

# Unary operators expressed in terms of other operations.
expand("erfc", lambda x: 1. - op.erf(x))
expand("log2", lambda x: op.log(x) / math.log(2))
expand("log10", lambda x: op.log(x) / math.log(10))
expand("log_sigmoid", lambda x: op.log(op.sigmoid(x)))
forward("log_sigmoid_forward", "log_sigmoid")
expand(
    "rand", lambda x: op.randomUniform(x, output_shape(), cfloat(1.), cfloat(
        0.), output_type()))
expand(
    "randn", lambda: op.randomNormal(empty_initializer(), output_shape(),
                                     cfloat(0.), cfloat(1.), output_type()))
expand("rsqrt", lambda x: 1. / op.sqrt(x))
expand("selu", lambda x: op.selu(x, cfloat(selu_alpha), cfloat(selu_lambda)))
expand("square", lambda x: x * x)

# binary operators
# Note: "opers" is reused, the unary list above is no longer needed here.
opers = ["atan2", "div", "fmod", "pow", "remainder"]

for oper in opers:
    convert(oper, 2)

convert("eq", 2, "equal")
convert("gt", 2, "greater")
convert("lt", 2, "less")

convert("logical_and", 2)
convert("logical_or", 2)

expand("cat", lambda x, y: op.concat(tensor_list(x), clong(y)))
expand("elu", lambda x, y, z: op.selu(x, cfloat(y), cfloat(z)))
expand("ge", lambda x, y: x >= y)
expand("le", lambda x, y: x <= y)
expand("leaky_relu", lambda x, y: op.leakyrelu(x, cfloat(y)))
expand("ne", lambda x, y: x != y)
expand("pixel_shuffle", lambda x, y: op.depthtospace(x, clong(y), cstr("CRD")))
expand("reflection_pad1d", lambda x, y: op.reflectionPad(x, clong_list(y)))
expand("replication_pad1d", lambda x, y: op.edgePad(x, clong_list(y)))
# rsub: the operands are swapped.
expand("rsub", lambda x, y: y - x)


def celu_handler(x, a):
    """Expand celu as max(x, 0) + min(0, a * (exp(x / a) - 1))."""
    scaled_exp = a * (op.exp(x / a) - 1.)
    positive_part = op.max(x, 0.)
    negative_part = op.min(0., scaled_exp)
    return positive_part + negative_part


def hardshrink_handler(x, l):
    """Expand hardshrink: keep x where abs(x) > abs(l), 0 elsewhere."""
    magnitude = op.abs(x)
    threshold = op.abs(l)
    return op.where(magnitude > threshold, x, 0.)


def softshrink_handler(x, l):
    """Expand softshrink: x - l where x > l, x + l where x < -l, else 0."""
    upper_branch = op.where(x > l, x - l, 0.)
    is_below = x < -l
    shifted_up = x + l
    return op.where(is_below, shifted_up, upper_branch)


# The 2d / 3d padding variants are forwarded to the 1d handlers.
forward("reflection_pad2d", "reflection_pad1d")
forward("replication_pad2d", "replication_pad1d")
forward("replication_pad3d", "replication_pad1d")

# ternary operators
# NOTE(review): [1, 2, 0] looks like the order in which the inputs of
# masked_fill are passed to "where" - confirm against popgen.api.convert.
convert("masked_fill", 3, "where", [1, 2, 0])
convert("where", 3)

expand("constant_pad_nd", lambda x, l, c: op.constantPad(
    x, clong_list(l), cfloat(c)))
# The last two arguments are passed to op.clip() in swapped order (b then a).
expand("hardtanh", lambda x, a, b: op.clip(x, cfloat(b), cfloat(a)))
expand(
    "normal_", lambda x, c1, c2: op.randomNormal(x, tensor_shape(x), cfloat(
        c1), cfloat(c2)))
expand("sub", lambda x, y, a: op.sub(x, alpha(y, a)))
# The last two arguments are passed to op.randomUniform() in swapped order
# (b then a).
expand(
    "uniform_", lambda x, a, b: op.randomUniform(x, tensor_shape(x), cfloat(b),
                                                 cfloat(a)))
expand(
    "topk", lambda x, c, l: op.topk(x, tensor_long(c),
                                    dimension(l, tensor_type(x))))
expand("threshold", lambda x, threshold, val: op.where(x > threshold, x, val))

expand("index_select", lambda x, d, i: op.gather(x, i,
                                                 dimension(d, tensor_type(x))))


# loss handlers
def hinge_embedding_loss_handler(x, y, delta, red):
    """Expand hinge_embedding_loss.

    The loss is x where y == 1, max(0, delta - x) where y == -1 and 0
    elsewhere; it is then wrapped in an identityloss using the converted
    reduction.
    """
    reduction_mode = reduction(clong(red))
    positive_loss = op.where(y.equal(1.), x, 0.)
    is_negative = y.equal(-1.)
    hinge = op.max(0., delta - x)
    combined = op.where(is_negative, hinge, positive_loss)
    return op.identityloss(combined, reduction_mode)


def l1_loss_handler(x, y, red):
    """Expand l1_loss as an l1loss of (x - y) wrapped in an identityloss.

    The converted reduction is applied by the l1loss; the identityloss is
    given the constant 2.
    NOTE(review): presumably 2 is the "no reduction" mode - confirm.
    """
    reduction_mode = reduction(clong(red))
    difference = x - y
    inner_loss = op.l1loss(difference, cfloat(1.), reduction_mode)
    return op.identityloss(inner_loss, cint(2))


def margin_ranking_loss_handler(x1, x2, y, margin, red):
    """Expand margin_ranking_loss as max(-y * (x1 - x2) + margin, 0).

    The result is wrapped in an identityloss using the converted reduction.
    """
    reduction_mode = reduction(clong(red))
    ranking = -y * (x1 - x2)
    clamped = op.max(ranking + margin, 0.)
    return op.identityloss(clamped, reduction_mode)


def mse_loss_handler(x, y, red):
    """Expand mse_loss as (x - y) * (x - y) wrapped in an identityloss.

    The identityloss uses the converted reduction.
    """
    reduction_mode = reduction(clong(red))
    # The difference is deliberately spelled out twice, as in the expression
    # this handler has always produced.
    squared_error = (x - y) * (x - y)
    return op.identityloss(squared_error, reduction_mode)


def smooth_l1_loss_handler(x, y, red, beta):
    """Expand smooth_l1_loss.

    With d = abs(x - y) the loss is 0.5 * d * d / beta where d < beta and
    d - 0.5 * beta elsewhere; it is then wrapped in an identityloss using the
    converted reduction.
    """
    reduction_mode = reduction(clong(red))
    abs_error = op.abs(x - y)
    is_small = abs_error < beta
    quadratic = 0.5 * abs_error * abs_error / beta
    linear = abs_error - 0.5 * beta
    piecewise = op.where(is_small, quadratic, linear)
    return op.identityloss(piecewise, reduction_mode)


def soft_margin_loss_handler(x, y, red):
    """Expand soft_margin_loss as log(1 + exp(-y * x)).

    The result is wrapped in an identityloss using the converted reduction.
    """
    reduction_mode = reduction(clong(red))
    exponent = -y * x
    log_term = op.log(1. + op.exp(exponent))
    return op.identityloss(log_term, reduction_mode)


# everything else
# NOTE: alpha and beta are swapped with a gemm
# (addmm receives them as (beta, alpha), op.gemm is given (alpha, beta); the
# addmm input x is passed as the third gemm operand.)
expand(
    "addmm", lambda x, y, z, beta, alpha: op.gemm(y, z, x, cfloat(
        alpha), cfloat(beta), clong(0), clong(0)))

# Write the generated file. globals() is passed so generate() can see the names
# defined in this script.
# NOTE(review): presumably this is how the *_handler functions above get picked
# up - confirm against popgen.api.generate.
generate(script, "c10::aten", output_dir + "/AtenHandlers.gen.cpp", globals())


================================================
FILE: scripts/PopParse.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import enum
import argparse
import logging
import os
import re
import sys

import clang.cindex
from popgen import onnx
from utils import _utils

logger = logging.getLogger("PopParse")
_utils.set_logger(logger)

# Command line options of the script.
parser = argparse.ArgumentParser()
parser.add_argument("-c",
                    "--clang",
                    type=str,
                    help="Manually set path to clang headers")
parser.add_argument("-D",
                    "--debug",
                    action='store_true',
                    help="Enable debug printing")

# Parsing happens at module level, i.e. as soon as the script is executed.
args = parser.parse_args()

# Locate the Poplar / PopART headers, then initialise popgen's onnx module and
# run its parser.
poplar_dir = onnx.find_poplar_includes()
popart_dir = onnx.find_popart_includes()
onnx.init(popart_dir, args.clang, args.debug)
jsonOutput = onnx.parse()

# NOTE(review): basicConfig() is only called after onnx.init() / onnx.parse()
# have run - confirm nothing logged during parsing depends on this level.
logging_level = logging.DEBUG if args.debug else logging.INFO
logging.basicConfig(level=logging_level)

# List of options which cannot be resolved with clang, e.g. referring to values
# in external sources
options_not_resolved = ["defaultPrefetchBufferingDepth"]

# List of SessionOptions attributes PopTorch decided to not support
# Each name is listed only once: an option which used to appear several times
# is kept where it first appeared.
options_not_handled = [
    "bufferingDepthMap",
    "developerSettings",
    "prefetchBufferingDepthMap",
    "matmulOptions",
    "tensorLocationSettingsOverride",
    "autodiffSettings",
    "scheduleNonWeightUpdateGradientConsumersEarly",
    # Handled by PopTorch but not detected by this parser:
    "activationTensorLocationSettings",
    "replicatedCollectivesSettings",
    "automaticLossScalingSettings",
    "weightTensorLocationSettings",
    "optimizerStateTensorLocationSettings",
    "accumulatorTensorLocationSettings",
    "replicatedGraphCount",
    "accumulationReductionType",
    "executionPhaseSettings",
    "accumulateOuterFragmentSettings",
    "batchSerializationSettings",
    "_enableRngStateManagement",
    "createImplicitPipeliningFwdOnlyProgram",
]


@enum.unique
class OptionType(enum.IntEnum):
    """Kind of value held by a session option."""
    # Scalar types
    Bool = 0
    Int = 1
    Float = 2
    String = 3
    # Compound / named types
    Container = 4
    Enum = 5
    Object = 6


# check container_options
def parse_session_options(root_node):  # pylint: disable=too-many-statements
    """Check that PopTorch handles every attribute of PopART's SessionOptions.

    The options handled by PopTorch are collected by matching the macros
    found in ``popart_compiler/source/SessionOptions.cpp``. The options
    expected are collected by walking the clang AST rooted at ``root_node``
    and classifying every field of the ``SessionOptions`` struct.

    :param root_node: clang cursor of the translation unit containing the
        declaration of ``SessionOptions``.
    :raises AssertionError: if the struct cannot be found, if a field has a
        type this script doesn't know how to classify, or if an option is
        missing from PopTorch / handled with a different type.
    """
    kinds = clang.cindex.CursorKind

    # Build the list of options handled by PopTorch:
    handled = {}
    checks = {
        r".*container_options, \"(.*)\",.*": OptionType.Container,
        r" *ADD_POPART_ENUM_OPTION\(([^,]+),.*": OptionType.Enum,
        r" *ADD_POPART_STRING_OPTION\((.*)\).*": OptionType.String,
        r" *ADD_POPART_UINT64_OPTION\((.*)\).*": OptionType.Int,
        r" *ADD_POPART_BOOL_OPTION\((.*)\).*": OptionType.Bool,
        r" *ADD_POPART_DOUBLE_OPTION\((.*)\).*": OptionType.Float
    }

    session_options_cpp = os.path.join(_utils.sources_dir(),
                                       "popart_compiler", "source",
                                       "SessionOptions.cpp")
    # Use a context manager so the file is closed as soon as it's been read.
    with open(session_options_cpp, "r") as source:
        for line in source:
            # Only the first matching expression is used for a given line.
            for expr, option_type in checks.items():
                m = re.match(expr, line)
                if m:
                    handled[m.group(1)] = option_type
                    break

    def find_session_options(node):
        """Depth-first search for the SessionOptions struct declaration."""
        if node.kind == kinds.STRUCT_DECL and \
                node.spelling == "SessionOptions":
            return node

        for c in node.get_children():
            n = find_session_options(c)
            if n:
                return n
        return None

    def get_child(parent, child_type):
        """Return the only child of ``parent`` of kind ``child_type``, or
        None if there isn't one. Asserts if there is more than one."""
        child = None
        for c in parent.get_children():
            if c.kind == child_type:
                assert child is None, (
                    f"More than one child of "
                    f"{parent.spelling} has type {str(child_type)}")
                child = c
        return child

    opts = find_session_options(root_node)
    # Fail with an explicit message rather than an AttributeError on None.
    assert opts is not None, ("Could not find the declaration of struct "
                              "SessionOptions in the parsed sources")

    expected = {}
    # Build the list of attributes in Popart's SessionOptions
    for c in opts.get_children():
        if c.kind != kinds.FIELD_DECL:
            continue

        if c.spelling in options_not_resolved:
            continue

        children = list(c.get_children())

        # deal with CursorKind.UNEXPOSED_REF
        # this shows up when there is an implicit cast between the literal
        # initializer and the storage type of the structure member
        uc = get_child(c, kinds.UNEXPOSED_EXPR) or c

        if (get_child(c, kinds.CXX_BOOL_LITERAL_EXPR)
                or get_child(uc, kinds.CXX_BOOL_LITERAL_EXPR)):
            expected[c.spelling] = OptionType.Bool
        elif (get_child(c, kinds.INTEGER_LITERAL)
              or get_child(uc, kinds.INTEGER_LITERAL)):
            expected[c.spelling] = OptionType.Int
        elif (get_child(c, kinds.FLOATING_LITERAL)
              or get_child(uc, kinds.FLOATING_LITERAL)):
            expected[c.spelling] = OptionType.Float
        else:
            # No literal initializer: classify using the declared type.
            opt_type = get_child(c, kinds.TEMPLATE_REF)
            if opt_type:
                if opt_type.spelling in ["set", "vector", "map"]:
                    expected[c.spelling] = OptionType.Container
                elif opt_type.spelling in ["function"]:
                    # Function attributes are not compared.
                    continue
                else:
                    assert False, f"Template not supported {opt_type.spelling}"
            else:
                opt_type = get_child(c, kinds.TYPE_REF)
                assert opt_type, (f"Can't find type of {c.spelling}: "
                                  f"{[str(d.kind) for d in children]}")
                if opt_type.spelling in ("std::string",
                                         "std::__cxx11::string"):
                    expected[c.spelling] = OptionType.String
                elif opt_type.spelling == \
                        "class popart::SessionOptions::NumIOTiles":
                    expected[c.spelling] = OptionType.Int
                elif opt_type.spelling.startswith("enum "):
                    expected[c.spelling] = OptionType.Enum
                elif opt_type.spelling.startswith("struct "):
                    expected[c.spelling] = OptionType.Object
                elif opt_type.spelling.startswith("class "):
                    expected[c.spelling] = OptionType.Object
                elif opt_type.spelling in ("int64_t", "size_t"):
                    expected[c.spelling] = OptionType.Int
                else:
                    assert False, f"Type not supported {opt_type.spelling}"

    # Report every problem at once rather than stopping at the first one.
    missing_mismatched = []
    for opt, option_type in expected.items():
        if opt in options_not_handled:
            continue
        if opt not in handled:
            missing_mismatched.append(
                f"Option {opt} not handled by PopTorch Type: "
                f"{str(option_type)}. You"
                " need to add the relevant macro in SessionOptions.cpp or to"
                " options_not_handled in this script.")
        elif handled[opt] != option_type:
            missing_mismatched.append(
                (f"Type mismatch for option {opt}: Popart type "
                 f"{str(option_type)} "
                 f"PopTorch: {str(handled[opt])}"))
    assert not missing_mismatched, "\n".join(missing_mismatched)


# Parse PopART's sessionoptions.hpp with libclang and check that every
# attribute of SessionOptions is accounted for (the check asserts on failure).
index = clang.cindex.Index.create()
session_file = os.path.join(popart_dir, "popart", "sessionoptions.hpp")
tu = index.parse(session_file,
                 args=[
                     "-std=c++14",
                     "-I" + popart_dir,
                     "-I" + poplar_dir,
                     "-DONNX_NAMESPACE=onnx",
                 ])

parse_session_options(tu.cursor)

# Ops for which no code is generated (see the generation loop below).
# `prelu' is supported, PyTorch's definition requires a reshape before passing
# to the ONNX op.
UnsupportedOps = ["abort", "ctcloss", "gru", "rnn", "tensorremap", "prelu"]

## Implicit cast support
# Casting on all args
# yapf: disable
CastingOps = [
    "add",
    "atan2",
    "bitshift",
    "call",
    "clip",
    "concat",
    "convtranspose",
    "div",
    "dynamicadd",
    "dynamiczero",
    "equal",
    "fmod",
    "gemm",
    "greater",
    "gru",
    "instancenormalization",
    "less",
    "logical_and",
    "logical_or",
    "logical_xor",
    "lstm",
    "matmul",
    "max",
    "maxroipool",
    "mean",
    "min",
    "mod",
    "mul",
    "pow",
    "range",
    "remainder",
    "rnn",
    "scan",
    "sequenceconstruct",
    "sub",
    "sum",
]
# yapf: enable
# Also Einsum, GreaterOrEqual, LessOrEqual

# Casting on all args except the one(s) named by the list.
CastingExceptFirstArgsOps = ["where"]
CastingExceptSecondArgsOps = [
    "dequantizelinear", "scatterelements", "scatternd"
]
# Also Pad but only after >= 11
CastingExceptThirdArgsOps = ["roialign"]
CastingExceptFourthFifthArgsOps = []

# Implicit casting ops not in these categories:
# QLinearConv, QLinearMatMul

# All implicitly casting ops produce an output the same as the promoted type
# except those which always return bools, floats (in one case) and the following
CastingDifferentOutput = ["sequenceconstruct", "call"]

CastingAlwaysBoolOutput = [
    "equal", "greater", "less", "logical_and", "logical_not", "logical_or",
    "logical_xor"
]

CastingAlwaysFloatOutput = ["dequantizelinear"]

CastingAlwaysIntOutput = ["convinteger", "matmulinteger"]

## Non implicit-casting type support

# Ops whose output has the same type as their first input.
# Only membership is tested (see addOutputTypeStr), so every name is listed
# exactly once.
# yapf: disable
OutputTypeSameAsFirstInput = [
    "_ctcloss",
    "abs",
    "acos",
    "acosh",
    "asin",
    "asinh",
    "atan",
    "atanh",
    "averagepool",
    "batchnormalization",
    "bitwiseand",
    "bitwiseor",
    "bitwisexor",
    "bitwisexnor",
    "bitwisenot",
    "ceil",
    "celu",
    "compress",
    "concat",
    "conv",
    "cos",
    "cosh",
    "cumsum",
    "depthtospace",
    "det",
    "detach",
    "dropout",
    "dynamicupdate",
    "dynamicslice",
    "einsum",
    "elu",
    "erf",
    "exp",
    "expand",
    "expm1",
    "flatten",
    "floor",
    "fmod",
    "gather",
    "gatherelements",
    "gathernd",
    "gelu",
    "geluerf",
    "globalaveragepool",
    "globallppool",
    "globalmaxpool",
    "groupnormalization",
    "hardmax",
    "hardsigmoid",
    "hardswish",
    "identity",
    "identityloss",
    "l1loss",
    "leakyrelu",
    "log",
    "log1p",
    "logical_not",
    "logsoftmax",
    "lpnormalization",
    "lppool",
    "lrn",
    "maxpool",
    "maxunpool",
    "meanvariancenormalization",
    "neg",
    "nllloss",
    "nop",
    "onehot",
    "pad",
    "printtensor",
    "range",
    "reciprocal",
    "reducel1",
    "reducel2",
    "reducelogsum",
    "reducelogsumexp",
    "reducemax",
    "reducemean",
    "reducemin",
    "reduceprod",
    "reducesum",
    "reducesumsquare",
    "relu",
    "remainder",
    "replicatedallreduce",
    "reshape",
    "resize",
    "reverse",
    "reversesequence",
    "roialign",
    "round",
    "scale",
    "scaledadd",
    "scatter",
    "scatterreduce",
    "selu",
    "sequenceerase",
    "shapeddropout",
    "shrink",
    "sigmoid",
    "sign",
    "sin",
    "sinh",
    "slice",
    "softmax",
    "softplus",
    "softsign",
    "spacetodepth",
    "split",
    "sqrt",
    "squeeze",
    "stringnormalizer",
    "subsample",
    "swish",
    "tan",
    "tanh",
    "thresholdedrelu",
    "tile",
    "transpose",
    "unique",
    "unsqueeze",
    "upsample",
]
# yapf: enable

# The tables below are looked up by op name in addOutputTypeStr() to pick the
# OutputType passed to the generated createAndInsertNode() call.
FirstOutputTypeSameAsFirstInputButSecondAlwaysInt = ["topk", "reducemedian"]

OutputTypeSameAsThirdInput = ["onehot"]

OutputTypeAlwaysBool = [
    "isinf", "isnan", "logical_and", "logical_not", "logical_or", "logical_xor"
]

OutputTypeAlwaysFloat = ["tfidfvectorizer"]

OutputTypeAlwaysInt32 = [
    "argmax", "argmin", "isinf", "isnan", "nonmaxsuppression", "nonzero",
    "shape", "size"
]

OutputTypeAlwaysUint8 = [
    "dynamicquantizelinear", "quantizelinear", "qlinearconv", "qlinearmatmul"
]

OutputTypeAsDtype = [
    "cast", "eyelike", "multinomial", "randomnormal", "randomuniform"
]

OutputTypeAsDtypeOrAsPromoted = ["randomnormallike", "randomuniformlike"]

# Ops whose output type cannot be determined statically.
# Names must match the op names exactly: "sequenceempty" was previously
# misspelt and "sequenceinsert" carried a trailing space, so neither entry
# could ever match.
OutputTypeVariable = [
    "concatfromsequence", "constant", "constantofshape", "loop", "multinomial",
    "sequenceat", "sequenceempty", "sequenceinsert", "splittosequence"
]

# Ops producing more than one output. The value is appended verbatim as an
# extra argument of the generated createAndInsertNode() call, so it is either
# a literal or the name of an argument of the generated function.
MultipleOutputsOps = {
    "gru": "2",
    "lstm": "2",
    "rnn": "2",
    "split": "num_outputs",
    "topk": "2",
    "reducemedian": "2",
    "batchnormalization": "num_node_outputs",
}

# Extra C++ parameters appended to the signature of the generated create
# function for the given op.
ExtraArgumentOps = {
    "batchnormalization": ["unsigned int num_node_outputs"],
}

# Maps a cleaned C++ type (see clean()) to the type class used in the
# generated macros. Types missing from this table make toType() return
# "UNKNOWN".
CXXTypeToTypeClass = {
    # Scalar integers
    "int64_t": "INT",
    "int": "INT",
    "bool": "INT",
    "unsigned int": "INT",
    "popart::ReductionType": "INT",
    "popart::ScatterReduction": "INT",
    "nonstd::optional<int64_t>": "INT",
    "nonstd::optional<int>": "INT",
    "Attributes::Int": "INT",

    # Floats
    "float": "FLOAT",
    "nonstd::optional<float>": "FLOAT",

    # Non-scalar floats
    "std::vector<float>": "FLOAT_VEC",

    # Non-scalar integers.
    "std::vector<int64_t>": "INT_VEC",
    "nonstd::optional<std::vector<int64_t> >": "INT_VEC",
    "Attributes::Ints": "INT_VEC",

    # String
    "char": "CHAR",
    "std::string": "STRING",
    "std::vector<std::string>": "STRING_VEC",

    # Debug context
    "popart::DebugContext": "DEBUG_CONTEXT"
}


# Strip reference markers and const qualifiers from a raw C++ type.
def clean(cxxType):
    """Return ``cxxType`` without any "&" / "const" and without surrounding
    whitespace."""
    without_ref = cxxType.replace("&", "")
    without_const = without_ref.replace("const", "")
    return without_const.strip()


# Map the raw C++ type parsed from the header onto the macro type class.
def toType(cxxType):
    """Return the type class of ``cxxType`` or "UNKNOWN" if it isn't part of
    CXXTypeToTypeClass."""
    cleaned = clean(cxxType)
    type_class = CXXTypeToTypeClass.get(cleaned)
    if type_class is not None:
        return type_class

    logger.debug("toType: Unknown cxxType=%s / cleaned=%s", cxxType, cleaned)

    # Soft failure: some popart functions are expected to be unsupported for
    # now, the caller decides what to do with them.
    return "UNKNOWN"


# PopART header types which are replaced by another C++ type in the generated
# function signatures (see convertCxxConvert). Keys are written without any
# whitespace other than the one following "const" / "unsigned", which is the
# form produced by convertCxxConvert's normalisation.
CXX_TYPE_CONV_TABLE = {
    "nonstd::optional<int>": "std::int32_t",
    "nonstd::optional<int64_t>": "std::int32_t",
    "popart::ReductionType": "std::int32_t",
    "popart::ScatterReduction": "std::int32_t",
    "nonstd::optional<float>": "float",
    "nonstd::optional<std::vector<int64_t>>": "std::vector<int64_t>",
    "Attributes::Ints": "std::vector<int64_t>",
    "Attributes::Int": "std::int32_t",
    "std::vector<float>": "std::vector<double>"
}

# Types which are used as they are (the original spelling is kept).
CXX_NON_CONV_TYPES = [
    "char", "bool", "float", "int", "int64_t", "unsigned int", "std::string",
    "std::vector<int64_t>", "std::vector<std::string>", "popart::DebugContext"
]


# Translate a popart header type into a plain C++ type usable by pytorch.
def convertCxxConvert(cxxType_orig):
    """Return the C++ type to use in generated signatures for ``cxxType_orig``.

    The type is normalised (references dropped, whitespace removed apart from
    the one following "const" / "unsigned") then looked up, with and without
    a leading "const ", in CXX_TYPE_CONV_TABLE and CXX_NON_CONV_TYPES. Types
    found in the conversion table are replaced (const is dropped), types
    listed as non-converted are returned exactly as they were passed in.
    Any other type is reported and the script exits with status 1.
    """
    normalised = cxxType_orig.replace("&", "") \
        .replace("const ", "const[preserved_space]") \
        .replace("unsigned const", "const unsigned")

    # Remove any whitespace but keep "const" and "unsigned" safe
    normalised = normalised.replace("const ", "const[preserved_space]") \
        .replace("unsigned ", "unsigned[preserved_space]")
    normalised = "".join(normalised.split())
    normalised = normalised.replace("[preserved_space]", " ")

    # Try the full type first, then the type without its leading const.
    candidates = [normalised]
    if normalised.startswith("const "):
        candidates.append(normalised[len("const "):])

    for candidate in candidates:
        if candidate in CXX_TYPE_CONV_TABLE:
            # const is dropped for legacy
            return CXX_TYPE_CONV_TABLE[candidate]
        # Most types won't need processing
        if candidate in CXX_NON_CONV_TYPES:
            return cxxType_orig

    # Error on unknown types
    print(f"Unknown type: {normalised}")
    sys.exit(1)


def attrTypeGetter(ty):
    """Return the attribute kind suffix matching the type class ``ty``.

    The suffix is used to build the name of the setter called on the
    generated node (``new_node-><suffix>_(...)``). Asserts if ``ty`` isn't a
    known type class.
    """
    suffix_by_type_class = dict(
        CHAR="i",
        INT="i",
        INT_VEC="is",
        FLOAT="f",
        FLOAT_VEC="fs",
        STRING="s",
        STRING_VEC="ss",
        DEBUG_CONTEXT="x",
    )
    assert ty in suffix_by_type_class, "Invalid type: " + ty
    return suffix_by_type_class[ty]


def addCastingOptStr(name):
    """Return the ImplicitCast mode to use for the op called ``name``.

    The casting tables are checked in order and the first one containing
    the op wins; ops not listed anywhere don't implicitly cast.
    """
    modes = (
        (CastingOps, "ImplicitCast::All"),
        (CastingExceptFirstArgsOps, "ImplicitCast::ExceptFirst"),
        (CastingExceptSecondArgsOps, "ImplicitCast::ExceptSecond"),
        (CastingExceptThirdArgsOps, "ImplicitCast::ExceptThird"),
        (CastingExceptFourthFifthArgsOps, "ImplicitCast::ExceptFourthFifth"),
    )
    for ops, mode in modes:
        if name in ops:
            return mode
    return "ImplicitCast::None"


def addOutputTypeStr(name):
    """Return the OutputType to use for the op called ``name``.

    The rules are evaluated in order and the first table containing the op
    wins. Ops which aren't listed anywhere are reported on stdout and get
    ``OutputType::Unknown``.
    """
    rules = (
        ((CastingAlwaysBoolOutput, OutputTypeAlwaysBool),
         "OutputType::AlwaysBool"),
        ((CastingAlwaysFloatOutput, OutputTypeAlwaysFloat),
         "OutputType::AlwaysFloat"),
        ((CastingAlwaysIntOutput, OutputTypeAlwaysInt32),
         "OutputType::AlwaysInt"),
        ((CastingOps, CastingExceptFirstArgsOps, CastingExceptSecondArgsOps,
          CastingExceptThirdArgsOps), "OutputType::AsImplicitCastPromoted"),
        ((OutputTypeSameAsFirstInput, ), "OutputType::AsFirstInput"),
        ((FirstOutputTypeSameAsFirstInputButSecondAlwaysInt, ),
         "OutputType::FirstAsFirstInputSecondAlwaysInt"),
        ((OutputTypeSameAsThirdInput, ), "OutputType::AsThirdInput"),
        ((OutputTypeAlwaysUint8, ), "OutputType::AlwaysUint8"),
        ((OutputTypeAsDtype, ), "OutputType::AsDtype"),
        ((OutputTypeAsDtypeOrAsPromoted, ),
         "OutputType::AsDtypeOrAsPromoted"),
        ((OutputTypeVariable, ), "OutputType::Unknown"),
    )
    for tables, output_type in rules:
        if any(name in table for table in tables):
            return output_type

    print(f"Missing type spec for: {name}")
    return "OutputType::Unknown"


# Accumulators for the content of the three generated files.
macroFile = ""
headerStubs = ""
cxxFile = ""

# Names of the opsets found by the parser, processed in reverse order.
classes = list(jsonOutput)
classes.reverse()

# For every supported op of every opset, append:
# - an OP_DECL(...) line to macroFile,
# - the declaration of a create<Op>() function to headerStubs,
# - the definition of that function to cxxFile.
for opset in classes:
    macroFile += "// Ops from %s\n" % opset
    for name in jsonOutput[opset]:
        if name in UnsupportedOps:
            continue

        logger.debug("Generating code for %s::%s", opset, name)
        # Generate the macro
        opDecl = "OP_DECL("

        funcName = name.capitalize()
        opDecl += "popart, " + name + ", " + name

        # All the AiOnnxOpset* opsets are emitted as AiOnnxOpset11.
        if opset.startswith("AiOnnxOpset"):
            opDecl += ", AiOnnxOpset11." + name
        else:
            opDecl += ", " + opset + "." + name

        argVector = ""
        bodyArgVector = ""

        # The op is skipped unless it has an argument called "args" and all
        # its other arguments have a known type.
        earlyExit = True
        args = jsonOutput[opset][name]["args"]
        for arg in args:
            # Skip the "args" argument (no ARG / BODY_ARG is emitted for it).
            if arg["name"] == "args":
                # Guarantee we are working with an op which takes in popart tensors as 0th argument.
                earlyExit = False
                continue

            macroType = toType(arg["type"])

            if macroType == "UNKNOWN":
                logger.info("Skipping OP: %s"
                            " due to parse failure on %s", name, str(arg))
                earlyExit = True
                break

            # The debug context is not part of the op's arguments, it is only
            # added to the body (see below).
            if arg["name"] != "debugContext":
                argVector += "ARG(" + macroType + "," + arg["name"] + ") "

            # Reduction enums are cast back to their popart type in the body.
            if any(arg["type"].endswith(s)
                   for s in ["ReductionType", "ScatterReduction"]):
                bodyArgVector += f"BODY_ARG(static_cast<{clean(arg['type'])}>("\
                + arg["name"] + ")) "
            elif arg["name"] == "debugContext":
                bodyArgVector += "BODY_ARG(DEBUG_CONTEXT(\"" + funcName + "\"))"
            else:
                bodyArgVector += "BODY_ARG(" + arg["name"] + ") "

        if earlyExit:
            continue

        if argVector == "":
            argVector = "NONE"

        if bodyArgVector == "":
            bodyArgVector = "NONE"

        opDecl += ", " + argVector
        opDecl += ", " + bodyArgVector

        macroFile += opDecl + ")\n"

        # Signature shared by the declaration and the definition of the
        # create function; one parameter per attribute is appended below.
        header = "torch::jit::Node* "

        header += "create" + funcName + "(torch::jit::Graph *graph,  const " \
            "std::vector<torch::jit::Value *>& args"

        # Body of the create function.
        cppFile = " torch::jit::Node *new_node = createAndInsertNode(graph, " \
               "symbols::popart::" + name + ", args"

        cppFile += f", {addCastingOptStr(name)}, {addOutputTypeStr(name)}"

        if name in MultipleOutputsOps:
            cppFile += ", %s" % MultipleOutputsOps[name]
        cppFile += ");\n"

        args = jsonOutput[opset][name]["args"]
        for arg in args:
            # Skip the "args" argument
            if arg["name"] == "args":
                continue

            # "x" is the kind returned for the debug context: it is neither a
            # parameter nor an attribute of the node.
            attr = attrTypeGetter(toType(arg["type"]))
            if attr == "x":
                continue

            header += "," + convertCxxConvert(arg["type"]) + " " + arg["name"]


            # Store the argument as an attribute of the new node.
            cppFile += "new_node->" + attr + "_(c10::Symbol::attr("\
                "\"" + arg["name"] + "\")," + arg["name"] + ");\n"

        if name in ExtraArgumentOps:
            header += ", " + ", ".join(ExtraArgumentOps[name])

        # For these ops an extra setNodeOutputsTypes() call is emitted after
        # the attributes have been set on the node.
        if name in OutputTypeAsDtype:
            cppFile += "setNodeOutputsTypes(new_node, ImplicitCast::None, "
            cppFile += "OutputType::AsDtype);\n"
        if name in OutputTypeAsDtypeOrAsPromoted:
            cppFile += "setNodeOutputsTypes(new_node, ImplicitCast::All, "
            cppFile += "OutputType::AsDtypeOrAsPromoted);\n"

        cppFile += "return new_node;\n"

        # Definition: signature followed by the body.
        cppFile = header + ") {\n" + cppFile + "}"

        # Declaration: signature only.
        header += ");"

        headerStubs += header + "\n"

        cxxFile += cppFile + "\n"

# Banner written at the top of every generated file.
autoComment = """// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
// Auto generated file, do not modify
// Run `python3 scripts/PopParse.py` to regenerate
// clang-format off
"""

# (path relative to the sources directory, content) of each generated file,
# written in this order.
generated_files = (
    (('popart_compiler', 'include', 'popart_compiler',
      'CompilerOperationMacros.inc.hpp'), macroFile),
    (('poptorch', 'source', 'include', 'poptorch', 'CompilerOps.inc.hpp'),
     headerStubs),
    (('poptorch', 'source', 'CompilerOps.cpp.inc'), cxxFile),
)

for relative_path, content in generated_files:
    with open(os.path.join(_utils.sources_dir(), *relative_path), 'w') as f:
        print(autoComment, file=f)
        print(content, file=f)


================================================
FILE: scripts/PopTorchHandlers.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import os

from popgen.api import expand, convert, generate
from popgen.helpers import cint, clong, cstr, tensor_list
from popgen.values import OriginalNode
from popgen.operatorfactory import op
from utils import _utils

# Name of this script and directory of the generated file, both passed to
# generate() at the end of the script.
script = "PopTorchHandlers.py"
output_dir = os.path.join(_utils.sources_dir(),
                          "poptorch/source/popart_canonicalization")

# Handlers registered through popgen's convert() / expand() API.
# NOTE(review): the second argument of convert() looks like the number of
# inputs of the op - confirm against popgen.api.
convert("recomputation_checkpoint", 1, "recomputationCheckpoint")
convert("update_param_inplace", 2, "copyvarupdate")

expand("begin_ipu_block", lambda x, y, z: op.beginIpuBlock(
    clong(x), clong(y), clong(z)))

expand("internal_cast", lambda tensor, dtype: op.internalCast(
    tensor, cstr(dtype)))
expand("call_cpu_op", lambda x, s: op.callCpuOp(tensor_list(x), cstr(s),
                                                OriginalNode()))
expand("identity_loss", lambda x, r: op.identityloss(x, cint(r)))
expand("optimizer_group", lambda x, l: op.optimizerGroup(
    clong(x), tensor_list(l)))
expand(
    "set_matmul_serialization", lambda x, s, a, b: op.setMatMulSerialization(
        x, cstr(s), clong(a), cint(b)))
expand("start_for_loop", op.startForLoop)
expand(
    "end_for_loop", lambda output, inputs, trip_count: op.endForLoop(
        output, inputs, clong(trip_count)))

expand("start_if_block", op.startIfBlock)
expand("start_else_block", op.startElseBlock)
expand("end_if_block", op.endIfBlock)

expand("nop", op.nop)

# These are graph annotations: they don't take any arguments and don't return
# anything: we just want to pass them through to the lowering stage.
expand("end_ipu_block", op.passThrough)
expand("begin_multi_conv", op.passThrough)
expand("pop_name_scope", op.passThrough)
expand("end_cpu_op", op.passThrough)

# Emit the handlers registered above for the symbols::poptorch namespace.
generate(script, "symbols::poptorch", output_dir + "/PoptorchHandlers.gen.cpp",
         globals())


================================================
FILE: scripts/__init__.py
================================================


================================================
FILE: scripts/apply_linters.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
from typing import Tuple

import argparse
import collections
import difflib
import enum
import hashlib
import json
import logging
import os
import pathlib
import re
import sys
import tempfile
import time
import packaging.version
import yaml

from utils import _utils

# Logger used throughout this script; handed to the shared helper for setup.
logger = logging.getLogger("apply_linters")
_utils.set_logger(logger)

# Command line flags passed to yapf.
yapf_flags = "--style='{based_on_style: pep8}'"
# Names of the cpplint checks which are disabled.
cpp_lint_disabled = [
    "runtime/string", "runtime/references", "build/c++11",
    "build/header_guard", "whitespace/comments", "whitespace/indent"
]


class OutputProcessor:
    """Interface of the callables used to post-process a command's output."""

    def __call__(self, raw_output: str, returncode: int) -> Tuple[str, int]:
        """Return the (output, returncode) pair to use instead of the raw
        one. Must be implemented by the child class."""
        raise NotImplementedError


class SaveOutput(OutputProcessor):
    """Output processor keeping a copy of the last raw output it was given.

    The output and return code are forwarded unchanged.
    """

    def __init__(self):
        # Last raw output seen (empty until the processor is called).
        self.output = ""

    def __call__(self, raw_output: str, returncode: int) -> Tuple[str, int]:
        self.output = raw_output
        result = (raw_output, returncode)
        return result


class GitStrategy(enum.Enum):
    """Ways of selecting the files to lint using git."""
    Master = "master"  # Files modified / added between HEAD and origin/master
    Head = "head"  # Files modified / added in the last commit
    Diff = "diff"  # Files modified / added but not committed
    All = "all"  # All files tracked by git
    PreCommit = "pre-commit"  # pre-commit is like "master" except it takes
    # precedence over the files provided on the
    # command line.


class ILinterFamily:
    """Group of linters which all run on the same kind of files (e.g cpp or
    py)
    """

    def __init__(self, supported_extensions, linters,
                 excluded_extensions=None):
        """
        :param supported_extensions: Array of extensions supported by the
            linters (e.g ["hpp","cpp"])
        :param linters: list of linters to run for the matching files
        :param excluded_extensions: Optional list of extensions to exclude
        """
        self._linters = linters
        self._supported = self._extensions_regex(supported_extensions)
        self._excluded = (self._extensions_regex(excluded_extensions)
                          if excluded_extensions else None)
        # The linters' installation is only checked for the first file linted.
        self.first_lint = True

    @staticmethod
    def _extensions_regex(extensions):
        """Compile the expression matching files with one of ``extensions``"""
        # ["hpp","cpp"] -> ".*\.(hpp|cpp)$"
        return re.compile(r".*\.(%s)$" % '|'.join(extensions))

    def gen_lint_commands(self, filename, autofix):
        """Return the list of commands to run to lint ``filename`` (empty if
        the file is not handled by this family). Exits the script if one of
        the linters isn't correctly installed."""
        if self._supported.match(filename) is None:
            logger.debug("%s didn't match %s", filename, self._supported)
            return []
        if self._excluded is not None and self._excluded.match(filename):
            logger.debug("%s matched exclusion %s", filename, self._excluded)
            return []

        if self.first_lint:
            # Check all the linters are correctly installed
            self.first_lint = False
            if not all(linter.check_version() for linter in self._linters):
                print("\nERROR: You need a valid PopTorch buildenv to run "
                      "the linters:")
                print("- create a buildenv using scripts/create_buildenv.py")
                print(
                    "- activate the environment: source activate_buildenv.sh")
                print("- configure your PopTorch build: cmake "
                      "../poptorch -DPOPLAR_SDK=...")
                sys.exit(1)

        commands = []
        for linter in self._linters:
            if linter.is_enabled(filename, autofix):
                commands.append(linter.gen_lint_command(filename, autofix))
        return commands


class CppLinters(ILinterFamily):
    """Linters running on the C++ sources and headers (*.inc.hpp and
    *.inc.cpp files are excluded)."""

    def __init__(self):
        excluded = ["inc.hpp", "inc.cpp"]
        cpp_linters = [ClangTidy(), ClangFormat()]
        super().__init__(["hpp", "cpp"],
                         linters=cpp_linters,
                         excluded_extensions=excluded)


class PyLinters(ILinterFamily):
    """Linters running on the python files."""

    def __init__(self):
        python_linters = [Pylint(), Yapf()]
        super().__init__(["py"], linters=python_linters)

    # NOTE(review): ILinterFamily.gen_lint_commands() only consults the
    # is_enabled() method of the individual linters, not this one - confirm
    # this method is called from somewhere else.
    def is_enabled(self, filename, autofix):  # pylint: disable=unused-argument
        """Return False for the buildenv config files (*.buildenv.py)."""
        # Don't run PyLint on the buildenv config files
        is_buildenv_config = re.match(r".*\.buildenv\.py$", filename)
        return is_buildenv_config is None


class ILinter:
    """Interface implemented by every linter"""

    def gen_lint_command(self, filename, autofix):
        """Return the command(s) to run to lint ``filename``"""
        raise RuntimeError("Must be implemented by child class")

    def check_version(self):
        """Return whether the linter is installed. (Only called once)"""
        raise RuntimeError("Must be implemented by child class")

    def is_enabled(self, filename, autofix):  # pylint: disable=unused-argument
        """Return whether the linter should run on ``filename``: always, by
        default."""
        return True


class ProcessManager:
    """Singleton limiting the number of processes running at the same time.

    If ``max_num_proc`` is 0 there is no limit: processes are created as
    soon as they are enqueued.
    """
    _manager = None

    @staticmethod
    def create(max_num_proc=0):
        """Create the singleton (which must not exist yet)."""
        assert ProcessManager._manager is None
        ProcessManager._manager = ProcessManager(max_num_proc)

    @staticmethod
    def get():
        """Return the singleton, creating it with the default settings if
        needed."""
        if ProcessManager._manager is None:
            ProcessManager.create()
        manager = ProcessManager._manager
        return manager

    def __init__(self, max_num_proc):
        self.max_num_proc = max_num_proc
        # Functions creating the processes waiting for a free slot.
        self.queue = []
        # Objects returned by the creation functions already called.
        self.running = []
        self.num_running = 0

    def enqueue(self, create_proc_fn):
        """Call ``create_proc_fn`` now if there is no limit, otherwise queue
        it until a slot is available."""
        if self.max_num_proc != 0:
            self.queue.append(create_proc_fn)
            self.update()
            return

        create_proc_fn()

    def update(self):
        """Release the slots of the completed processes then start as many
        queued processes as there are free slots."""
        # Check the status of all the running processes and update
        # num_running for the ones which just returned.
        still_running = []
        for proc in self.running:
            if proc.is_running():
                still_running.append(proc)
                continue
            self.num_running -= 1
            logger.debug("Process completed, %d/%d processes in use",
                         self.num_running, self.max_num_proc)
        self.running = still_running

        # Start new processes if slots are available
        while self.queue and self.num_running < self.max_num_proc:
            started = self.queue[0]()
            self.running.append(started)
            self.queue = self.queue[1:]
            self.num_running += 1
            logger.debug("Process started, %d/%d processes in use",
                         self.num_running, self.max_num_proc)


class Command:
    """Run a command asynchronously in a sub shell"""

    def __init__(self,
                 *cmd,
                 stop_on_error=True,
                 print_output=True,
                 output_processor=None,
                 name=None,
                 print_output_on_error=True):
        # Stop on error
        shell_prefix = "set -e;" if stop_on_error else ""
        self.cmd = shell_prefix + " ".join(cmd)
        self.output_processor = output_processor
        self.print_output = print_output
        # Set once the process has actually been created.
        self.proc = None
        self.output = ""
        self.name = name or cmd[0]
        self.print_output_on_error = print_output_on_error

    def start(self):
        """Ask the process manager to create the process."""
        ProcessManager.get().enqueue(self._create_proc)

    def _create_proc(self):
        """Create and return the process running the command."""
        assert self.proc is None, "Process already started"
        self.output = ""

        def _collect(line):
            self.output += line + "\n"

        # We make sure that the PYTHONPATH is clear because we do not want the
        # linter to undertake run-time inspection of the poptorch module.
        env = os.environ.copy()
        env["PYTHONPATH"] = ""
        env.pop("CPATH", None)

        self.proc = _utils.Process([self.cmd],
                                   redirect_stderr=True,
                                   env=env,
                                   stdout_handler=_collect)
        return self.proc

    def is_running(self):
        """A command whose process hasn't been created yet is considered to
        be running."""
        if self.proc is None:
            return True
        return self.proc.is_running()

    def wait(self):
        """Wait for the command to complete, print its output as requested
        and return its (possibly post-processed) return code."""
        # The process might still be waiting in the manager's queue.
        while self.proc is None:
            ProcessManager.get().update()
            time.sleep(1)
        returncode = self.proc.wait()
        output = self.output

        logger.debug("Command %s returned with %d", self.name, returncode)
        if self.output_processor:
            output, returncode = self.output_processor(output, returncode)

        failed = self.print_output_on_error and returncode
        if failed:
            print(f"{self.name} failed with exit code {returncode}")
            print("Output:")
            print(output)
        elif self.print_output and output:
            print(f"Output of {self.name}:")
            print(output)
        return returncode

    def run(self):
        """Start the command and wait for it to complete."""
        self.start()
        return self.wait()


class CondaCommand(Command):
    """Command activating a Conda buildenv before running"""
    # Activation command shared by all the instances (computed only once).
    activate_cmd = None

    def __init__(self, *cmd, name=None, **kwargs):
        if CondaCommand.activate_cmd is None:
            CondaCommand.activate_cmd = get_conda_activate_cmd()
            logger.debug("Activate command initialised to %s",
                         CondaCommand.activate_cmd)
        # Without a command only the activation command is initialised.
        if not cmd:
            return
        command_name = name or cmd[0]
        super().__init__(CondaCommand.activate_cmd,
                         *cmd,
                         name=command_name,
                         **kwargs)


class ClangTools:
    """Paths of the clang tools used by the linters.

    The paths are relative to ${CONDA_PREFIX}, which is left unexpanded in
    the returned strings.
    """
    _llvm_path = "${CONDA_PREFIX}"

    @staticmethod
    def _tool(executable):
        """Return the path of ``executable`` in the tools' directory."""
        return os.path.join(ClangTools.path(), executable)

    @staticmethod
    def path():
        """Directory containing the tools."""
        bin_dir = os.path.join(ClangTools._llvm_path, "bin")
        return bin_dir

    @staticmethod
    def clang_format():
        return ClangTools._tool("clang-format")

    @staticmethod
    def clang_tidy():
        return ClangTools._tool("clang-tidy")

    @staticmethod
    def clang_apply_replacements():
        return ClangTools._tool("clang-apply-replacements")


def get_conda_activate_cmd():
    """Return the shell snippet needed to activate the Conda buildenv.

    Returns an empty string if ``CONDA_PREFIX`` is already set in the
    environment. Otherwise returns a command sourcing
    ``.linters/activate_buildenv.sh`` (looked up in the parent folder of the
    folder containing this script).

    Raises:
        RuntimeError: if no environment is active and the activation script
            cannot be found.
    """
    if "CONDA_PREFIX" in os.environ:
        logger.debug("Conda environment active, nothing to do")
        return ""

    script_dir = os.path.dirname(os.path.realpath(__file__))
    sources_dir = os.path.dirname(script_dir)
    activate_script = os.path.join(sources_dir, ".linters",
                                   "activate_buildenv.sh")
    if os.path.isfile(activate_script):
        return f". {activate_script};"

    message = "\n".join([
        "No active Conda environment, you need to either activate "
        "it or create a link to it:",
        ". ../build/activate_buildenv.sh",
        "or",
        f"ln -sf /my/build/activate_buildenv.sh {activate_script}",
    ])
    raise RuntimeError(message)


def offset_to_line(filename, offsets):
    """Convert a list of offsets in a file to a dictionary of line, column.
    [ offset ] -> { offset: (line,column) }

    Offsets are byte offsets from the start of the file; lines and columns
    are 1-based and the column is expressed in bytes. The file is therefore
    read in binary mode so that multi-byte characters and "\\r\\n" line
    endings do not shift the result.

    Args:
        filename: path of the file the offsets refer to. If empty, every
            offset is mapped to (0, 0) and no file is read.
        offsets: iterable of integer offsets (duplicates are allowed).

    Returns:
        A dictionary mapping each distinct offset to its (line, column).

    Raises:
        RuntimeError: if an offset is past the end of the file.
    """
    if not filename:
        return {offset: (0, 0) for offset in offsets}

    # Process the offsets in increasing order so that a single pass over
    # the file is enough.
    pending = iter(sorted(set(offsets)))
    mappings = {}
    offset = next(pending, None)
    if offset is None:
        # Nothing to map: don't even open the file.
        return mappings

    file_offset = 0
    with open(filename, "rb") as src:
        for line_num, raw_line in enumerate(src, start=1):
            line_start = file_offset
            file_offset += len(raw_line)
            # Map every remaining offset which falls inside this line.
            while offset < file_offset:
                mappings[offset] = (line_num, offset - line_start + 1)
                offset = next(pending, None)
                if offset is None:
                    return mappings
    raise RuntimeError(f"Invalid offset {offset} (File length: {file_offset})")


class DiffCreator:
    """Create a diff between the output of a command and the content of file.
    Some linters (for example yapf) print the modified file to stdout instead
    of modifying it in-place.
    This class will create a diff with the original file and print the
    differences.
    If autofix is enabled, the content of the original file
    will be replaced.
    """

    def __init__(self, filename, linter, autofix):
        """
        Args:
            filename: file the linter was run on.
            linter: name of the linter, used in the messages printed.
            autofix: if True, overwrite ``filename`` with the linter output
                when it differs from the current content.
        """
        self.filename = filename
        self.linter = linter
        if autofix:
            self.linter += "(autofix)"
        self.autofix = autofix

    def __call__(self, output, errcode):
        """Called by Command with the output of the linter

        Args:
            output: full content of the file as reformatted by the linter.
            errcode: exit code of the linter.

        Returns:
            (delta, errcode): the unified diff as a string (empty if the
            file is already formatted) and the exit code, forced to 1 if
            there is any difference.
        """
        # Use a context manager so the file handle is closed straight away
        # (it might be re-opened for writing below).
        with open(self.filename) as original:
            origin = original.readlines()
        new = output.splitlines(True)
        diff_lines = []
        for line in difflib.unified_diff(origin,
                                         new,
                                         fromfile="a/" + self.filename,
                                         tofile="b/" + self.filename):
            m = re.match(r"@@ -(\d+),.*@@", line)
            if m:
                # A hunk starts with 3 lines of context: skip them to point
                # at the first modified line.
                print(f"{self.filename}:{int(m.group(1))+3}:error:"
                      f"[{self.linter}] to fix run "
                      "./scripts/apply_linters.py --autofix")
            diff_lines.append(line)
        delta = "".join(diff_lines)
        if delta:
            if self.autofix:
                with open(self.filename, "w") as f:
                    f.write(output)
            else:
                print(f"{self.linter} found the following issues in "
                      f"{self.filename}\n{delta}")
            errcode = 1
        return delta, errcode


class VersionParseCommandBase(CondaCommand):
    """Base class of the commands used to check the version of a tool.

    Derived classes implement ``_parse_version`` which is expected to set
    ``self.version`` from the output of the command.
    """

    def __init__(self, *cmd, **kwargs):
        super().__init__(*cmd, **kwargs)
        # Set by _parse_version(); stays None if the version was not found.
        self.version = None

    def _parse_version(self, output, return_code):
        raise NotImplementedError("Must be implemented in the derived type")

    def run_and_compare_versions(self, expected):
        """Run the command and return True if the version it reported is
        exactly the ``expected`` one (given as a string)."""
        self.run()
        wanted = packaging.version.parse(expected)
        if wanted == self.version:
            return True
        logger.error("Required version of %s is %s, but found %s", self.name,
                     wanted, self.version)
        return False


class VersionJSONParseCommand(VersionParseCommandBase):
    """Read the version of a package from its JSON file in the conda-meta
    folder of the Conda environment."""

    def __init__(self, command_name):
        # Only the line containing the "version" key is extracted.
        meta_files = f"${{CONDA_PREFIX}}/conda-meta/{command_name}-*.json"
        super().__init__("grep \\\"version\\\" " + meta_files,
                         print_output=False,
                         output_processor=self._parse_version)

    def _parse_version(self, output, return_code):
        """Output processor: store the parsed version in ``self.version``
        unless the command failed."""
        if not return_code:
            # Wrap the extracted line in braces to make it valid JSON.
            fields = json.loads("{" + output + "}")
            self.version = packaging.version.parse(fields["version"])
        return output, return_code


class VersionParseCommand(VersionParseCommandBase):
    """Read the version of a tool from the output of ``<tool> --version``."""

    def __init__(self, version_re_prefix, command_name):
        """
        Args:
            version_re_prefix: regular expression matching the text which
                precedes the version number in the output.
            command_name: executable to run with ``--version``.
        """
        super().__init__(command_name,
                         "--version",
                         print_output=False,
                         output_processor=self._parse_version)
        self.version_re_prefix = version_re_prefix
        self.version = None

    def _parse_version(self, output, return_code):
        """Output processor: store the version found in ``output`` (if any)
        in ``self.version`` unless the command failed."""
        if return_code:
            return output, return_code

        pattern = f"{self.version_re_prefix} ([.0-9]+)"
        found = re.search(pattern, output, flags=re.MULTILINE)
        if found:
            self.version = packaging.version.parse(found.group(1))

        return output, return_code


def compare_versions_from_conda(command_name, expected):
    """Return True if the version of ``command_name`` recorded in the Conda
    metadata is the ``expected`` one."""
    return VersionJSONParseCommand(command_name).run_and_compare_versions(
        expected)


def compare_versions_from_output(command_name,
                                 expected,
                                 version_re_prefix=None):
    """Return True if ``command_name --version`` reports the ``expected``
    version.

    ``version_re_prefix`` is the regular expression matching the text which
    precedes the version number in the output; it defaults to the name of
    the command.
    """
    prefix = command_name if version_re_prefix is None else version_re_prefix
    return VersionParseCommand(prefix,
                               command_name).run_and_compare_versions(expected)


class ClangFormat(ILinter):
    """Linter formatting the C++ files using clang-format."""

    def gen_lint_command(self, filename, autofix):
        """Return the command to run on ``filename``.

        With ``autofix`` the file is modified in place, otherwise the
        formatted output is compared with the file by a DiffCreator.
        """
        if autofix:
            flags, output_processor = " -i", None
        else:
            flags = ""
            output_processor = DiffCreator(filename, "clang-format", autofix)

        return CondaCommand(ClangTools.clang_format(),
                            flags,
                            filename,
                            print_output=autofix,
                            output_processor=output_processor)

    def check_version(self):
        """Return True if the expected version of clang-format is found."""
        expected = "13.0.1"
        return compare_versions_from_output(ClangTools.clang_format(),
                                            expected, "version")


class ClangTidy(ILinter):
    """Linter running clang-tidy on the C++ files.

    The list of checks is split in batches and one clang-tidy process is
    created per batch (see check_version() and gen_lint_command()). The
    reports of all the processes are combined by a ResultsProcessor.
    """

    class ResultsProcessor(OutputProcessor):
        """Wait for all the jobs to complete then combine and process their
        outputs.

        A single instance is shared by all the clang-tidy commands generated
        for a file: each command writes its report in ``tmp_folder`` and
        calls this object when it completes.
        """

        def __init__(self, num_jobs, autofix):
            # Number of commands which still have to call this object.
            self.num_jobs = num_jobs
            # Folder in which the commands export their YAML reports.
            self.tmp_folder = tempfile.TemporaryDirectory(
                prefix="poptorchLinter_")
            self.autofix = autofix

        def __call__(self, raw_output, returncode):
            """Called once per clang-tidy command.

            Nothing is printed until the last command has completed; the
            diagnostics found in all the reports are then combined,
            de-duplicated and printed as ``file:line:col: error: message``.
            With autofix, clang-apply-replacements is then run on the
            folder containing the reports.

            Returns ``raw_output`` and ``returncode`` unmodified.
            """
            self.num_jobs -= 1
            logger.debug("1 clang-tidy job completed, %d remaining",
                         self.num_jobs)
            logger.debug("clang-tidy output: %s", raw_output)
            if self.num_jobs == 0:
                diagnostics = []
                # Combine the diagnostics from the different reports
                for f in pathlib.Path(self.tmp_folder.name).glob("*.yaml"):
                    with open(f) as file:
                        res = yaml.full_load(file)
                        # Combine the diagnostics
                        diagnostics += res.get("Diagnostics", [])

                # Error messages are linked to a file + offset
                # Collect the "offsets" used for each filename
                offsets = collections.defaultdict(list)
                for diag in diagnostics:
                    msg = diag["DiagnosticMessage"]
                    offsets[msg["FilePath"]].append(msg["FileOffset"])

                # Create a map of maps linking offsets in files to their
                # corresponding line and column:
                # line_mapping[filename] = { offset : (line, col) }
                line_mappings = {}
                for filename, file_offsets in offsets.items():
                    # Don't lint files in the build folder
                    if not os.path.isabs(filename):
                        continue
                    line_mappings[filename] = offset_to_line(
                        filename, file_offsets)
                printed = []
                for diag in diagnostics:
                    msg = diag["DiagnosticMessage"]
                    filename = msg["FilePath"]
                    # Don't lint files in the build folder
                    if not os.path.isabs(filename):
                        continue
                    line, col = line_mappings[filename][msg["FileOffset"]]
                    error = "error"
                    if self.autofix and msg["Replacements"]:
                        error += " (autofixed)"
                    output = f"{filename}:{line}:{col}: {error}: "
                    output += f"{msg['Message']} [{diag['DiagnosticName']}]"

                    # If this message has already been printed: skip it
                    if output in printed:
                        continue
                    # Print the header before the first message only
                    if not printed:
                        print("Output of clang-tidy:")
                    print(output)
                    printed.append(output)
                if not printed and returncode != 0:
                    # If we didn't manage to parse the diagnostics but clang-tidy
                    # returned a failure at least print the raw output.
                    print(raw_output)
                # Apply the fixes using clang-apply-replacements
                if self.autofix:
                    CondaCommand(ClangTools.clang_apply_replacements(),
                                 self.tmp_folder.name).run()
            return raw_output, returncode

    def __init__(self):
        # List of '--config="..."' arguments: one per batch of checks
        # (populated by check_version()).
        self.configs = []
        # System include folders reported by g++ (populated by
        # check_version()).
        self.includes = []
        # Map: source file -> (compilation command, working directory)
        # (populated by process_compile_commands()).
        self.compile_commands = {}

    def get_compile_commands_flags(self, filename):
        """Return the ``(flags, working directory)`` to use to lint
        ``filename``.

        For a cpp file present in compile_commands.json its own entry is
        returned. Otherwise (header, or cpp file missing from
        compile_commands.json) the entry of the first known file whose path
        starts with the folder expected to contain the matching sources is
        returned.

        Returns ``("", "")`` if nothing suitable was found.
        """
        if filename.endswith("cpp"):
            if filename in self.compile_commands:
                return self.compile_commands[filename]
            logger.warning(
                "%s is absent from compile_commands.json: check "
                "CMakeLists.txt to make sure it's compiled", filename)
            # Fall through to header path to try to find
            # flags for files in the same folder

        folder = os.path.dirname(filename)
        filename = os.path.basename(filename)
        path = folder.split(os.path.sep)
        # If it's a public header then it will be in
        # poptorch/component/include/component/my_header.hpp
        # and the cpp files will be in /component/source/
        #
        # Therefore we need to replace "include/component" with "source"
        # to find a cpp file with the compilation flags we want.
        if "include" in path:
            # Remove folders in path up to "include"
            while path.pop() != "include":
                continue

            # Types is a sub module in popart_compiler, so we want to go up one more level.
            # NOTE(review): path[-1] assumes at least one folder is left in
            # front of "include" - confirm this always holds.
            if path[-1] == "types":
                path.pop()

            # TODO(T49191) lower_to_poplar, dialect and pytorch_bridge don't
            # have their sources in a "source" subfolder at the moment.
            exceptions = ["lower_to_poplar", "pytorch_bridge", "dialect"]
            if not "source" in path and not any(comp in path
                                                for comp in exceptions):
                # Point at "source" instead
                path.append("source")
        # else it's a private header: nothing to do, it's already in the same
        # folder as the source files.
        folder = os.path.join(*path)

        # Use the flags of the first file found in that folder ("path" is
        # re-used here as the loop variable).
        for path, flags in self.compile_commands.items():
            if path.startswith(folder):
                logger.debug("Found flags for folder %s", folder)
                return flags
        logger.warning("No compilation flags found for folder %s", folder)
        return ("", "")

    def gen_lint_command(self, filename, autofix):
        """Return the list of clang-tidy commands to run on ``filename``:
        one per batch of checks in ``self.configs``.

        All the commands share the same ResultsProcessor and export their
        fixes to a different report in its temporary folder.
        """
        # The configs are created by check_version(): make sure it was run.
        if not self.configs:
            self.check_version()
        gcc_flags, work_dir = self.get_compile_commands_flags(filename)
        flags = "-std=c++17 -fsized-deallocation -DONNX_NAMESPACE=onnx "
        flags += gcc_flags
        flags += " -I" + " -I".join(self.includes)
        # Run clang-tidy from the folder the file is compiled from (if known)
        cd = ""
        if work_dir:
            cd = f"cd {work_dir};"

        commands = []
        results = ClangTidy.ResultsProcessor(len(self.configs), autofix)
        # Clang-tidy has a lot of checks so we run them in parallel in
        # different processes
        for i, c in enumerate(self.configs):
            report = os.path.join(results.tmp_folder.name, f"report_{i}.yaml")
            commands.append(
                CondaCommand(cd,
                             ClangTools.clang_tidy(),
                             "--quiet",
                             os.path.realpath(filename),
                             f"--export-fixes={report}",
                             c,
                             "--",
                             flags,
                             name=("clang-tidy --quiet "
                                   f"{filename} -- {flags}"),
                             output_processor=results,
                             print_output_on_error=False,
                             print_output=False))
        return commands

    def process_compile_commands(self, commands):
        """Populate ``self.compile_commands`` from the content of
        compile_commands.json.

        ``commands`` is the parsed JSON: a list of dictionaries of which the
        "command", "file" and "directory" entries are used.
        """
        # Some flags are not supported by clang-tidy
        unsupported_flags = ["-fno-semantic-interposition"]
        for c in commands:
            gcc_flags = c["command"].split()
            cmd = " ".join(
                [f for f in gcc_flags if f not in unsupported_flags])
            # Only keep the part of the path after the last "/poptorch/"
            m = re.match(".*/poptorch/(.*)", c["file"])
            assert m, f"Couldn't find '/poptorch/' in {c['file']}"

            # Exception we've got nested "poptorch" folders, so make sure
            # the path is the correct one.
            file_maybe = m.group(1)
            if not os.path.exists(file_maybe):
                file_maybe = os.path.join("poptorch", file_maybe)

            if not os.path.exists(file_maybe):
                logger.warning(
                    "compile_commands.json: %s/%s ignored: neither file exist",
                    m.group(1), file_maybe)
            # NOTE(review): despite the "ignored" wording of the warning
            # above, the entry is still recorded in that case - confirm this
            # is intended.
            self.compile_commands[file_maybe] = (cmd, c["directory"])

    # pylint: disable=too-many-return-statements
    def check_version(self):
        """Collect everything needed to run clang-tidy and check its version.

        In order: the system include folders of g++, the clang-tidy config
        and list of checks (used to create ``self.configs``), the existence
        of the include folders and the content of compile_commands.json.

        Returns False as soon as one of the steps fails, otherwise the
        result of the clang-tidy version comparison.
        """
        config = []
        self.configs = []

        def parse_config(output, returncode):
            """Store the lines of the ``--dump-config`` output in
            ``config``."""
            nonlocal config
            config = output.splitlines(True)
            # For some reason clang-tidy's config contains these options it doesn't support, so filter them out.
            excludes = [
                "FunctionHungarianPrefix", "MethodHungarianPrefix",
                "NamespaceHungarianPrefix"
            ]
            config = [
                line for line in config if not any(e in line for e in excludes)
            ]
            return output, returncode

        def parse_checks(output, returncode):
            """Split the ``--list-checks`` output in batches and create one
            ``--config`` argument per batch in ``self.configs``."""
            nonlocal config
            # Ignore first line it's the header
            all_checks = output.splitlines()[1:]
            checks_per_thread = 40
            for offset in range(0, len(all_checks), checks_per_thread):
                checks = all_checks[offset:offset + checks_per_thread]
                # The second line of the config is overwritten with the
                # checks of this batch.
                config[1] = "Checks: '" + ",".join(checks) + "'\n"
                self.configs.append("--config=\"" + "".join(config) + "\"")
            return output, returncode

        def parse_include_tests(output, returncode):
            """Any output means at least one include folder is missing:
            report it as a failure."""
            if output:
                returncode = 1
            return output, returncode

        def parse_system_includes(output, returncode):
            """Append to ``self.includes`` the indented lines found after
            the "search starts here" marker in the g++ output."""
            if returncode:
                logger.error("Failed to find system includes: %s", output)
                return output, returncode
            include_path_section = False
            for line in output.split("\n"):
                if "search starts here" in line:
                    include_path_section = True
                if include_path_section and line.startswith(" "):
                    logger.debug("Adding %s to includes", line)
                    self.includes.append(line.rstrip())
            return output, returncode

        def parse_compile_commands_file(output, returncode):
            """Parse the content of compile_commands.json."""
            if returncode:
                logger.error("compile_commands.json not found. "
                             "Make sure to build PopTorch first.")
                return output, returncode

            self.process_compile_commands(json.loads(output))
            return output, returncode

        # Command.run() returns the exit code: non-zero means failure.
        if CondaCommand("g++ -E -x c++ - -v < /dev/null",
                        print_output=False,
                        output_processor=parse_system_includes).run():
            return False

        if CondaCommand(ClangTools.clang_tidy() + " --dump-config",
                        print_output=False,
                        output_processor=parse_config).run():
            return False
        if CondaCommand(ClangTools.clang_tidy() + " --list-checks",
                        print_output=False,
                        output_processor=parse_checks).run():
            return False
        # One test per include folder: prints a message if it doesn't exist
        tests = [
            f"test -d {i} || echo \"Include folder {i} not found\""
            for i in self.includes
        ]
        if CondaCommand(";".join(tests),
                        stop_on_error=False,
                        output_processor=parse_include_tests).run():
            return False

        # Check if there is a compile_commands.json
        if CondaCommand("cat ${CONDA_PREFIX}/../compile_commands.json",
                        print_output=False,
                        output_processor=parse_compile_commands_file).run():
            return False

        return compare_versions_from_output(ClangTools.clang_tidy(), "13.0.1",
                                            "version")

    def is_enabled(self, filename, autofix):
        """Return False for the files clang-tidy must not be run on."""
        # Don't run Clang Tidy on the pybind11 modules because we don't know
        # where pybind headers are.
        return "custom_cube_op.cpp" not in filename and \
                "python/" not in filename


class Pylint(ILinter):
    """Linter running pylint on the Python files."""

    def pylint(self):
        """Path of the pylint executable."""
        return "${CONDA_PREFIX}/bin/pylint"

    def gen_lint_command(self, filename, autofix):
        """Return the pylint command to run on ``filename``."""
        options = (
            "--score=no --reports=no -j 0 --msg-template="
            "'{path}:{line}:{column}:error:pylint[{symbol}({msg_id})]: {msg}'"
            " --rcfile=.pylintrc")
        return CondaCommand(self.pylint(), options, filename)

    def check_version(self):
        """Return True if the expected version of pylint is found."""
        expected = "2.7.2"
        return compare_versions_from_output(self.pylint(), expected, "pylint")

    def is_enabled(self, filename, autofix):  # pylint: disable=unused-argument
        """Return False for the buildenv config files: PyLint must not be
        run on them."""
        is_buildenv_config = re.match(r".*\.buildenv\.py$", filename)
        return not is_buildenv_config


class Yapf(ILinter):
    """Linter formatting the Python files using yapf."""

    def yapf(self):
        """Path of the yapf executable."""
        return "${CONDA_PREFIX}/bin/yapf"

    def gen_lint_command(self, filename, autofix):
        """Return the command to run on ``filename``.

        With ``autofix`` the file is modified in place, otherwise the
        formatted output is compared with the file by a DiffCreator.
        """
        if autofix:
            flags = yapf_flags + " -i"
            output_processor = None
        else:
            flags = yapf_flags
            output_processor = DiffCreator(filename, "yapf", autofix)

        return CondaCommand(self.yapf(),
                            flags,
                            filename,
                            print_output=autofix,
                            output_processor=output_processor)

    def check_version(self):
        """Return True if the expected version of yapf is found."""
        expected = "0.27.0"
        return compare_versions_from_output(self.yapf(), expected, "yapf")


class Executor:
    """Run the groups of commands needed to lint one file.

    ``cmd`` is a list of groups of commands: all the commands of a group
    are started together and the next group is only started once they have
    all completed. The first group is started on construction.
    """

    def __init__(self, filename, cmd):
        self.filename = filename
        # Groups which have not completed yet: the first one is running.
        self.cmd = cmd
        # Sum of the exit codes of all the completed commands.
        self.returncode = 0
        self._next_step()

    def _next_step(self):
        """Start all the commands of the current group."""
        current_group = self.cmd[0]
        for command in current_group:
            command.start()

    def update(self):
        """Move on to the next group if the current one has completed.

        Must not be called once execution_complete() returns True.
        """
        current_group = self.cmd[0]
        if any(command.is_running() for command in current_group):
            return
        # All steps complete for this command:
        for command in current_group:
            self.returncode += command.wait()
        self.cmd = self.cmd[1:]
        if self.cmd:
            self._next_step()
            return
        if self.returncode:
            print(f"{self.filename}:error: contains linting errors: "
                  "run ./scripts/apply_linters.py --autofix")

    def execution_complete(self):
        """Return True once all the groups have completed."""
        return not self.cmd


class Linters:
    """Interface class used to lint files"""

    def __init__(self):
        self._linters = [CppLinters(), PyLinters()]

    def _get_git_files(self, strategy):
        """Return the list of files selected by the given GitStrategy.

        Raises:
            RuntimeError: if the strategy is unknown.
        """
        files = []

        class GetFiles:
            """Output processor appending to ``files`` the file named on
            each line of the git output."""

            def __init__(self, files):
                self.files = files

            def __call__(self, output, returncode):
                # If we keep the last element of each line we will have the files we need to lint.
                # ['M', 'poptorch/source/dispatch_tracer/RegisterAtenOverloads.cpp']
                # ['R092', 'poptorch/source/dispatch_tracer/dispatchers/Tracer.hpp', 'poptorch/source/dispatch_tracer/dispatchers/IDispatch.hpp']
                # ['A', 'poptorch/source/dispatch_tracer/dispatchers/JitDispatch.hpp']
                for line in output.splitlines():
                    self.files.append(line.split()[-1])
                return output, returncode

        assert isinstance(strategy, GitStrategy)
        git_cmd = ""
        # Only keep the files whose status starts with A, M, R or T
        filter_cmd = "| grep \"^[AMRT]\" "
        if strategy in [GitStrategy.Master, GitStrategy.PreCommit]:
            git_cmd = "git diff --name-status -r origin/mk2-main "
        elif strategy == GitStrategy.Head:
            git_cmd = "git diff --name-status -r HEAD^ "
        elif strategy == GitStrategy.Diff:
            git_cmd = "git diff --name-status -r HEAD "
        elif strategy == GitStrategy.All:
            # ls-tree only prints names: there is no status to filter on
            git_cmd = "git ls-tree --name-only -r HEAD "
            filter_cmd = ""
        else:
            raise RuntimeError(f"Unknown strategy requested {strategy}")
        Command(git_cmd,
                filter_cmd,
                print_output=False,
                output_processor=GetFiles(files)).run()
        return files

    def lint_git(self, strategy, autofix, add_trailer_on_success):
        """Lint the files selected by the given GitStrategy (see
        lint_files() for the other arguments and the return value)."""
        return self.lint_files(self._get_git_files(strategy), autofix,
                               add_trailer_on_success)

    def _read_head_trailer(self):
        """Return the value of the ``Lint-Ok`` trailer of the HEAD commit
        (empty string if git didn't print anything)."""
        out = SaveOutput()

        Command("git show -s --pretty='%(trailers:key=Lint-Ok,valueonly)'",
                print_output=False,
                output_processor=out).run()
        lines = out.output.splitlines()
        # Guard against an empty output: there is no first line to read.
        if not lines:
            return ""
        return lines[0].strip()

    def _unstaged_diff(self, files):
        """Return the output of ``git diff`` for the given files."""
        out = SaveOutput()

        Command("git diff " + " ".join(files),
                print_output=False,
                output_processor=out).run()
        return out.output.strip().rstrip()

    def _compute_git_trailer(self, files):
        """Return the expected value of the trailer: the MD5 hex digest of
        the concatenated content of the files (in sorted order)."""
        diff_content = ""
        for f in sorted(files):
            with open(f, "r", encoding="utf-8") as src:
                diff_content += src.read()

        # MD5 is only used as a fingerprint of the content here.
        return str(hashlib.md5(diff_content.encode("utf-8")).hexdigest())

    def check_git_trailer(self, strategy):
        """Check the trailer of HEAD matches the files selected by the
        given GitStrategy. Returns 0 if it does, -1 otherwise."""
        return self._check_trailer(self._get_git_files(strategy),
                                   add_if_missing=False)

    def _check_trailer(self, files, add_if_missing):
        """Compare the trailer of HEAD to the one computed from ``files``.

        Returns 0 if they match. Otherwise returns -1 after either amending
        HEAD to add / update the trailer (``add_if_missing``) or logging an
        error.
        """
        head_trailer = self._read_head_trailer()
        files_trailer = self._compute_git_trailer(files)
        if files_trailer == head_trailer:
            logger.info("Git trailer present and up to date")
            return 0

        if add_if_missing:
            logger.warning(
                "Files are linted but trailer is either missing or out of "
                "date, updating it:")
            git_cmd = (
                "echo \"$(git log -1 --pretty=format:%B | "
                "git interpret-trailers --if-exists='replace' --trailer "
                f"'Lint-Ok: {files_trailer}' --if-exists=replace)\" | "
                "git commit --amend --no-edit -F -")
            Command(git_cmd).run()
            logger.warning(
                "If you were trying to push your local branch to Github, "
                "try again.")
        else:
            logger.error(
                "Files haven't been linted: expected the git trailer to be "
                "'%s' but found '%s'", files_trailer, head_trailer)
        # The commit was either amended or is invalid: in both cases the
        # caller must not consider the current operation successful.
        return -1

    def lint_files(self, files, autofix, add_trailer_on_success):
        """Run all the enabled linters on the given files.

        Args:
            files: list of files to lint.
            autofix: if True, let the linters modify the files.
            add_trailer_on_success: if True and no linter failed, check the
                git trailer of HEAD and add it if needed.

        Returns:
            0 on success. Otherwise the sum of the return codes of the
            linters or, if the trailer had to be checked, the result of
            _check_trailer().
        """
        # If there is no local change and the trailer is up to date: no need to re-run the linters.
        if add_trailer_on_success and self._unstaged_diff(
                files) == "" and self._read_head_trailer(
                ) == self._compute_git_trailer(files):
            logger.info(
                "Git trailer already present and up to date: early return")
            return 0

        jobs = {}
        for f in files:
            cmd = self._gen_lint_commands(f, autofix)
            if cmd:
                jobs[f] = cmd
        if not jobs:
            logger.info("No linter to run: early return")
            return 0
        executors = []
        for filename, cmd in jobs.items():
            print(f"Linting file {filename} [{len(cmd)}] commands to run")
            if autofix:
                executors.append(Executor(filename, cmd))
            else:
                # No risk of conflicting modification in place
                # Merge the steps from all the linters
                all_steps = []
                for c in cmd:
                    all_steps += c
                executors.append(Executor(filename, [all_steps]))
        still_running = True
        while still_running:
            still_running = False
            ProcessManager.get().update()
            for e in executors:
                if e.execution_complete():
                    continue
                e.update()
                still_running = True
            time.sleep(1)
        # Count each executor exactly once, after everything has completed.
        # Accumulating inside the polling loop would add the code of a
        # completed executor again on every pass, inflating the total
        # (which is used as the exit status of the script and is therefore
        # truncated by the OS: a large enough total could wrap to 0).
        returncode = sum(e.returncode for e in executors)
        if add_trailer_on_success:
            diff = self._unstaged_diff(files)
            if diff != "":
                logger.warning(
                    "Your commit needs to be amended to include the following "
                    "changes:\n%s", diff)
            if returncode == 0:
                return self._check_trailer(files, add_if_missing=True)
        return returncode

    def _gen_lint_commands(self, filename, autofix):
        """Return the commands needed to lint ``filename`` as a list of
        lists of commands (a single command is wrapped in a list)."""
        cmd = []
        for linter in self._linters:
            cmd += linter.gen_lint_commands(filename, autofix)
        return [[c] if isinstance(c, Command) else c for c in cmd]


def main():
    """Entry point: parse the command line then either check the git
    trailer of HEAD or lint files.

    The files to lint are the ones passed on the command line or, if there
    are none (or if the pre-commit strategy is used), the ones selected by
    the git strategy.

    Returns the value returned by the Linters method which was called.
    """
    parser = argparse.ArgumentParser()
    # TODO Add option to exclude some linters (e.g -no-clang-tidy)
    # TODO Check / update Copyrights
    parser.add_argument("--debug",
                        "-d",
                        action="store_true",
                        help="Print debug messages")
    parser.add_argument("--autofix",
                        "-a",
                        action="store_true",
                        help="Automatically apply fixes when possible")
    parser.add_argument(
        "--add-trailer-on-success",
        "-t",
        action="store_true",
        help="Add a git trailer to the commit message on success.",
    )
    parser.add_argument(
        "--check-trailer",
        "-c",
        action="store_true",
        help=
        "Check the git trailer in HEAD and raise an error if it's invalid.",
    )
    strategy_names = [
        member.value for member in GitStrategy.__members__.values()
    ]
    parser.add_argument("--git-strategy",
                        "-s",
                        type=str,
                        choices=strategy_names,
                        default=GitStrategy.Master.value,
                        help="Strategy to use when no files are passed")
    parser.add_argument("--jobs",
                        "-j",
                        type=int,
                        default=_utils.get_nprocs(),
                        help="Number of cores to use for linting (0 = auto)")
    parser.add_argument("files", nargs="*", help="one or more files to lint")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO)
    logger.debug("Args: %s", str(args))

    if args.jobs:
        assert args.jobs >= 0
        ProcessManager.create(args.jobs)

    linters = Linters()
    if args.check_trailer:
        assert not args.files, ("You cannot pass a list of files and use "
                                "--check-trailer at the same time")
        return linters.check_git_trailer(GitStrategy(args.git_strategy))

    # Check we've got a Conda environment available
    CondaCommand()
    strategy = GitStrategy(args.git_strategy)
    # PRE_COMMIT is a special case because it will pass on the command line
    # the files which have been modified in the last commit but also set some
    # environment variables to indicate where to find the whole branch.
    # As we want to lint all the files on the branch, we need to ignore the
    # files provided on the command line.
    use_git_strategy = not args.files or strategy == GitStrategy.PreCommit
    if use_git_strategy:
        print(
            f"Linting files selected by the git strategy '{args.git_strategy}'"
        )
        return linters.lint_git(strategy, args.autofix,
                                args.add_trailer_on_success)
    return linters.lint_files(args.files, args.autofix,
                              args.add_trailer_on_success)


# Only run when executed as a script: the value returned by main() is used
# as the exit status of the process.
if __name__ == "__main__":
    sys.exit(main())


================================================
FILE: scripts/check_spelling.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import ast
import glob
import os
import re
import shlex
import shutil
import signal
import sys
import termios
import threading
import tty

from utils import _utils

# Path of the custom dictionary, relative to the sources directory
# (see start_hunspell_process()).
CUSTOM_DIC_PATH = "docs/common/custom_dic"

# Command line used to start hunspell. The "-p <custom dictionary>" option
# is appended to this list the first time start_hunspell_process() is called.
HUNSPELL_CMD = [
    "hunspell",
    "-a",  # Pipe mode
    "-d",
    "en_GB",  # Graphcore uses en_GB for documentation
    "-i",
    "utf-8",  # Encoding: suitable for linux and osx
    "-mode=none"
]  # Use raw text

# Stream the user's key presses are read from (see getChar()).
TERM_STDIN = sys.stdin


def getChar():
    """Read a single character from TERM_STDIN without waiting for a new
    line.

    The terminal is switched to raw mode for the duration of the read and
    its previous attributes are restored afterwards, even if the read
    fails.

    Returns:
        The character read.

    Raises:
        termios.error: if the attributes of the terminal cannot be read or
            set (for example if TERM_STDIN is not a terminal).
    """
    fd = TERM_STDIN.fileno()
    # Backup this or the terminal will break on closing.
    # This is done before entering the try block: if it fails there is
    # nothing to restore (and the original error must not be hidden by an
    # unbound variable in the finally block).
    old_attr = termios.tcgetattr(fd)
    try:
        tty.setraw(fd)
        char = TERM_STDIN.read(1)
    finally:
        # Reset the terminal. The second argument must be one of the
        # TCSA* constants (TCIFLUSH is a queue selector for tcflush()):
        # TCSADRAIN applies the change once the pending output has been
        # transmitted.
        termios.tcsetattr(fd, termios.TCSADRAIN, old_attr)
    return char


class DocStr:
    """Read-only record of a docstring: its text, the file it was found in
    and a line number."""

    def __init__(self, doc_str, source_file, line_num):
        self._doc_str = doc_str
        self._source_file = source_file
        self._line_num = line_num

    # Read-only accessors
    doc_str = property(lambda self: self._doc_str)
    line_num = property(lambda self: self._line_num)
    source_file = property(lambda self: self._source_file)

    def __str__(self):
        # "<line number>:<docstring>"
        return ":".join((str(self._line_num), self._doc_str))


def start_hunspell_process():
    """Start hunspell using the command line in HUNSPELL_CMD.

    On the first call the custom dictionary is created if it doesn't exist
    and the option to use it is appended to HUNSPELL_CMD.

    Returns:
        A dictionary containing the process ('proc') and the list its
        output lines get appended to ('out').
    """
    # Add custom dictionary first time only
    if "-p" not in HUNSPELL_CMD:
        custom_dic_path = os.path.join(_utils.sources_dir(), CUSTOM_DIC_PATH)

        # Create an empty dictionary if there isn't one yet
        if not os.path.exists(custom_dic_path):
            with open(custom_dic_path, 'a'):
                pass

        HUNSPELL_CMD.extend(["-p", shlex.quote(custom_dic_path)])

    hunspell_output = []

    # subprocess.Popen fails to pass the filename correctly without this when
    # shell=True. shlex.quote will handle any spaces correctly.
    cmd = " ".join(HUNSPELL_CMD)

    # Every line printed by the process is appended to hunspell_output
    hunspell_proc = _utils.Process(cmd,
                                   env=None,
                                   redirect_stderr=True,
                                   stdout_handler=hunspell_output.append,
                                   bufsize=0)

    # First line is just a version
    while not hunspell_output:
        assert hunspell_proc.is_running()
    hunspell_output.clear()

    return {'proc': hunspell_proc, 'out': hunspell_output}


# Matches a ".. code-block::" directive and the text following it up to the
# next empty line.
CODE_BLOCK = re.compile(r"\.\. code-block::[^\n]+\n\n.*?\n\n", flags=re.DOTALL)


def strip_code_blocks(s):
    """Return a copy of ``s`` in which the code blocks are blanked out.

    Every character of a code block except the new lines is replaced with
    a space: the length of the text and its number of lines are unchanged.
    """

    def blank_out(match):
        # Preserve lines by replacing everything except new lines with
        # spaces
        return re.sub(r"[^\n]", " ", match.group(0))

    return CODE_BLOCK.sub(blank_out, s)


def should_skip(line):
    """Return True when ``line`` (ignoring surrounding whitespace) starts
    with ``>>>`` or ``...``."""
    return line.strip().startswith((">>>", "..."))


# Patterns removed from a line before it is spell checked. They are applied
# in this order.
ALL_EXCLUSIONS = (
    re.compile(r":param [^:]+:"),
    re.compile(r"p[0-9]+[^0-9]"),
    re.compile(r":py:[^:]+:"),
    re.compile(r"T[0-9]+[^0-9]"),
    re.compile(r"`+[^`]+`+"),
    re.compile(r":r?type.*"),
)


def remove_exclusions(line):
    """Return ``line`` with every ``ALL_EXCLUSIONS`` match and every
    ``.. seealso::`` marker removed."""
    cleaned = line
    for pattern in ALL_EXCLUSIONS:
        cleaned = pattern.sub("", cleaned)
    return cleaned.replace(".. seealso::", "")


def get_doc_str_line_number(element):
    """Return the line on which the docstring of ``element`` starts.

    Args:
        element: an AST node with a ``body`` (function, class or module).

    Returns:
        The first line of the leading string expression of the body, or the
        line of ``element`` itself when the body does not start with a
        string expression.
    """
    body = getattr(element, "body", None)
    if body and isinstance(body[0], ast.Expr):
        value = body[0].value
        text = None
        if isinstance(value, ast.Constant) and isinstance(value.value, str):
            text = value.value
        elif isinstance(getattr(value, "s", None), str):
            # Legacy string node produced by parsers older than Python 3.8
            text = value.s

        if text is not None:
            if getattr(value, "end_lineno", None) is not None:
                # Python >= 3.8: lineno already is the first line of the
                # (possibly multi-line) string.
                return value.lineno
            # Older parsers report the *last* line of a multi-line string:
            # walk back by the number of new lines it contains.
            return value.lineno - text.count("\n")

    # If the string lookup fails
    return getattr(element, "lineno", 1)


# AST node types whose docstring is collected
DOC_STR_ELEMENTS = (ast.AsyncFunctionDef, ast.FunctionDef, ast.ClassDef,
                    ast.Module)


def recursive_add_doc_str(source_file, element, doc_str_list):
    """Append a ``DocStr`` to ``doc_str_list`` for every documented node
    found in the body of ``element``, descending into nested bodies."""
    for child in element.body:
        text = None
        if isinstance(child, DOC_STR_ELEMENTS):
            text = ast.get_docstring(child)

        if text is not None:
            line_number = get_doc_str_line_number(child)
            doc_str_list.append(DocStr(text, source_file, line_number))

        # Anything owning a body may contain further documented nodes
        if hasattr(child, "body"):
            recursive_add_doc_str(source_file, child, doc_str_list)


# ANSI escape sequences used to render the context of a misspelling
BLACK_ON_WHITE = "\033[30;107m"
RESET_COLOR = "\033[39;49m"
UNDERLINE = "\033[4m"
NOT_UNDERLINE = "\033[24m"


def print_context(doc_str, line_offset, unknown_spelling):
    """Print a docstring with one misspelt word underlined.

    Args:
        doc_str: object whose ``doc_str`` attribute is the docstring text.
        line_offset: index (within the docstring) of the line holding the
            word.
        unknown_spelling: the word to underline.

    If the word cannot be located on that line (for instance because the
    checked text was altered before being sent to the spell checker) the
    line is printed without highlighting instead of raising.
    """
    print(BLACK_ON_WHITE, end='')

    all_lines = doc_str.doc_str.split("\n")

    # Make sure we find the right incident of spelling: the word must not be
    # followed by a lower case letter. The word is escaped so that any regex
    # metacharacter it contains is matched literally.
    pattern = re.compile(re.escape(unknown_spelling) + r"[^a-z]")

    for line_num, line in enumerate(all_lines):
        match = None
        if line_num == line_offset:
            # The trailing space lets a word at the very end of the line match
            match = pattern.search(line + " ")

        if match is None:
            print(line, end='')
        else:
            match_start = match.start()
            match_end = match_start + len(unknown_spelling)
            print(line[:match_start], end='')
            print(UNDERLINE + unknown_spelling + NOT_UNDERLINE, end='')
            print(line[match_end:], end='')

        if line_num + 1 != len(all_lines):
            print()

    print(RESET_COLOR + "\n")


def process_incorrect_word(hunspell, result, doc_str, line_offset):
    """Interactively report one word rejected by hunspell.

    Args:
        hunspell: dict returned by ``start_hunspell_process``; only
            ``hunspell['proc']`` is used, to add words to the dictionary.
        result: one space separated output line whose first field is ``&``
            or ``#`` and whose second field is the rejected word.
        doc_str: the docstring being checked (provides ``line_num`` and
            ``source_file``).
        line_offset: index of the offending line within the docstring.

    Raises:
        RuntimeError: if the line does not start with ``&`` or ``#``.

    The user is prompted until they choose to continue (space), add the word
    to the dictionary (a) or quit (q, ^C, ^D).
    """
    result = result.split(" ")

    symbol = result[0]
    if symbol not in ("&", "#"):
        raise RuntimeError("Invalid symbol")

    unknown_spelling = result[1]

    line_num = doc_str.line_num + line_offset

    while True:
        print_context(doc_str, line_offset, unknown_spelling)
        print(f"Unknown spelling, '{unknown_spelling}' on line {line_num}"
              f" ({doc_str.source_file}).")

        # The fields are text (they were just validated against "&" / "#"),
        # so the comparison must be against a str, not bytes.
        if symbol == "&":
            # Comma separated list of suggestions, following the symbol, the
            # word and two more fields.
            suggestions = result[4:]
            print("Suggestions: " + " ".join(suggestions))

        print("(space): continue, (a)dd to dictionary, (q)uit")
        c = getChar()

        if c == ' ':
            break
        if c == 'a':
            # Add to dictionary and save
            hunspell['proc'].write(b"*")
            hunspell['proc'].write(unknown_spelling.encode("utf-8"))
            hunspell['proc'].write(b"\n")
            hunspell['proc'].write(b"#\n")
            break
        # Ctrl+c and ctrl+z are intercepted
        if c in ('q', '\x03', '\x04'):  # ^C and ^D
            sys.exit(0)
        if c == '\x1a':  # ^Z
            signal.pthread_kill(threading.get_ident(), signal.SIGSTOP)

    print("\n\n\n\n")


def process_doc_str(hunspell, doc_str):
    """Spell check one docstring, line by line.

    Code blocks are blanked out first. Each remaining line that should not be
    skipped is stripped of its exclusions and sent to hunspell; its answers
    are then consumed until an empty line is received, and every answer that
    is not ``*``, ``-`` or ``+...`` is handed to ``process_incorrect_word``.
    """
    process = hunspell['proc']
    answers = hunspell['out']

    text = strip_code_blocks(doc_str.doc_str)

    for line_offset, raw_line in enumerate(text.split("\n")):
        if should_skip(raw_line):
            continue

        checked = remove_exclusions(raw_line)

        # The leading "^" escapes any commands
        process.write(b"^" + checked.encode('utf-8') + b"\n")

        while True:
            if not answers:
                # Nothing received yet: keep polling while hunspell is alive
                assert process.is_running()
                continue

            token = answers.pop(0)
            if token == "":
                # Empty line: end of the answers for this line
                break

            accepted = token in ("*", "-") or token[0] == "+"
            if not accepted:
                process_incorrect_word(hunspell, token, doc_str, line_offset)


def check_source_file(source_dir, source_file):
    """Spell check every docstring found in ``source_file``.

    ``source_dir`` is only used to shorten the file name that is displayed
    and recorded.
    """
    relative_name = source_file[len(source_dir) + 1:]
    print(f"Checking {relative_name}\n")

    with open(source_file, 'r') as f:
        contents = f.read()
    tree = ast.parse(contents, source_file)

    collected = []
    recursive_add_doc_str(relative_name, tree, collected)

    hunspell = start_hunspell_process()
    for entry in collected:
        process_doc_str(hunspell, entry)

    # Close hunspell's input and wait for it to finish
    checker = hunspell['proc']
    checker.eof()
    checker.wait()


if __name__ == "__main__":
    # The interactive prompt relies on Linux terminal handling
    if _utils.get_os_type() != _utils.OsType.Linux:
        print("Not running on linux.")
        sys.exit(1)

    # The spell checker binary must be on the PATH
    hunspell_binary = HUNSPELL_CMD[0]
    if shutil.which(hunspell_binary) is None:
        print(f"Please install {hunspell_binary}.")
        sys.exit(1)

    source_dir = os.path.join(_utils.sources_dir(), "python")

    # Check every python file directly inside <sources>/python
    python_files = glob.glob(os.path.join(source_dir, "*.py"))
    for source_file in python_files:
        check_source_file(source_dir, source_file)


================================================
FILE: scripts/create_buildenv.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import argparse
import collections
import contextlib
import fcntl
import hashlib
import inspect
import logging
import re
import os
import platform
import subprocess
import sys
import tarfile
import urllib.request

from utils import _utils

# Module logger, named after this script's file name and registered with the
# shared helpers in _utils.
logger = logging.getLogger(os.path.basename(__file__))
_utils.set_logger(logger)

# Conda packages added to the environment when Conda toolchains are requested
# (see the use_conda_toolchains argument of BuildenvManager).
_conda_toolchains_packages = ["gcc_linux-64=7.3.0", "gxx_linux-64=7.3.0"]


class Version:
    """Dot separated numeric version (for example ``"7.3.0"``).

    Instances are hashable and ordered by comparing their numeric components
    as a tuple. Comparing with an object that is not a ``Version`` is left to
    Python's default handling (``==`` is False, ``<`` raises ``TypeError``).

    Raises:
        ValueError: at construction, if a component is not an integer.
    """

    def __init__(self, version_str):
        self.version = tuple(int(i) for i in version_str.split("."))

    def __lt__(self, other):
        if not isinstance(other, Version):
            return NotImplemented
        return self.version < other.version

    def __eq__(self, other):
        if not isinstance(other, Version):
            return NotImplemented
        return self.version == other.version

    def __hash__(self):
        return hash(self.version)

    def __str__(self):
        return ".".join([str(v) for v in self.version])

    def __repr__(self):
        return str(self)


def _default_cache_dir():
    """Return ``$CONDA_CACHE_DIR`` if set, ``<sources>/.cache`` otherwise."""
    fallback = os.path.join(_utils.sources_dir(), ".cache")
    return os.environ.get("CONDA_CACHE_DIR", fallback)


def _system_conda_path():
    """Return the base directory reported by ``conda info --base``.

    Returns None (after logging a debug message) when the command cannot be
    run or fails for any reason.
    """
    #pylint: disable=broad-except
    try:
        raw_output = subprocess.check_output(["conda", "info", "--base"],
                                             stderr=None)
        return raw_output.decode("utf-8").strip()
    except Exception:
        # Covers a missing conda executable as well as any other failure
        logger.debug('Conda Root Not Found')
        return None
    #pylint: enable=broad-except


class Installer:
    """Common interface for all installers"""

    def install(self, env):
        """Install into ``env``. Must be overridden by child classes."""
        message = f"Must be implemented by child class {type(self)}"
        raise Exception(message)

    def hashString(self):
        """Unique string identifying this version of the installer."""
        message = f"Must be implemented by child class {type(self)}"
        raise Exception(message)


class CondaPackages(Installer):
    """Install the list of Conda packages in the environment."""

    def __init__(self, *packages):
        # Every package specification must be a string
        for package in packages:
            assert isinstance(package, str)
        self.packages = packages


class CondaChannels(Installer):
    """Enable extra Conda channels."""

    def __init__(self, *channels):
        # Every channel name must be a string
        for channel in channels:
            assert isinstance(channel, str)
        self.channels = channels


class PipPackages(Installer):
    """Install the list of pip3 packages in the environment."""

    def __init__(self, *packages):
        # Every package specification must be a string
        for package in packages:
            assert isinstance(package, str)
        self.packages = packages

    def install(self, env):
        """Run a single ``pip3 install`` for all the packages in ``env``."""
        env.run_commands(f"pip3 install {' '.join(self.packages)}")

    def hashString(self):
        """Space separated list of the requested packages."""
        separator = " "
        return separator.join(self.packages)


class PipRequirements(Installer):
    """Install pip3 packages from a requirements file."""

    def __init__(self, filename="requirements.txt"):
        # A relative name is resolved against the current working directory
        # at construction time.
        is_absolute = filename.startswith("/")
        self._requirements_file = (filename if is_absolute else os.path.join(
            os.getcwd(), filename))

    def install(self, env):
        """Run ``pip3 install -r`` on the requirements file in ``env``."""
        command = f"pip3 install -r {self._requirements_file} --retries 30"
        env.run_commands(command)

    def hashString(self):
        """Content of the requirements file."""
        with open(self._requirements_file, "r") as requirements:
            content = requirements.read()
        return content


class Installers:
    """Contains the list of installers to install in the environment."""

    def __init__(self):
        self._installers = list()

    def add(self, installer):
        """Register ``installer``, which must be an ``Installer``."""
        is_valid = isinstance(installer, Installer)
        assert is_valid, "All package installers must inherit from Installer"
        self._installers.append(installer)

    def __call__(self):
        """Return the list of registered installers, in insertion order."""
        return self._installers


class Config:
    """Contains the configuration for the environment."""

    def __init__(self, install_linters, **opts):
        # Every option becomes an attribute of the instance
        self.__dict__.update(opts)
        self.install_linters = install_linters

    def setDefault(self, **opts):
        """Set each option that does not already exist; existing values are
        left untouched."""
        attributes = self.__dict__
        for name, value in opts.items():
            attributes.setdefault(name, value)


class Environment:
    """Handle on a build environment: its directory and the script to source
    before running commands in it."""

    def __init__(self, buildenv_dir, activate_filename):
        self._activate_filename = activate_filename
        self._buildenv_dir = buildenv_dir

    @property
    def prefix(self):
        """Directory of the build environment."""
        return self._buildenv_dir

    def run_commands(self,
                     *cmds,
                     env=None,
                     stop_on_error=True,
                     stdout_handler=None,
                     stderr_handler=None):
        """Run ``cmds`` through ``_utils.run_commands`` after sourcing the
        activation script; all keyword arguments are forwarded unchanged."""
        activate = f". {self._activate_filename}"
        _utils.run_commands(activate,
                            *cmds,
                            env=env,
                            stop_on_error=stop_on_error,
                            stdout_handler=stdout_handler,
                            stderr_handler=stderr_handler)

    def rmdir_if_exists(self, path):
        """Forward to ``_utils.rmdir_if_exists``."""
        _utils.rmdir_if_exists(path)


class BuildenvManager:
    def __init__(self,
                 cache_dir=None,
                 output_dir=None,
                 python_version=None,
                 use_conda_toolchains=False,
                 install_linters=False,
                 empty_env=False,
                 **config):
        """Prepare the description of the build environment to create.

        Args:
            cache_dir: cache directory (``_default_cache_dir()`` if unset).
            output_dir: where the environment is created (current directory
                if unset). Must not be the sources directory.
            python_version: python version to install; defaults to the
                version of the running interpreter (3.6 is bumped to 3.7).
            use_conda_toolchains: also install the Conda toolchain packages.
            install_linters: stored in the configuration.
            empty_env: when True, conda-pack and gdb are not installed.
            **config: extra options stored in the configuration.
        """
        if python_version is None:
            python_version = platform.python_version()
            if python_version.startswith("3.6"):
                python_version = "3.7"
                logger.warning(
                    "Python 3.6 is no longer supported, defaulting "
                    "to %s, if you really want to "
                    "use python 3.6 then use --python-version 3.6",
                    python_version)

        self.output_dir = os.path.realpath(output_dir or os.getcwd())
        self.cache_dir = cache_dir or _default_cache_dir()
        self.buildenv_dir = os.path.join(self.output_dir, "buildenv")
        self.conda_channels = []

        is_aarch64 = _utils.get_arch_type() == "aarch64"
        is_python_36 = python_version.startswith("3.6")

        packages = [f"python={python_version}"]
        if not empty_env:
            packages.append("conda-pack=0.5.0")

        # Support for python 3.6 was removed from pip in version 22.0
        # https://pip.pypa.io/en/stable/news/#v22-0
        if is_python_36:
            packages.append("pip=21.1.3")

        if not (is_aarch64 or empty_env):
            # There is not one version of gdb which works
            # for both python 3.6.8 (CentOS 7) and python 3.9
            gdb = "gdb=8.3" if python_version.startswith(
                "3.6.8") else "gdb=10.2"
            packages.append(gdb)

        if use_conda_toolchains:
            packages.extend(_conda_toolchains_packages)

        self.conda_packages = packages
        self.projects = {}

        self.config = Config(install_linters=install_linters,
                             is_aarch64=is_aarch64,
                             **config)
        assert self.output_dir != _utils.sources_dir(), (
            "This script needs "
            "to be called from a build directory. Try mkdir build && cd build"
            " && ../scripts/create_buildenv.py")

        # internal constants
        self.activate_filename = os.path.join(self.output_dir,
                                              "activate_buildenv.sh")
        self.env = Environment(self.buildenv_dir, self.activate_filename)
        self.lock_already_acquired = False

    def add_project(self, project, project_dir):
        assert os.path.exists(project_dir)
        self.projects[project] = os.path.realpath(project_dir)

    def _collect_installers(self):
        """Execute the buildenv configuration of every registered project.

        The configuration files are executed with ``installers`` and
        ``config`` in scope. ``CondaPackages`` and ``CondaChannels``
        installers are merged into ``self.conda_packages`` /
        ``self.conda_channels``; the Conda packages are then deduplicated,
        sorted by name and, when several versions of a package were
        requested, reduced to the highest one.

        Returns:
            The list of installers which are neither ``CondaPackages`` nor
            ``CondaChannels``.
        """
        view_dir = os.path.dirname(_utils.sources_dir())
        installers = Installers()
        # We share with the config files all the classes inheriting from Installer
        exec_locals = {
            name: c
            for name, c in inspect.getmembers(sys.modules[__name__],
                                              inspect.isclass)
            if Installer in c.__bases__ or c == Installer
        }
        exec_locals["installers"] = installers
        exec_locals["config"] = self.config
        for p, project_dir in self.projects.items():
            # Try to find (in that order):
            # 1) <view_dir>/my_project.buildenv.py
            # 2) <project_dir>/config.buildenv.py
            # 3) <project_dir>/my_project.buildenv.py
            to_test = [
                os.path.join(view_dir, p + ".buildenv.py"),
                os.path.join(project_dir, "config.buildenv.py"),
                os.path.join(project_dir, p + ".buildenv.py"),
            ]
            conf = None
            for f in to_test:
                if os.path.exists(f):
                    conf = f
                    break
            if conf is None:
                logger.warning(
                    "No requirements found for project '%s' (Tried %s)", p,
                    to_test)
                continue

            with open(conf, "r") as f:
                code = f.read()
                # The configuration is executed from the project directory
                os.chdir(project_dir)
                # Share the os module as it's commonly used to get the current
                # working directory, create directories, etc.
                # pylint: disable=exec-used
                exec(code, {"os": os, "_utils": _utils}, exec_locals)

        # Process the installers:
        other_installers = []
        for i in installers():
            if isinstance(i, CondaPackages):
                self.conda_packages += i.packages
            elif isinstance(i, CondaChannels):
                self.conda_channels += i.channels
            else:
                other_installers.append(i)

        packages = collections.defaultdict(list)
        # Resolve version conflicts
        # Create a dictionary package name -> [ versions ]
        for package in self.conda_packages:
            # Accept both "name=version" and "name==version"
            s = package.replace("==", "=").split("=")
            name = s[0]
            version = [Version(s[1])] if len(s) > 1 else []
            packages[name] += version

        self.conda_packages = []
        # Make sure the packages are unique and in a deterministic order
        for name in sorted(packages.keys()):
            versions = packages[name]
            if not versions:
                # No version was requested anywhere for this package
                self.conda_packages.append(name)
                logger.warning("Version for package %s is not set", name)
                continue
            # Sort the versions by descending order and remove duplicates
            versions = list(set(versions))
            versions.sort(reverse=True)
            if len(versions) > 1:
                logger.warning(
                    "Conflict: more than one version requested for "
                    "package %s: %s, selecting %s", name, versions,
                    versions[0])

            self.conda_packages.append(f"{name}={str(versions[0])}")
        return other_installers

    def create(self, create_template_if_needed=False):
        """Create the build environment in ``self.output_dir``.

        If an archive matching the hash of the requested environment exists
        in the cache it is unpacked, otherwise a new environment is created
        from scratch.

        Args:
            create_template_if_needed: when a new environment had to be
                created, also pack it into the cache so it can be reused.
        """
        os.makedirs(self.output_dir, exist_ok=True)
        os.chdir(self.output_dir)

        self._clear_activate_buildenv()
        self._install_conda_if_needed()

        installers = self._collect_installers()
        env_hash = self._compute_environment_hash(installers)
        template_name = f"poptorch_{env_hash}.tar.gz"
        full_template_name = os.path.join(self.cache_dir, template_name)

        with self.cache_lock():
            if os.path.isfile(full_template_name):
                logger.info("Found template %s: Unpacking to %s",
                            full_template_name, self.buildenv_dir)
                os.makedirs(self.buildenv_dir)
                os.chdir(self.output_dir)
                # Use a context manager so the archive is always closed
                with tarfile.open(full_template_name) as tar:
                    tar.extractall(self.buildenv_dir)
                assert os.path.isdir(self.buildenv_dir)
                self.env.run_commands(f". {self.buildenv_dir}/bin/activate",
                                      "conda-unpack")
                self._append_to_activate_buildenv(
                    f"conda activate {self.buildenv_dir}", )
            else:
                logger.info(
                    "Didn't find template %s: creating a new "
                    "environment in %s", full_template_name, self.output_dir)
                self._create_new_env(installers)
                if create_template_if_needed:
                    os.chdir(self.output_dir)
                    self.env.run_commands(
                        f"conda activate {self.buildenv_dir}",
                        f"conda pack -p {self.buildenv_dir} -o \
                                {full_template_name}")

        if self.config.install_linters:
            self.env.run_commands(f"cd {_utils.sources_dir()}",
                                  "pre-commit install --hook-type pre-push")

        os.chdir(self.output_dir)
        # If ccache is available
        try:

            def ignore(_):
                pass

            # A failing command surfaces as an AssertionError, which is how
            # a missing ccache is detected here.
            self.env.run_commands("ccache -V",
                                  stdout_handler=ignore,
                                  stderr_handler=ignore)
            # CC / CXX -> Enable ccache for the current C / C++ compilers.
            self.env.run_commands(
                """echo "export CC=\\"ccache ${CC:-gcc}\\"" >> %s""" %
                self.activate_filename,
                """echo "export CXX=\\"ccache ${CXX:-g++}\\"" >> %s""" %
                self.activate_filename)
        except AssertionError:
            pass

    def _create_new_env(self, installers, is_retry=False):
        """
        Create a brand new Conda environment and run ``installers`` in it.

        Sometimes the Conda install in the NFS cache gets corrupted:

            CondaVerificationError: The package for setuptools located at
            /nfs/conda//miniconda/pkgs/setuptools-58.0.4-py38h578d9bd_2
            appears to be corrupted.

        When this happens: delete the conda install and start again with
        "is_retry=True" to avoid getting stuck in an infinite loop.
        """

        os.chdir(self.output_dir)
        corrupted = False

        def check_corruption(line):
            # Remember whether Conda reported corrupted packages, and forward
            # the line to the logger.
            nonlocal corrupted
            corrupted = corrupted or "CondaVerificationError" in line
            logger.error(line)

        try:
            _utils.rmdir_if_exists(self.buildenv_dir)

            channel_flags = "".join(f" -c {c}" for c in self.conda_channels)
            package_list = ' '.join(self.conda_packages)

            # Corruption is only looked for on the first attempt
            self.env.run_commands(
                f"conda create --prefix {self.buildenv_dir}{channel_flags} "
                f"-y {package_list}",
                stderr_handler=None if is_retry else check_corruption)
        except AssertionError:
            if not corrupted:
                raise
            # We failed because of some corrupted packages: clear
            # the environment, reinstall Conda and try again.
            self._clear_activate_buildenv()
            self._install_conda_if_needed(force_reinstall=True)
            self._create_new_env(installers, is_retry=True)
            return

        self._append_to_activate_buildenv(
            f"conda activate {self.buildenv_dir}", )

        for installer in installers:
            # Every installer starts from the output directory
            os.chdir(self.output_dir)
            installer.install(self.env)

    def _clear_activate_buildenv(self):
        # Clear the content of activate_buildenv.sh

        # PYTHONNOUSERSITE -> Make Conda ignore packages installed in ~/.local
        # CCACHE_CPP2 -> Switch ccache to C++ mode (Avoid issues with C pre-processor)
        with open(self.activate_filename, "w") as f:
            f.write("# Save the existing environment\n")
            f.write("_print_var_names (){\n")
            # grep: only keep lines containing 'declare -x' (Removes
            # multi-lines content as we only care about variable names).
            # cut -f1: remove the right hand side of the assignment.
            # cut -f3: remove 'declare -x'
            # tr: replace new lines with spaces.
            f.write("   export -p | grep \"declare -x\" | "
                    "cut -d '=' -f1 | cut -d ' ' -f3 | "
                    "tr '\n' ' '\n")
            f.write("}\n")
            f.write("_saved_names=$(_print_var_names)\n")
            f.write("_saved_vars=\"$(export -p)\"\n")
            f.write("_saved_ps1=\"$PS1\"\n\n")
            f.write(
                "# Use 'deactivate_buildenv' to restore your former environment\n"
            )
            # Note: using 'eval' inside a function doesn't affect the parent
            # environment, which is why we need to use an alias instead.
            f.write(
                "alias deactivate_buildenv='_deactivate;eval \"$_saved_vars\";"
                "unset _deactivate _print_var_names _saved_names "
                "_saved_vars _saved_ps1'\n")
            f.write("_deactivate() {\n")
            f.write(
                "  # Unset the variables that were added by the buildenv\n")
            f.write("  _current_vars=$(_print_var_names)\n")
            f.write("  for v in $_current_vars; do\n")
            f.write("    if [[ ! \" ${_saved_names[*]} \" =~ \" ${v} \" ]];"
                    " then\n")
            f.write("      unset \"${v}\"\n")
            f.write("    fi\n")
            f.write("  done\n")
            f.write("  # Restore the shell prompt\n")
            f.write("  PS1=\"$_saved_ps1\"\n\n")
            f.write("}\n\n")

            f.write("export PYTHONNOUSERSITE=1\n")
            f.write("export CCACHE_CPP2=yes\n")

    def _append_to_activate_buildenv(self, *lines):
        with open(self.activate_filename, "a") as f:
            for line in lines:
                f.write(f"{line}\n")

    @contextlib.contextmanager
    def cache_lock(self):
        # Handle nested cache_lock scopes: if we already own the lock then
        # don't try to lock it again.
        if self.lock_already_acquired:
            yield
            return

        lock = os.path.join(self.cache_dir, "conda.lock")
        with open(lock, "w") as f:
            try:
                fcntl.flock(f, fcntl.LOCK_EX)
                self.lock_already_acquired = True
                yield
            finally:
                self.lock_already_acquired = False
                fcntl.flock(f, fcntl.LOCK_UN)

    def _install_conda_if_needed(self, force_reinstall=False):
        """Make Conda available and source it from the activation script.

        The system Conda is used when ``conda info --base`` succeeds.
        Otherwise the Mambaforge install from the cache directory is used,
        after being downloaded and installed there if it is missing.

        Args:
            force_reinstall: reinstall the cached Mambaforge even if it is
                already present (ignored when a system Conda is found).

        Raises:
            RuntimeError: if the installer must be downloaded and the OS is
                neither Linux nor OS X.
        """
        os.makedirs(self.cache_dir, exist_ok=True)
        system_conda = _system_conda_path()
        if system_conda is not None:
            logger.info("Using system conda")
            conda_sh = os.path.join(system_conda, "etc", "profile.d",
                                    "conda.sh")
            self._append_to_activate_buildenv(f". {conda_sh}")
            return

        conda_install_dir = os.path.join(self.cache_dir, "mambaforge")
        conda_sh = os.path.join(conda_install_dir, "etc", "profile.d",
                                "conda.sh")
        installer = os.path.join(self.cache_dir, "Mambaforge_installer.sh")
        with self.cache_lock():
            if os.path.isfile(conda_sh) and not force_reinstall:
                logger.info(
                    "System conda not found, using the instance from the cache "
                    "(%s) instead", self.cache_dir)
            else:
                logger.info(
                    "System conda not found, installing it locally in (%s)",
                    self.cache_dir)
                if not os.path.isfile(installer):
                    logger.info("Installer not found: downloading...")
                    os_type = _utils.get_os_type()
                    if os_type == _utils.OsType.Linux:
                        conda_os = "Linux"
                    elif os_type == _utils.OsType.Osx:
                        conda_os = "MacOSX"
                    else:
                        raise RuntimeError(
                            "Unknown OS. Please download the "
                            "installer for your platform from "
                            "https://github.com/conda-forge/miniforge#mambaforge"
                            f" and save it as {installer}")
                    arch_type = _utils.get_arch_type()
                    # Pin a Mambaforge release based on Conda 4.14.0 (newer
                    # Conda versions are affected by
                    # https://github.com/conda/conda/issues/12250: issue with
                    # paths > 128 characters).
                    # Once that is fixed the "latest" release can be used.
                    url = ("https://github.com/conda-forge/miniforge/"
                           "releases/download/4.14.0-0/Mambaforge-"
                           f"4.14.0-0-{conda_os}-{arch_type}.sh")
                    # Download next to the final location and rename once
                    # complete, so an interrupted download is never mistaken
                    # for a valid installer by a later run.
                    partial_download = installer + ".part"
                    urllib.request.urlretrieve(url, partial_download)
                    os.replace(partial_download, installer)
                _utils.rmdir_if_exists(conda_install_dir)
                _utils.run_commands(
                    f"bash {installer} -b -p {conda_install_dir}")
        assert os.path.isfile(conda_sh)
        self._append_to_activate_buildenv(f". {conda_sh}")

    def _compute_environment_hash(self, installers):
        hashes = [i.hashString() for i in installers]
        return str(
            hashlib.md5(
                " ".join(self.conda_packages + hashes +
                         self.conda_channels).encode("utf-8")).hexdigest())


if __name__ == "__main__":
    # Command line entry point: parse the options, register the project to
    # build the environment for, then create the environment.
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug",
                        "-d",
                        action="store_true",
                        help="Print debug messages")
    parser.add_argument(
        "--conda-toolchains",
        "-t",
        action="store_true",
        help="Use Conda toolchains instead of the system ones.")
    parser.add_argument(
        "--empty-env",
        "-e",
        action="store_true",
        help=("Create an empty Conda environment using version of python "
              "specified by --python-version"))
    parser.add_argument("--popart-deps",
                        action="store_true",
                        help="Install dependencies to build PopART.")
    parser.add_argument("--no-linters",
                        action="store_true",
                        help="Don't install the linters.")
    parser.add_argument(
        "--python-version",
        "-p",
        help="Override the default python version used in the build "
        "environment. By default the build environment will use the same "
        "python version as the host os")
    parser.add_argument(
        "--cache-dir",
        help=f"Cache directory (By default {_default_cache_dir()})")
    parser.add_argument(
        "--output-dir",
        help=
        "Where to create the build environment (Current directory by default)")
    parser.add_argument(
        "--create-template-if-needed",
        action="store_true",
        help="Create a template archive in the cache directory "
        "if one doesn't already exist")
    parser.add_argument(
        "--path", help="Path to the project sources or a project.buildenv.py")

    args = parser.parse_args()

    logging_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=logging_level)
    logger.debug("Args: %s", str(args))

    manager = BuildenvManager(args.cache_dir, args.output_dir,
                              args.python_version, args.conda_toolchains,
                              not args.no_linters, args.empty_env)
    if args.path:
        path_dir = os.path.realpath(args.path)
        project = None
        # If a file was provided: use the containing directory
        if os.path.isfile(path_dir):
            filename = os.path.basename(path_dir)
            # "<project>.buildenv.py" names the project, except for the
            # generic "config.buildenv.py". The dots are escaped and the
            # pattern anchored so only that exact suffix is recognised.
            m = re.match(r"(.*)\.buildenv\.py$", filename)
            if m and m.group(1) != "config":
                project = m.group(1)
            path_dir = os.path.dirname(path_dir)
        if project is None:
            # Fall back to the name of the directory
            project = path_dir.split(os.path.sep)[-1]
        manager.add_project(project, path_dir)
    elif not args.empty_env:
        manager.add_project("poptorch", _utils.sources_dir())
    manager.create(args.create_template_if_needed)


================================================
FILE: scripts/docs_build.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import argparse
import logging
import os
import shutil
import subprocess
import sys
import zipfile

import sphinx.cmd.build
from utils import _utils

# Module logger, named after this script's file name and registered with the
# shared helpers in _utils.
logger = logging.getLogger(os.path.basename(__file__))
_utils.set_logger(logger)


class DocumentationBuilder:
    """Builds and packages one Sphinx user guide as HTML and / or PDF.

    Parameters:
        pkg_info: object providing ``pdf_filename()``, ``html_filename()``,
            ``doc_name``, ``poptorch_geometric_doc_name`` and
            ``version_long``.
        install_dir: folder receiving the generated zip archive / PDF
            (the current directory when not provided).
        poptorch_geometric: when True the guide is read from
            ``docs/poptorch_geometric`` and generated under the same relative
            folder, otherwise ``docs`` is used for both.
    """

    def __init__(self, pkg_info, install_dir=None, poptorch_geometric=False):
        self.pkg_info = pkg_info
        self.pdf_filename = pkg_info.pdf_filename(poptorch_geometric)
        self.html_filename = pkg_info.html_filename(poptorch_geometric)
        self.doc_name = pkg_info.poptorch_geometric_doc_name if \
            poptorch_geometric else pkg_info.doc_name

        # The sources (relative to the repository) and the outputs (relative
        # to the current directory) use the same sub-folder.
        docs_subdir = os.path.join(
            "docs", "poptorch_geometric") if poptorch_geometric else "docs"

        self.output_dir = docs_subdir
        self.output_pdf_dir = os.path.join(self.output_dir, "pdf")
        self.output_html_dir = os.path.join(self.output_dir, "html")
        self.output_guide_dir = os.path.join(self.output_html_dir,
                                             self.doc_name)
        self.docs_src_dir = os.path.join(_utils.sources_dir(), docs_subdir,
                                         "user_guide")
        self.sphinx_conf_dir = os.path.join(_utils.sources_dir(), docs_subdir,
                                            "common")
        # The document title is the first line of the guide's index.rst
        self.title = _utils.get_first_line(
            os.path.join(self.docs_src_dir, "index.rst"))
        self.install_dir = install_dir or "."
        logger.debug("Document title is %s", self.title)

        # -a  write all files (default: only write new and changed files)
        # -E don't use a saved environment, always read all files
        # -n nit-picky mode, warn about all missing references
        # -W turn warnings into errors
        # -j auto: automatically select the appropriate number of threads
        self.common_sphinx_flags = "-a -E -n -W --keep-going -j auto".split(
        ) + ["-c", self.sphinx_conf_dir]

    def assert_poptorch_in_path(self):
        """Raise ImportError with a hint if poptorch cannot be imported."""
        try:
            import poptorch  # pylint: disable=unused-import, import-outside-toplevel
        except ImportError as e:
            # Chain the original error so its traceback is not lost.
            raise ImportError(
                f"{e}. poptorch must be in your PYTHONPATH to generate the "
                "documentation: did you enable your build environment?"
            ) from e

    def cleanup(self):
        """Remove the PDF and guide output folders, recreate the guide one."""
        _utils.rmdir_if_exists(self.output_pdf_dir)
        _utils.rmdir_if_exists(self.output_guide_dir)
        os.makedirs(self.output_guide_dir)

    @staticmethod
    def _run_sphinx(args):
        """Run sphinx-build with ``args``; raise AssertionError on failure.

        An explicit check is used instead of an ``assert`` statement: asserts
        are compiled out under ``python -O``, which would skip the build
        entirely. The exception type is kept for backward compatibility.
        """
        if sphinx.cmd.build.build_main(args):
            raise AssertionError(
                f"The command sphinx-build {' '.join(args)} failed "
                "(See above for details)")

    def build_html(self):
        """Generate the HTML guide in ``self.output_guide_dir``."""
        self.assert_poptorch_in_path()
        args = self.common_sphinx_flags + [
            "-b", "html", "-D", f"project={self.title}", "-D",
            f"html_title={self.title}", "-D",
            f"version=v{self.pkg_info.version_long}", self.docs_src_dir,
            self.output_guide_dir
        ]
        self._run_sphinx(args)

    def package_html(self):
        """Zip the generated HTML guide into ``install_dir/html_filename``.

        Folders whose path ends with ``.doctrees`` or ``_sources`` and the
        ``objects.inv`` / ``.buildinfo`` files are left out of the archive.
        Archive member names are relative to ``self.output_html_dir``.
        """
        excluded_dirs = (".doctrees", "_sources")
        excluded_files = ("objects.inv", ".buildinfo")
        archive_path = os.path.join(self.install_dir, self.html_filename)
        # The context manager closes the archive even if a write fails.
        with zipfile.ZipFile(archive_path, "w",
                             zipfile.ZIP_DEFLATED) as archive:
            for root, _, files in os.walk(self.output_guide_dir):
                if root.endswith(excluded_dirs):
                    continue

                # Remove the docs/html/ prefix from the archived names
                new_root = os.path.relpath(root, self.output_html_dir)

                for file in files:
                    if file in excluded_files:
                        continue
                    archive.write(os.path.join(root, file),
                                  arcname=os.path.join(new_root, file))
        logger.info("%s was successfully generated", self.html_filename)

    def build_pdf(self):
        """Generate the PDF guide and copy it to ``install_dir``.

        Sphinx produces the LaTeX sources in ``self.output_pdf_dir``, ``make``
        is then run in that folder and the resulting ``doc.pdf`` is copied to
        ``install_dir/pdf_filename``.
        """
        self.assert_poptorch_in_path()
        args = self.common_sphinx_flags + [
            "-b", "latex", "-D", f"project={self.doc_name}", "-D",
            f"release=v{self.pkg_info.version_long}", "-D",
            f"version=v{self.pkg_info.version_long}", self.docs_src_dir,
            self.output_pdf_dir
        ]
        # Exported before running Sphinx; presumably read by the Sphinx
        # configuration (TODO confirm).
        os.environ["DOC_TITLE"] = self.title
        self._run_sphinx(args)
        subprocess.check_output(["make", "LATEXMKOPTS=\"-silent\""],
                                cwd=self.output_pdf_dir)
        shutil.copyfile(os.path.join(self.output_pdf_dir, "doc.pdf"),
                        os.path.join(self.install_dir, self.pdf_filename))
        logger.info("%s was successfully generated", self.pdf_filename)


if __name__ == "__main__":
    # Command line interface: both documentation formats are generated unless
    # explicitly disabled.
    parser = argparse.ArgumentParser()
    for flag, description in (
        ("--no-pdf", "Do not generate the PDF documentation"),
        ("--no-html", "Do not generate the HTML documentation"),
    ):
        parser.add_argument(flag, action="store_true", help=description)
    parser.add_argument("--debug",
                        "-d",
                        action="store_true",
                        help="Print debug messages")
    parser.add_argument("--add-to-sys-path", help="Path to add to sys.path")
    parser.add_argument("--install-dir",
                        help="Copy generated files to that folder")

    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    logger.debug("Args: %s", str(args))

    # --add-to-sys-path is a ';' separated list, each entry is put at the
    # front of sys.path.
    extra_paths = args.add_to_sys_path.split(
        ";") if args.add_to_sys_path else []
    for path in extra_paths:
        logger.debug("Adding %s", path)
        sys.path.insert(0, path)

    # One builder per guide: PopTorch first, then PopTorch Geometric.
    builders = [
        DocumentationBuilder(_utils.PkgInfo.load_from_file(must_exist=False),
                             install_dir=args.install_dir,
                             poptorch_geometric=is_geometric)
        for is_geometric in (False, True)
    ]

    if not args.no_pdf:
        for builder in builders:
            builder.build_pdf()

    if not args.no_html:
        for builder in builders:
            builder.build_html()
            builder.package_html()


================================================
FILE: scripts/download_external_datasets.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import argparse
import os.path as osp
import torch_geometric as pyg

# Command line: a single positional argument naming the destination folder.
parser = argparse.ArgumentParser(description="Download external datasets")
parser.add_argument(
    "external_datasets_dir",
    help="The directory where the external datasets will be downloaded.")
args = parser.parse_args()

# Each dataset gets its own sub-folder of the destination folder. The script
# relies on the dataset constructors to populate their root folder.
datasets_root = args.external_datasets_dir
qm9_root = osp.join(datasets_root, "qm9")
planetoid_root = osp.join(datasets_root, "planetoid")

pyg.datasets.QM9(root=qm9_root)
pyg.datasets.Planetoid(planetoid_root, "Cora")


================================================
FILE: scripts/enable.sh.in
================================================
#!/bin/bash
# Put the install prefix (filled in when this template is configured) in front
# of the existing PYTHONPATH.
export PYTHONPATH=@CMAKE_INSTALL_PREFIX@:$PYTHONPATH
# The two placeholders below are replaced when this template is configured;
# presumably by the commands enabling Poplar and PopART (TODO confirm).
@ENABLE_POPLAR_CMD@
@ENABLE_POPART_CMD@


================================================
FILE: scripts/generate_poppyg_package.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import argparse
import datetime
import os
import tempfile
import subprocess
import shutil
import distutils.util
import distutils.dir_util
import utils._utils as utils
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag

# Build targets understood by this script; also the accepted values of the
# positional "target" argument below.
targets = ['bdist_wheel', 'sdist', 'install']

parser = argparse.ArgumentParser()
# NOTE(review): --python-dir has no default but is passed to
# os.path.realpath() below, which raises TypeError for None - confirm that
# every caller provides it.
parser.add_argument('--python-dir',
                    help='Path to the folder containing the python files')
parser.add_argument('target',
                    choices=targets,
                    help=f'Which target to build: {targets}')
parser.add_argument('--output-dir',
                    default='dist',
                    help='Where to create the packages')
# Parsed at import time: "args" is read by the functions defined below.
args = parser.parse_args()

PROJ_NAME = 'poptorch_geometric'
# Folder of the poptorch_geometric sources (requirements.txt, setup.py, ...)
src_dir = os.path.join(utils.sources_dir(), PROJ_NAME)
# Resolved here because the packaging step changes the working directory.
output_dir = os.path.realpath(args.output_dir)
python_dir = os.path.realpath(args.python_dir)

# Version string substituted for the VERSION placeholder and used to pin the
# poptorch dependency.
VERSION = utils.PkgInfo.load_from_file(must_exist=False,
                                       path='..').version_long

# https://www.python.org/dev/peps/pep-0425/
# The platform tag is simply distutils.util.get_platform() with all hyphens - and periods . replaced with underscore _.
PLATFORM = distutils.util.get_platform().replace('.', '_').replace('-', '_')


def find_requirement(package):
    """Return the first requirements.txt line mentioning ``package``.

    The file is read from ``src_dir``. A line matches as soon as it contains
    ``package`` as a substring; the matching line is returned stripped of
    surrounding whitespace. Returns None when no line matches.
    """
    requirements_path = os.path.join(src_dir, 'requirements.txt')
    with open(requirements_path, 'r') as requirements:
        matching = (entry.strip() for entry in requirements
                    if package in entry)
        return next(matching, None)


def get_pyg_hosted_dependency(pkg_name):
    """Return the dependency specifier to use for ``pkg_name``.

    The requirement is looked up in requirements.txt (AssertionError if it is
    missing). For the "bdist_wheel" target a direct reference to the wheel
    hosted on data.pyg.org is returned; for every other target the
    requirement line is returned unchanged.
    """
    requirement = find_requirement(pkg_name)
    assert requirement is not None, f'{pkg_name} not found.'

    # For sdist packages we don't know ahead of time what the python version
    # will be, and there is no support for --find-links so we just have to
    # use the regular wheel instead.
    if args.target != "bdist_wheel":
        return requirement

    # The version is whatever follows the last '=' of the requirement line.
    version = requirement.split('=')[-1]
    wheel_stem = pkg_name.replace('-', '_')
    python_tag = f'{get_abbr_impl()}{get_impl_ver()}'
    wheel_file = (
        f'{wheel_stem}-{version}-{python_tag}-{get_abi_tag()}-{PLATFORM}.whl')

    return f'{pkg_name} @ https://data.pyg.org/whl/torch-2.0.0%2Bcpu/{wheel_file}'


# PyG requirement line: 'torch-geometric' is looked up first, 'pyg-nightly' is
# only used when the former is absent from requirements.txt.
PYG_DEPENDENCY = find_requirement('torch-geometric') or find_requirement(
    'pyg-nightly')

# Raised when neither of the two names above was found.
if PYG_DEPENDENCY is None:
    raise RuntimeError('"torch-geometric" not found in requirements.txt')

SCATTER_DEPENDENCY = get_pyg_hosted_dependency('torch-scatter')
SPARSE_DEPENDENCY = get_pyg_hosted_dependency('torch-sparse')

# The package is pinned to the exact poptorch version it is built with.
POPTORCH_DEPENDENCY = f'poptorch=={VERSION}'


def configure(src_filename, dst_filename):
    """Copy ``src_filename`` to ``dst_filename`` expanding the placeholders.

    Every ``@NAME@`` placeholder listed below is replaced, line by line, by
    the corresponding module-level value. Other text is copied unchanged.

    Both files are managed by a ``with`` statement so they are closed
    deterministically, and the source is opened first so that a missing
    source does not leave an empty destination file behind.
    """
    # Applied in this order on every line.
    substitutions = (
        ('@VERSION@', VERSION),
        ('@PYG_DEPENDENCY@', PYG_DEPENDENCY),
        ('@POPTORCH_DEPENDENCY@', POPTORCH_DEPENDENCY),
        ('@PLATFORM@', PLATFORM),
        ('@TORCH_SCATTER_DEPENDENCY@', SCATTER_DEPENDENCY),
        ('@TORCH_SPARSE_DEPENDENCY@', SPARSE_DEPENDENCY),
    )
    with open(src_filename) as src, open(dst_filename, 'w') as dst:
        for line in src:
            for placeholder, value in substitutions:
                line = line.replace(placeholder, value)
            dst.write(line)


# Create a temporary directory and copy the files to package to it.
with tempfile.TemporaryDirectory() as tmp_dir:
    os.chdir(tmp_dir)
    shutil.copytree(python_dir, PROJ_NAME)
    shutil.copy(os.path.join(src_dir, 'MANIFEST.in'), '.')
    shutil.copy(os.path.join(src_dir, 'License.txt'), '.')
    shutil.copy(os.path.join(src_dir, 'setup.cfg'), '.')

    # setup.py is a template: expand its placeholders while copying it.
    configure(os.path.join(src_dir, 'setup.py'), 'setup.py')

    env = {**os.environ}
    start = datetime.datetime.now()

    # The command is built as an argument list (rather than by splitting a
    # formatted string) so that an output directory containing whitespace is
    # still passed to setup.py as a single argument.
    if args.target == 'install':
        command = ['python3', 'setup.py', 'build_ext', '-b', output_dir]
    else:
        command = ['python3', 'setup.py', args.target, '-d', output_dir]
        if args.target == 'sdist':
            command.append('--formats=zip')
    subprocess.check_call(command, env=env)

    print(f'Time to generate {args.target} in {output_dir} : '
          f'{datetime.datetime.now()-start}')


================================================
FILE: scripts/generate_python_package.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import argparse
import datetime
import os
import tempfile
import subprocess
import shutil
import distutils.util
import distutils.dir_util
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
import utils._utils as utils

# Build targets understood by this script; also the accepted values of the
# positional "target" argument below.
targets = ["bdist_wheel", "sdist", "install"]

parser = argparse.ArgumentParser()
# NOTE(review): --python-dir defaults to "include", the same value as
# --include-dir - confirm this is intended.
parser.add_argument("--python-dir",
                    default="include",
                    help="Path to the folder containing the python files")
parser.add_argument(
    "--include-dir",
    default="include",
    help="Path to the include folder needed to compile the wheel")
parser.add_argument(
    "--lib-dir",
    default="lib",
    help=
    "Path to the folder containing the libraries needed to compile the wheel")
# When provided, the version gets a "standalone" suffix and the listed folders
# are copied to the package's lib folder (see the code below).
parser.add_argument(
    "--standalone",
    help=("Colon separated list of folders to add to the lib folder of the "
          "sdist / wheel package"))
parser.add_argument("target",
                    choices=targets,
                    help=f"Which target to build: {targets}")
parser.add_argument("--output-dir",
                    default="dist",
                    help="Where to create the packages")
# Parsed at import time: "args" is read by the functions defined below.
args = parser.parse_args()


def get_version_from_requirements(package):
    """Return the version pinned for ``package`` in requirements.txt.

    The first line of ``src_dir``/requirements.txt that contains ``package``
    (substring match) and does not contain 'cpu' is used: anything after a
    ';' is dropped and the text following the last '=' is returned, stripped.
    Returns None when no line qualifies.
    """
    with open(os.path.join(src_dir, 'requirements.txt'), 'r') as reqs:
        for entry in reqs:
            if 'cpu' in entry or package not in entry:
                continue
            # Drop the environment marker (if any) before extracting the
            # version.
            specifier = entry.split(';')[0]
            return specifier.split('=')[-1].strip()

    return None


def get_torch_dependency(package, version):
    """Return the dependency specifier for a torch family package.

    A plain ``package==version`` pin is returned on aarch64 and for every
    target other than "bdist_wheel"; otherwise a direct reference to the
    ``+cpu`` wheel hosted on download.pytorch.org is returned.
    """
    # There is no +cpu variant of Torch on Arm
    #
    # For sdist packages we don't know ahead of time what the python version
    # will be, and there is no support for --find-links so we just have to
    # use the regular torch wheel instead.
    use_plain_pin = ("aarch64" in utils.get_arch_type()
                     or args.target != "bdist_wheel")
    if use_plain_pin:
        return f'{package}=={version}'

    python_tag = f"{get_abbr_impl()}{get_impl_ver()}"
    wheel_file = (
        f"{package}-{version}%2Bcpu-{python_tag}-{get_abi_tag()}-{PLATFORM}.whl"
    )
    return f"{package} @ https://download.pytorch.org/whl/cpu/{wheel_file}"


def get_poptorch_version():
    """Return the package version, tagged "standalone" if requested.

    The long version is loaded through ``utils.PkgInfo``. When --standalone
    was given, "standalone" is appended using '+' as separator, or '_' if the
    version already contains a '+'.
    """
    pkg_info = utils.PkgInfo.load_from_file(must_exist=False, path="..")
    version = pkg_info.version_long
    if args.standalone is None:
        return version

    # Only 1 "+" symbol allowed per version
    separator = "_" if "+" in version else "+"
    return version + separator + "standalone"


# Values substituted for the placeholders of setup.py / pyproject.toml by
# configure() below.
VERSION = get_poptorch_version()

# https://www.python.org/dev/peps/pep-0425/
# The platform tag is simply distutils.util.get_platform() with all hyphens - and periods . replaced with underscore _.
PLATFORM = distutils.util.get_platform().replace(".", "_").replace("-", "_")

torch_ver = utils.get_required_torch_version()
TORCH_DEPENDENCY = get_torch_dependency('torch', torch_ver)

# Must be set before get_version_from_requirements() is called: that function
# reads requirements.txt from src_dir.
src_dir = utils.sources_dir()
# torch{audio, vision} are added here to prevent the torch upgrade when other
# packages depend on torch{audio, vision}.
# NOTE(review): get_version_from_requirements() returns None when the package
# is not listed, which would be formatted as "None" in the dependency string -
# confirm both packages are always present in requirements.txt.
torchaudio_ver = get_version_from_requirements('torchaudio')
TORCHAUDIO_DEPENDENCY = get_torch_dependency('torchaudio', torchaudio_ver)

torchvision_ver = get_version_from_requirements('torchvision')
TORCHVISION_DEPENDENCY = get_torch_dependency('torchvision', torchvision_ver)


# Only keep files of a given extension
class ExtOnly:
    """Callable returning the names which do not end with a kept extension.

    The call signature ``(directory, names)`` matches what the ``ignore``
    argument of ``shutil.copytree`` expects: the returned names are the ones
    to skip.
    """

    def __init__(self, *ext):
        # Tuple of the suffixes to keep
        self.ext = ext

    def _is_ignored(self, file):
        # str.endswith() accepts a tuple of suffixes (an empty tuple never
        # matches, so everything is ignored in that case).
        return not file.endswith(self.ext)

    def __call__(self, adir, filenames):
        # Return the files to ignore
        return [name for name in filenames if self._is_ignored(name)]


# All the paths are resolved here because the packaging step below changes
# the working directory.
include_dir = os.path.realpath(args.include_dir)
# The regular lib folder comes first, followed by the (colon separated)
# --standalone folders if any.
lib_dirs = [os.path.realpath(args.lib_dir)]
if args.standalone is not None:
    lib_dirs += [os.path.realpath(l) for l in args.standalone.split(":")]
output_dir = os.path.realpath(args.output_dir)
python_dir = os.path.realpath(args.python_dir)


def configure(src_filename, dst_filename):
    """Copy ``src_filename`` to ``dst_filename`` expanding the placeholders.

    Every ``@NAME@`` placeholder listed below is replaced, line by line, by
    the corresponding module-level value. Other text is copied unchanged.

    Both files are managed by a ``with`` statement so they are closed
    deterministically, and the source is opened first so that a missing
    source does not leave an empty destination file behind.
    """
    # Applied in this order on every line.
    substitutions = (
        ("@VERSION@", VERSION),
        ("@PLATFORM@", PLATFORM),
        ("@TORCH_DEPENDENCY@", TORCH_DEPENDENCY),
        ("@TORCHAUDIO_DEPENDENCY@", TORCHAUDIO_DEPENDENCY),
        ("@TORCHVISION_DEPENDENCY@", TORCHVISION_DEPENDENCY),
    )
    with open(src_filename) as src, open(dst_filename, "w") as dst:
        for line in src:
            for placeholder, value in substitutions:
                line = line.replace(placeholder, value)
            dst.write(line)


# Create a temporary directory and copy the files to package to it.
with tempfile.TemporaryDirectory() as tmp_dir:
    os.chdir(tmp_dir)
    # Only the .cpp files of the python folder are copied to src/
    shutil.copytree(os.path.join(src_dir, "python"),
                    "src",
                    ignore=ExtOnly(".cpp"))
    shutil.copytree(python_dir, "poptorch")
    # distutils won't throw an exception if the destination already exists,
    # which will happen if lib_dirs contains more than one element.
    for lib_dir in lib_dirs:
        distutils.dir_util.copy_tree(lib_dir, "poptorch/lib")
    shutil.copytree(include_dir, "include")
    shutil.copy(os.path.join(src_dir, "MANIFEST.in"), ".")
    shutil.copy(os.path.join(src_dir, 'setup.cfg'), '.')
    shutil.copy(os.path.join(src_dir, 'License.txt'), '.')
    shutil.copy(os.path.join(src_dir, 'poptorch_third_party_licenses.txt'),
                '.')

    # Both files are templates: expand their placeholders while copying them.
    configure(os.path.join(src_dir, "setup.py"), "setup.py")
    configure(os.path.join(src_dir, "pyproject.toml"), "pyproject.toml")

    # distutils doesn't like spaces in CXX (https://github.com/mapnik/python-mapnik/issues/99#issuecomment-527591113)
    env = {**os.environ}
    cc = env.get("CC", "gcc")
    cxx = env.get("CXX", "g++")
    # Only keep the real compiler: e.g "cmake gcc" -> "gcc"
    cc = cc.split(" ")[-1]
    cxx = cxx.split(" ")[-1]
    env["CXX"] = cxx
    env["CC"] = cc
    start = datetime.datetime.now()
    # The commands are built as argument lists (rather than by splitting a
    # formatted string) so that an output directory containing whitespace is
    # still passed to setup.py as a single argument.
    if args.target == "install":
        subprocess.check_call(
            ["python3", "setup.py", "build_ext", "-b", output_dir], env=env)
        # Replace the installed lib folder by the .so files of poptorch/lib
        dst_dir = f"{output_dir}/poptorch/lib"
        if os.path.isdir(dst_dir):
            shutil.rmtree(dst_dir)
        shutil.copytree("poptorch/lib", dst_dir, ignore=ExtOnly(".so"))
    else:
        command = ["python3", "setup.py", args.target, "-d", output_dir]
        if args.target == "sdist":
            command.append("--formats=zip")
        subprocess.check_call(command, env=env)
    print(f"Time to generate {args.target} in {output_dir} : "
          f"{datetime.datetime.now()-start}")


================================================
FILE: scripts/popgen/__init__.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import enum
import sys
from popgen import onnx

# Executed as a side effect of importing the popgen package: initialise the
# popgen.onnx helper module and run its signature parsing.
onnx.init()
onnx.parse_signatures()


# Qualifier of a generated C++ "auto" declaration:
# Value.emit_assign_return() writes "*" for PTR and "&" for REF.
class PtrOrRef(enum.Enum):
    PTR = 0
    REF = 1


# Root class for all expressions - the result of applying an operator
# to a list of arguments
class Value:
    # Value(op, args, const=False, ptr_or_ref=None)
    #
    # Parameters:
    #   op - name of the operator (None emits a nullptr, see emit())
    #   args - list of arguments; float literals are replaced in place by
    #          ConstantFloat objects
    #   const, ptr_or_ref - stored on the object as is
    def __init__(self, op, args, const=False, ptr_or_ref=None):
        assert isinstance(args, list), \
               "args should be a list in Value::__init__"

        self.op = op
        self.args = args
        self.cname = ""
        self.graph_arity = None
        self.annotation = []
        self.const = const
        self.ptr_or_ref = ptr_or_ref

        # wrap float literals so that they can be used directly in
        # expressions (self.args and args are the same list)
        for position, literal in enumerate(args):
            if isinstance(literal, float):
                args[position] = ConstantFloat(literal)

        # when True the tensor arguments are emitted between braces
        # (i.e. as an initializer list)
        self.tensor_braces = True

    # Operator overloading: every operator builds a new Value.
    # __eq__ is deliberately not provided: defining it would make the
    # objects unhashable (use equal() instead).
    def __add__(self, other):
        return Value('add', [self, other])

    def __ge__(self, other):
        return Value('logical_or', [self > other, self.equal(other)])

    def __gt__(self, other):
        return Value('greater', [self, other])

    def __le__(self, other):
        return Value('logical_or', [self < other, self.equal(other)])

    def __lt__(self, other):
        return Value('less', [self, other])

    def __mul__(self, other):
        return Value('mul', [self, other])

    def __ne__(self, other):
        return Value('logical_not', [self.equal(other)])

    def __neg__(self):
        return Value('neg', [self])

    def __sub__(self, other):
        return Value('sub', [self, other])

    def __truediv__(self, other):
        return Value('div', [self, other])

    # Reflected operators: self is the right hand side operand.
    def __radd__(self, other):
        return Value('add', [other, self])

    def __rmul__(self, other):
        return Value('mul', [other, self])

    def __rsub__(self, other):
        return Value('sub', [other, self])

    def __rtruediv__(self, other):
        return Value('div', [other, self])

    def equal(self, other):
        return Value('equal', [self, other])

    def set_graph_arity(self, arity):
        self.graph_arity = arity

    # annotate(annot)
    #
    # Record a line to be written before the code emitted for this value
    def annotate(self, annot):
        self.annotation.append(annot)

    # emit(values, val_id, tabs, f, root)
    #
    # Emits the C++ code creating this value (arguments first).
    # Parameters:
    #   values - map of the Value objects already emitted to their C++ name
    #   val_id - the index of the first available temp variable
    #   tabs - indentation string
    #   f - output stream
    #   root - True: emit a return statement instead of an assignment
    # Returns: index of the next available temp variable
    def emit(self, values, val_id, tabs, f=sys.stdout, root=False):
        # Already emitted: nothing to do
        if self in values:
            return val_id

        val_id = self.emit_arguments(values, val_id, tabs, f)
        self.emit_annotations(tabs, f)

        # Find where the arguments are split: everything up to (and
        # including) the last argument which is not a NonTensorValue is a
        # tensor, the trailing arguments are non-tensors.
        if self.args and not isinstance(self.args[0], NonTensorValue):
            split = len(self.args)
            while isinstance(self.args[split - 1], NonTensorValue):
                split -= 1
        else:
            # No argument or a leading non-tensor: no tensor list at all
            split = 0
            self.tensor_braces = False
        tensors = [values[arg] for arg in self.args[:split]]
        non_tensors = [values[arg] for arg in self.args[split:]]

        suffix = ";\n" if root else "->output();\n"

        val_id = self.emit_assign_return(values,
                                         val_id,
                                         root,
                                         tabs,
                                         f,
                                         ptr_or_ref=PtrOrRef.PTR)

        if self.op is None:
            f.write("nullptr" + suffix)
            return val_id

        opening = ["{"] if self.tensor_braces else []
        closing = ["}"] if self.tensor_braces else []
        function_name = "create" + self.op[0].upper() + self.op[1:]
        call_args = ["graph"] + opening + tensors + closing + non_tensors
        self.emit_call(function_name, call_args, suffix, f)

        return val_id

    # emit_arguments(values, val_id, tabs, f)
    #
    # Emits the C++ code of the arguments of this value
    # Parameters:
    #   values - map of the Value objects already emitted to their C++ name
    #   val_id - the index of the first available temp variable
    #   tabs - indentation string
    #   f - output stream
    # Returns: index of the next available temp variable
    def emit_arguments(self, values, val_id, tabs, f):
        next_id = val_id
        for argument in self.args:
            next_id = argument.emit(values, next_id, tabs, f, False)
        return next_id

    # emit_annotations(tabs, f)
    #
    # Writes the recorded annotations, one per line
    # Parameters:
    #   tabs - indentation string
    #   f - output stream
    def emit_annotations(self, tabs, f):
        for note in self.annotation:
            f.write(tabs + note + "\n")

    # emit_assign_return(values, val_id, root, tabs, f, const, ptr_or_ref)
    #
    # Emits the beginning of either a return statement or an assignment
    # Parameters:
    #   values - map of the Value objects already emitted to their C++ name
    #   val_id - the index of the first available temp variable, or a string
    #            to be used as variable name
    #   root - True: emit "return " and leave values untouched
    #   tabs - indentation string
    #   f - output stream
    #   const - True: declare the variable const
    #   ptr_or_ref - PtrOrRef.PTR / PtrOrRef.REF: declare a pointer / reference
    # Returns: index of the next available temp variable
    def emit_assign_return(self,
                           values,
                           val_id,
                           root,
                           tabs,
                           f,
                           const=False,
                           ptr_or_ref=None):
        if root:
            f.write(tabs + "return ")
            return val_id

        # A string is used as is, an index names the next temporary
        if isinstance(val_id, str):
            name = val_id
        else:
            name = "t" + str(val_id)
            val_id += 1
        values[self] = name

        if ptr_or_ref == PtrOrRef.PTR:
            declarator = "*"
        elif ptr_or_ref == PtrOrRef.REF:
            declarator = "&"
        else:
            declarator = ""

        qualifier = "const " if const else ""

        f.write(tabs + qualifier + "auto " + declarator + name + " = ")
        return val_id

    # emit_call(fname, args, suffix, f)
    #
    # Emit a function call
    # Parameters:
    #   fname - function name
    #   args - arguments as list of strings
    #   suffix - string to append after the call
    #   f - output stream
    def emit_call(self, fname, args, suffix, f):
        f.write(fname + "(")
        is_first = True
        previous = None
        for token in args:
            # No separator right after "{" nor right before "}"
            if not is_first and token != "}" and previous != "{":
                f.write(", ")
            f.write(token)
            previous = token
            is_first = False
        f.write(')' + suffix)

    # vn()
    #
    # Return a value number for this object
    # Returns: tuple(operator, value numbers of arguments)
    def vn(self):
        return (self.op, *(argument.vn() for argument in self.args))

    # same(other)
    #
    # Returns True if the other operator is a potential match for this one
    def same(self, other):
        return other.op == self.op

    # render()
    #
    # Returns a string image of this object. Used for C++ annotations.
    def render(self):
        if self.op is None:
            return "<pass through>"
        rendered_args = ', '.join(argument.render() for argument in self.args)
        return self.op + '(' + rendered_args + ')'


# ConstantFloat(val)
#
# Represents a constant floating point value to be used as tensor argument
# Parameters:
#   val - the floating point constant
class ConstantFloat(Value):
    def __init__(self, val):
        Value.__init__(self, 'float', [])
        self.val = val

    # emit(values, val_id, tabs, f, root)
    #
    # Emits the C++ code creating the constant. If the object has arguments
    # createConstantFloatLike is called with the C++ name of the first one,
    # otherwise createConstantFloat32 is called.
    # Parameters and return value: see Value.emit
    def emit(self, values, val_id, tabs, f=sys.stdout, root=False):
        if self in values:
            return val_id

        suffix = ";\n" if root else "->output();\n"
        # The value followed by an empty list
        literal = ["{", str(self.val), "}", "{}"]

        has_reference = len(self.args) > 0
        if has_reference:
            val_id = self.emit_arguments(values, val_id, tabs, f)

        val_id = self.emit_assign_return(values,
                                         val_id,
                                         root,
                                         tabs,
                                         f,
                                         ptr_or_ref=PtrOrRef.PTR)

        if has_reference:
            self.emit_call("createConstantFloatLike",
                           ["graph", values[self.args[0]]] + literal, suffix,
                           f)
        else:
            self.emit_call("createConstantFloat32", ["graph"] + literal,
                           suffix, f)
        return val_id

    # The value number of a constant is its string image
    def vn(self):
        return str(self.val)

    # Two constants match if they hold the same value
    def same(self, other):
        return other.op == 'float' and other.val == self.val

    def render(self):
        return str(self.val)


# NonTensorValue(op, args)
#
# Root class for non-tensor values. Value.emit() uses isinstance checks
# against this class to separate the tensor arguments of a call from the
# non-tensor ones.
# Parameters:
#   op - operator
#   args - arguments (must be a list, see Value.__init__)
class NonTensorValue(Value):
    def __init__(self, op, args):
        Value.__init__(self, op, args)


================================================
FILE: scripts/popgen/api.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved
import inspect
from popgen import generator, registry, values
from popgen.operatorfactory import op


# convert(aten, arity, popop=None, swizzles=None)
#
# Registers a conversion rule: the handler is the result of calling the
# operator factory attribute named popop on the (permuted) inputs.
# Parameters:
#   aten - name of the operator to be converted
#   arity - number of inputs of aten
#   popop - popART operator to be generated (None: same as aten)
#   swizzles - list of integer indices representing a permutation of inputs
#              (None: inputs are used in their original order)
def convert(aten, arity, popop=None, swizzles=None):
    target_op = aten if popop is None else popop
    input_order = range(0, arity) if swizzles is None else swizzles

    inputs = []
    for index in input_order:
        assert isinstance(index, int) and index in range(0, arity), \
            "Illegal swizzle for " + aten
        # Input number "index" is named i<index>
        inputs.append(values.InputValue("i" + str(index), index))

    build_handler = getattr(op, target_op)
    registry.add_handler(aten, build_handler(*inputs), arity)


# expand(aten, fn)
#
# Registers an expansion rule by forwarding to registry.expand().
# Parameters:
#   aten - name of operator to be expanded
#   fn - function defining the expansion
# Returns: whatever registry.expand() returns
def expand(aten, fn):
    return registry.expand(aten, fn)


# forward(source, dest)
#
# Registers a forwarding rule. Effect is to forward one operator to the
# handlers of another. The rule is stored in registry.forwardings; an
# operator can only be forwarded once (assertion).
# Parameters:
#   source - name of forwarded operator
#   dest - name of operator whose handlers are to be used
def forward(source, dest):
    assert source not in registry.forwardings, \
        source + " is forwarded twice"
    registry.forwardings[source] = dest


# generate(script, namespace, filename, global_symbols=globals())
#
# Generate C++ code (delegates to generator.generate) and print a reminder to
# run the linters on the generated file.
# Parameters:
#   script - name of the top-level script
#   namespace - the namespace of the operators
#   filename - file to write the code to
#   global_symbols - dictionary of global_symbols from top-level. The default
#                    is evaluated when this function is defined, i.e. it is
#                    the globals() dictionary of this module, not the caller's.
def generate(script, namespace, filename, global_symbols=globals()):
    generator.generate(script, namespace, filename, global_symbols)
    print("File successfully generated, remember to run "
          "'./scripts/apply_linters.py -a " + filename +
          "' before checking the file in")


# simplify(name, fn)
#
# Registers a simplification rule: fn is called with one InputValue per
# parameter of its signature and the resulting pattern is stored, with its
# weight, in registry.complex_ops[name].
# Parameters:
#   name - name of the operator to be created as a string
#   fn - function defining the expression to be matched
def simplify(name, fn):
    # Weight of an expression, i.e. the number of values involved in the
    # pattern. The matcher will use this to break ties - the heaviest
    # pattern is preferred
    def weight(value):
        return 1 + sum(weight(arg) for arg in value.args)

    # One input per parameter of fn, named after the parameter
    parameter_names = inspect.signature(fn).parameters
    inputs = [
        values.InputValue(parameter, position)
        for position, parameter in enumerate(parameter_names)
    ]

    pattern = fn(*inputs)
    registry.complex_ops[name] = (weight(pattern), pattern)


================================================
FILE: scripts/popgen/generator.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved
import datetime
import sys

import re

from popgen import registry, transform


# emit_handlers(namespace, aten, handlers, f=sys.stdout)
#
# Emits the C++ handler function for one operator. When several handlers are
# registered for the operator, each one is wrapped in a check on the number
# of node inputs and a trailing error path is emitted for unmatched arities.
# Parameters:
#   namespace - namespace the operator is in
#   aten - name of the operator
#   handlers - list of handlers. must differ in arity.
#   f - output stream
def emit_handlers(namespace, aten, handlers, f=sys.stdout):
    # Map handed to handler.emit(); cleared before each handler is emitted.
    values = dict()
    opname = get_camel_case_op_name(aten)
    # An arity dispatch is only needed when there is more than one handler.
    emit_arity_check = len(handlers) > 1

    # Function signature: kept on a single line if it fits in 80 columns,
    # otherwise the second parameter is aligned under the first.
    decl = "torch::jit::Node *" + opname +  "Handler(" + \
           "torch::jit::Graph *graph, " + "torch::jit::Node *node) {"
    if len(decl) <= 80:
        f.write(decl + "\n")
    else:
        decl = "torch::jit::Node *" + opname + "Handler("
        f.write(decl + "torch::jit::Graph *graph,\n")
        f.write(" " * len(decl))
        f.write("torch::jit::Node *node) {\n")

    arities = set()
    for handler in handlers:
        # Two handlers with the same arity could not be told apart by the
        # generated input-count check.
        assert handler.graph_arity not in arities, \
               aten + " has multiple handlers with the same arity"
        arities.add(handler.graph_arity)
        values.clear()
        # Transformation pipeline applied to the expression before emission.
        handler = transform.generate_complex_ops(handler)
        handler = transform.value_numbering(handler)
        handler = transform.generate_typed_constants(handler)
        handler.annotate("// " + handler.render())

        if emit_arity_check:
            f.write("  if (node->inputs().size() == " +
                    str(handler.graph_arity) + ") {\n")
            handler.emit(values, 0, "    ", f, True)
            f.write("  }\n")
        else:
            handler.emit(values, 0, "  ", f, True)

    if emit_arity_check:
        # Build "Expecting a, b or c operands, " from the sorted arities.
        arity_list = sorted(list(arities))
        expect_str = "Expecting " + str(arity_list[0])

        for i in range(1, len(arity_list) - 1):
            expect_str += ', ' + str(arity_list[i])

        if len(arity_list) > 1:
            expect_str += ' or ' + str(arity_list[-1])

        if len(arity_list) > 1 or arity_list[0] > 1:
            expect_str += " operands, "
        else:
            expect_str += " operand, "

        # Fallthrough path of the generated function: none of the arity
        # checks matched, so report the error and return nullptr.
        f.write('\n  std::stringstream errmsg;\n')
        f.write('  errmsg << "Incorrect number of arguments for operator ";\n')
        f.write('  errmsg << "' + namespace + '::' + aten + '. ";\n')
        f.write('  errmsg << "' + expect_str + '";\n')
        f.write(
            '  errmsg << "got " << node->inputs().size() << " operand(s).";\n')
        f.write("  ERROR(&errmsg);\n")
        f.write("  return nullptr;\n")

    f.write("}\n\n")


# generate(script, namespace, filename, global_symbols)
#
# Generate a file containing the C++ implementation of handlers: a banner and
# includes, one handler function per registered operator inside an anonymous
# namespace, and a static constructor registering every handler and
# forwarding. The handler and forwarding registries are cleared once the file
# has been written successfully.
# Parameters:
#   script - name of top-level script
#   namespace - the namespace the operators are in
#   filename - the output file
#   global_symbols - dictionary of globals from top-level
def generate(script, namespace, filename, global_symbols):
    now = datetime.datetime.now()

    # The context manager guarantees the file is closed even when emitting a
    # handler or validating a forwarding raises part-way through.
    with open(filename, 'w') as f:
        f.write('// DO NOT EDIT! Generated by ' + script + '\n')
        f.write('// Copyright (c) ' + str(now.year) +
                ' Graphcore Ltd. All rights reserved.\n\n')

        f.write('#include "../PoptorchStaticInit.hpp"\n')
        f.write('#include "../PoptorchSymbols.hpp"\n')
        f.write('#include "PopartCanonicalizationUtils.hpp"\n')
        f.write('#include "poptorch/OpBuilder.hpp"\n')
        f.write('#include "poptorch/Utils.hpp"\n')
        f.write('#include "poptorch_logging/Error.hpp"\n')
        f.write('#include "poptorch_logging/Logging.hpp"\n')

        f.write("\nnamespace poptorch {\n")
        f.write("\nnamespace {\n\n")

        # Pick up every "<name>_handler" callable from the caller's globals,
        # then emit handlers in sorted operator order.
        registry.add_implicit_handlers(global_symbols)
        for (aten, handler) in sorted(registry.handlers.items()):
            emit_handlers(namespace, aten, handler, f)

        f.write("} // namespace\n")

        f.write("\n__attribute__((constructor(HANDLER_INIT_PRIORITY))) ")
        f.write("static void registration() {\n")

        # validate_forwarding() rewrites registry.forwardings values in
        # place, so iterate over a snapshot of the keys.
        for source in list(registry.forwardings.keys()):
            transform.validate_forwarding(source)

        to_register = sorted(
            list(registry.handlers.keys()) +
            list(registry.forwardings.keys()))
        for aten in to_register:
            # A forwarded operator registers the handler of its destination.
            opname = get_camel_case_op_name(
                registry.forwardings.get(aten, aten))
            reg_handler_line = ("  registerHandler(" + namespace + "::" +
                                aten + ", " + opname + "Handler);\n")
            # 81 = 80 columns plus the trailing newline; longer lines are
            # wrapped after the comma.
            if len(reg_handler_line) > 81:
                reg_handler_line = reg_handler_line.replace(
                    ", ", ",\n                  ")
            f.write(reg_handler_line)
        f.write("}\n\n")

        f.write("} // namespace poptorch\n")

    registry.clear()


# get_op_name(aten)
#
# Returns the unqualified name of an operator, i.e. everything after the
# last ':' (the whole string when there is no ':').
# Parameters:
#   aten - the name of the operator
def get_op_name(aten):
    _, _, unqualified = aten.rpartition(':')
    return unqualified


# snake_to_camel_case(snake_case)
#
# Converts a snake_case identifier to camelCase: every underscore that is
# directly followed by an ASCII letter is dropped and the letter is
# upper-cased. Underscores followed by anything else (digits, another
# underscore, end of string) are left untouched.
# Parameters:
#   snake_case - the identifier to convert
def snake_to_camel_case(snake_case):
    # The range must be spelled [A-Za-z]: [A-z] also covers the six
    # punctuation characters between 'Z' and 'a' ("[", "\", "]", "^", "_",
    # "`"), which made "__" collapse into a single underscore.
    return re.sub(r"_[A-Za-z]", lambda m: m.group(0)[1].upper(), snake_case)


# get_camel_case_op_name(aten)
#
# Returns the camelCase name used for the C++ handler of an operator. A
# trailing underscore left after the camelCase conversion is replaced by
# "InPlace".
# Parameters:
#   aten - the name of the operator
def get_camel_case_op_name(aten):
    camel = snake_to_camel_case(get_op_name(aten))
    return re.sub(r"_$", "InPlace", camel)


================================================
FILE: scripts/popgen/helpers.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved
from popgen import PtrOrRef, values


# alpha(m, a)
#
# Generate the alpha computation required for operators that have implicit
# scaling.
# Parameters:
#   m - quantity to be scaled
#   a - scaling factor
# Returns: a values.AlphaValue built from [m, a]
def alpha(m, a):
    return values.AlphaValue([m, a])


# as_ir(v)
#
# Helper that returns a vector of ints as IR constant
# Parameters:
#   v - the input vector
# Returns: a values.Helper named 'AsIr' over [v], bound to
#   'intVectorToIrConstant', with needs_graph=True and PtrOrRef.PTR
def as_ir(v):
    return values.Helper('AsIr', [v],
                         'intVectorToIrConstant',
                         needs_graph=True,
                         ptr_or_ref=PtrOrRef.PTR)


# cint(n)
#
# Generate an integer as a C int literal or variable
# Parameters:
#   n - value to be generated
# Returns: a values.NonTensorConstant named 'cint' bound to 'constantToInt'
def cint(n):
    return values.NonTensorConstant('cint', n, 'constantToInt')


# clong(n)
#
# Generate an integer as a C long literal or variable
# Parameters:
#   n - value to be generated
# Returns: a values.NonTensorConstant named 'clong' bound to 'constantToLong'
def clong(n):
    return values.NonTensorConstant('clong', n, 'constantToLong')


# clong_list(l)
#
# Generate a value as a list of C longs
# Parameters:
#   l - value to be generated
# Returns: a values.NonTensorHelper named 'clong_list' over [l], bound to
#   'constantToLongVec', with expects_node=True
def clong_list(l):
    return values.NonTensorHelper('clong_list', [l],
                                  'constantToLongVec',
                                  expects_node=True)


# cfloat(f)
#
# Generate a floating point as a C float literal or variable
# Parameters:
#   f - value to be generated
# Returns: a values.NonTensorConstant named 'cfloat' bound to
#   'constantToFloat'
def cfloat(f):
    return values.NonTensorConstant('cfloat', f, 'constantToFloat')


# cstr(s)
#
# Generate a string as a C string literal or variable
# Parameters:
#   s - value to be generated
# Returns: a values.NonTensorConstant named 'cstr' bound to
#   'constantToString'
def cstr(s):
    return values.NonTensorConstant('cstr', s, 'constantToString')


# dimension(v, t)
#
# Helper for parameters that are dimensional indices
# Parameters:
#   v - value representing a dimensional index
#   t - tensor type
# Returns: a values.NonTensorHelper named 'dimension' over [v, t], bound to
#   'handleDimensionParam'
def dimension(v, t):
    return values.NonTensorHelper('dimension', [v, t], 'handleDimensionParam')


# dimension_list(t, a=None)
#
# Produces a list with the dimensions of a tensor. Needed for some
# reduction operators.
# Parameters:
#   t - input tensor
#   a - axes vector (optional); only passed on as an argument when not None
# Returns: a values.NonTensorHelper named 'dimension_list' over [t] or
#   [t, a], bound to 'reduceHelperDimensionCreator'
def dimension_list(t, a=None):
    args = [t, a] if a is not None else [t]
    return values.NonTensorHelper('dimension_list', args,
                                  "reduceHelperDimensionCreator")


# empty_initializer()
#
# Helper that produces an empty initializer list
# Returns: a new values.EmptyInitializer
def empty_initializer():
    return values.EmptyInitializer()


# output_shape(idx = 0)
#
# Generate a tensor shape for the output value.
# Parameters:
#   idx - index of output (default: 0)
# Returns: tensor_shape() applied to values.OutputValue(idx)
def output_shape(idx=0):
    return tensor_shape(values.OutputValue(idx))


# output_type(idx = 0)
#
# Generate the expected scalar type for the output value.
# Parameters:
#   idx - index of output (default: 0)
# Returns: scalar_type() applied to values.OutputValue(idx)
def output_type(idx=0):
    return scalar_type(values.OutputValue(idx))


# reduction(r)
#
# Converts reduction type from pytorch to popart
# Parameters:
#   r - integer containing reduction Id
# Returns: a values.NonTensorHelper named 'reduction' over [r], bound to
#   'convertReduceToPopart'
def reduction(r):
    return values.NonTensorHelper('reduction', [r], 'convertReduceToPopart')


# tensor_list(l)
#
# Generate a list of tensors
# Parameters:
#   l - value to be generated
# Returns: a values.Helper named 'TensorList' over [l], bound to
#   'handleTensorList'. The trailing positional True is presumably the
#   needs_graph flag (compare as_ir) - TODO confirm against values.Helper.
def tensor_list(l):
    return values.Helper('TensorList', [l], "handleTensorList", True)


# tensor_long(t)
#
# Change the scalar type of a tensor to Long
# Parameters:
#   t - input tensor
# Returns: a values.CastInPlace named "inplace_cast<long>" over [t] with
#   target type 'at::ScalarType::Long'
def tensor_long(t):
    return values.CastInPlace("inplace_cast<long>", [t],
                              'at::ScalarType::Long')


# tensor_shape(t)
#
# Generate the shape of a tensor as a C++ vector of ints
# Parameters:
#   t - input tensor
# Returns: a values.NonTensorHelper named 'tensor_shape' over [t], bound to
#   'shapeFromTensor'
def tensor_shape(t):
    return values.NonTensorHelper('tensor_shape', [t], "shapeFromTensor")


# tensor_type(t)
#
# Generate the tensor type of the input
# Parameters:
#   t - the input tensor
# Returns: a values.TensorType wrapping t
def tensor_type(t):
    return values.TensorType(t)


# scalar_type(t)
#
# Return the scalar type of a tensor
# Parameters:
#   t - input tensor
# Returns: a values.NonTensorHelper named 'scalar_type' over [t], bound to
#   'getNodeScalarType'
def scalar_type(t):
    return values.NonTensorHelper('scalar_type', [t], 'getNodeScalarType')


================================================
FILE: scripts/popgen/onnx.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved
import logging
import json
import os
import re
from ctypes.util import find_library
import clang.cindex

# Directory containing this module.
current_dir = os.path.dirname(os.path.realpath(__file__))
logger = logging.getLogger('OnnxParser')

# Include directories; both are set by init().
poplar_include_dir = None
popart_include_dir = None
# PopART headers whose declarations are inspected. init() replaces these
# names in place with resolved absolute paths.
popart_files = ["builder.hpp", "builder.gen.hpp"]

# AST node spellings that find_functions() skips entirely.
nodeBlacklist = {
    "DomainOpSet", "Builder", "getOpsetVersion", "AiOnnxOpset11",
    "AiOnnxOpset12"
}


# find_popart_includes()
#
# Locate the PopART include directory by scanning the compile commands of
# the popart_compiler sources in <CONDA_PREFIX>/../compile_commands.json.
# Returns: the first "-isystem" path that matches the PopART pattern
# Raises: RuntimeError if no compile command yields a PopART include path
def find_popart_includes():
    assert "CONDA_PREFIX" in os.environ, ("You need to run this script from "
                                          "inside an activated buildenv")
    build_dir = os.path.join(os.environ["CONDA_PREFIX"], "..")
    compile_commands = os.path.realpath(
        os.path.join(build_dir, "compile_commands.json"))
    assert os.path.isfile(compile_commands), (
        "You need to configure your build "
        "by running cmake")
    with open(compile_commands, "r") as f:
        entries = json.load(f)

    include_rx = re.compile(
        r'.*-isystem (.*popart.*?(/install)?/include) ?.*')
    for entry in entries:
        # Only the popart_compiler translation units are inspected.
        if "popart_compiler" not in entry["file"]:
            continue
        found = include_rx.match(entry["command"])
        if found:
            return found.group(1)

    raise RuntimeError(
        "Failed to find path to PopART in compile_commands.json")


# find_poplar_includes()
#
# Locate the Poplar include directory by scanning the compile commands of
# the popart_compiler sources in <CONDA_PREFIX>/../compile_commands.json.
# Returns: the first "-isystem" path that matches the Poplar pattern
# Raises: RuntimeError if no compile command yields a Poplar include path
def find_poplar_includes():
    assert "CONDA_PREFIX" in os.environ, ("You need to run this script from "
                                          "inside an activated buildenv")
    build_dir = os.path.join(os.environ["CONDA_PREFIX"], "..")
    compile_commands = os.path.realpath(
        os.path.join(build_dir, "compile_commands.json"))
    assert os.path.isfile(compile_commands), (
        "You need to configure your build "
        "by running cmake")
    with open(compile_commands, "r") as f:
        entries = json.load(f)

    include_rx = re.compile(
        r'.*-isystem (.*poplar.*?(/install)?/include) ?.*')
    for entry in entries:
        # Only the popart_compiler translation units are inspected.
        if "popart_compiler" not in entry["file"]:
            continue
        found = include_rx.match(entry["command"])
        if found:
            return found.group(1)

    raise RuntimeError(
        "Failed to find path to Poplar in compile_commands.json")


# init(popart_path=None, poplar_path=None, clang_path=None, debug=False)
#
# Initialize parser module and logging object
# Parameters:
#   popart_path - path to popART headers (default: autodetect)
#   poplar_path - path to poplar headers (default: autodetect)
#   clang_path - path to clang shared object (default: autodetect)
#   debug - True: enable debug logging
def init(popart_path=None, poplar_path=None, clang_path=None, debug=False):
    global poplar_include_dir
    global popart_include_dir
    logging_level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(level=logging_level)

    if popart_path is None:
        popart_include_dir = find_popart_includes()
    else:
        # A user-supplied path must actually contain popart/builder.hpp.
        has_builder = os.path.isfile(
            os.path.join(popart_path, "popart", "builder.hpp"))
        assert has_builder, ("Unable to locate popART's popart/builder.hpp "
                             "in " + popart_path)
        popart_include_dir = popart_path

    if poplar_path is None:
        poplar_include_dir = find_poplar_includes()
    else:
        poplar_include_dir = poplar_path
    logger.info('Will pick up poplar headers from: %s', poplar_include_dir)
    logger.info('Will pick up popART headers from: %s', popart_include_dir)
    # popart_files is rewritten in place, so after a first call its entries
    # are absolute paths. Re-joining only the basename keeps a later call
    # with a different include directory from silently keeping stale paths
    # (os.path.join discards everything before an absolute component).
    for (i, fname) in enumerate(popart_files):
        popart_files[i] = os.path.realpath(
            os.path.join(popart_include_dir, "popart",
                         os.path.basename(fname)))

    if clang.cindex.Config.loaded:
        # Already initialised
        return

    if clang_path is None:
        # Probe for a versioned libclang, newest first.
        for version in [9, 8, 7, 6]:
            logger.debug('Trying to find: clang-%s', str(version))
            clang_path = find_library('clang-' + str(version))
            if clang_path is not None:
                break

    assert clang_path is not None, 'Could not find clang'
    logger.info('Will use clang: %s', clang_path)
    clang.cindex.Config.set_library_file(clang_path)


# find_functions(jsonOutput, node, namespace="")
#
# Locate public method declarations starting from an AST node and record
# them as jsonOutput[namespace][method_name] = {"type": ..., "args": [...]}.
# Parameters:
#   jsonOutput - dictionary the discovered functions are written into
#   node - the AST node
#   namespace - name of the enclosing class declaration ("" at top level)
def find_functions(jsonOutput, node, namespace=""):
    # Skip nodes that come from a file other than the popART headers being
    # parsed.
    source_file = node.location.file
    if source_file is not None:
        if os.path.realpath(str(source_file)) not in popart_files:
            return
    if node.spelling in nodeBlacklist:
        return

    cursor_kind = clang.cindex.CursorKind
    if node.kind == cursor_kind.CLASS_DECL:
        namespace = node.spelling

    # Anything that is not a method is only traversed.
    if node.kind != cursor_kind.CXX_METHOD:
        for child in node.get_children():
            find_functions(jsonOutput, child, namespace)
        return

    if node.access_specifier != clang.cindex.AccessSpecifier.PUBLIC:
        return

    # Keep the parameter declarations, leaving out the 'name' argument;
    # the remaining parameters are numbered consecutively from zero.
    params = [
        child for child in node.get_children()
        if child.kind == cursor_kind.PARM_DECL and child.spelling != 'name'
    ]
    arguments = [{
        "type": param.type.spelling,
        "name": param.spelling,
        "num": position
    } for position, param in enumerate(params)]

    # The type spelling of a method is "<return type>(<params>)"; keep the
    # part in front of the first parenthesis.
    operation = {
        "type": str(node.type.spelling).split("(")[0],
        "args": arguments,
    }
    jsonOutput.setdefault(namespace, {})[node.spelling] = operation


# parse()
#
# Parse popART header files and extract onnx operator information. Only
# classes whose name starts with "Ai" are kept, and a function that appears
# in several of them is kept only in the class visited first when walking
# the classes in reverse discovery order.
# Returns:
#   Map of operators, return types and arguments
def parse():
    index = clang.cindex.Index.create()

    path = os.path.realpath(
        os.path.join(popart_include_dir, "popart", "builder.hpp"))
    logger.info('Parsing: %s', path)
    tu = index.parse(path,
                     args=[
                         "-std=c++14", "-I" + popart_include_dir,
                         "-I" + poplar_include_dir, "-DONNX_NAMESPACE=onnx"
                     ])

    for diag in tu.diagnostics:
        logger.warning(diag)

    # Named 'operators' rather than 'json' so the imported json module is
    # not shadowed inside this function.
    operators = dict()
    find_functions(operators, tu.cursor)

    # Iterate over a snapshot of the keys: entries are deleted inside the
    # loop, and deleting from a dict while iterating it directly raises
    # RuntimeError.
    classes = []
    for name in list(operators):
        if name.startswith("Ai"):
            classes.append(name)
        else:
            del operators[name]

    classes.reverse()
    added_functions = set()

    for opset in classes:
        # Names within one opset are unique, so duplicates can be computed
        # against the functions seen in previously visited opsets only.
        duplicates = [
            name for name in operators[opset] if name in added_functions
        ]
        added_functions.update(operators[opset])

        for name in duplicates:
            operators[opset].pop(name)

    return operators


# Operator name -> list of expected arguments; filled by parse_signatures().
signatures = dict()


# parse_signatures()
#
# Run parse() and translate the C++ argument types of every operator into
# the helper names accepted for that argument. An argument named 'args'
# becomes the marker 'Args'; arguments whose type maps to 'ignore' are left
# out. The result is stored in the module-level `signatures` dictionary.
def parse_signatures():
    parsed = parse()

    type_map = {
        'char': ['cstr'],
        'bool': ['cint'],
        'float': ['cfloat'],
        'int64_t': ['clong', 'dimension'],
        'int': ['cint'],
        'unsigned int': ['cint'],
        'std::string': ['cstr'],
        'std::vector<float>': ['cfloat_list', 'empty_initializer'],
        'std::vector<int64_t>': ['clong_list', 'empty_initializer'],
        'std::vector<std::string>': ['cstr_list', 'empty_initializer'],
        'nonstd::optional<float>': ['cfloat', 'None'],
        'nonstd::optional<int>': ['cint', 'None'],
        'nonstd::optional<int64_t>': ['clong', 'None'],
        'nonstd::optional<std::string>': ['cstr', 'None'],
        'nonstd::optional<std::vector<int64_t> >':
        ['clong_list', 'dimension_list', 'None'],
        'nonstd::optional<CollectiveOperator>':
        'ignore',
        'nonstd::optional<CommGroup>':
        'ignore',
        'Attributes::Float': ['cfloat'],
        'Attributes::Int': ['clong'],
        'Attributes::Ints': ['clong_list', 'empty_initializer'],
        'popart::ReductionType': ['cint', 'reduction'],
        'popart::ScatterReduction': ['cint', 'scatter_reduction'],
        'popart::Builder':
        'ignore',
        'popart::ConstVoidData':
        'ignore',
        'popart::MultiConvDilations':
        'ignore',
        'popart::MultiConvInputs':
        'ignore',
        'popart::MultiConvPads':
        'ignore',
        'popart::MultiConvStrides':
        'ignore',
        'popart::TensorId':
        'ignore',
        'popart::DebugContext':
        'popart::DebugContext',
    }

    # Classes are visited in reverse order of the parsed dictionary.
    for classname in reversed(list(parsed)):
        for op, description in parsed[classname].items():
            arglist = []
            for arg in description['args']:
                name = arg['name']
                # Strip const qualifiers and references before the lookup.
                ty = arg['type'].replace('const ', '').replace(' &', '')

                if name == 'args':
                    arglist.append('Args')
                    continue

                assert ty in type_map, "Unsupported type " + ty + \
                    " in onnx.parse_signatures()"

                accepted = type_map[ty]
                if accepted != 'ignore':
                    arglist.append(accepted)

            signatures[op] = arglist


================================================
FILE: scripts/popgen/operatorfactory.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved
from popgen import NonTensorValue, Value, onnx, poptorch
from popgen.helpers import empty_initializer


# no_tensor_braces(v)
#
# Modifier for values that take tensors without initializer list braces.
# Sets v.tensor_braces to False on the value passed in (the value is
# modified in place).
# Parameters:
#   v - the input value
# Returns: the same value v
def no_tensor_braces(v):
    v.tensor_braces = False
    return v


# check_operator_signature(value, signatures)
#
# Verify an operator has correct signature. Only signatures whose first
# entry is 'Args' are checked; for any other signature the value is
# returned unchecked.
# Parameters:
#   value - the operator
#   signatures - signatures' dictionary
# Returns: value, unchanged
# Raises: AssertionError if the operator is unknown, has no leading tensor
#   argument, or has a non-tensor argument that is not accepted
def check_operator_signature(value, signatures):
    assert value.op in signatures, \
        str(value.op) + " is not a supported operator"

    actual_args = value.args
    expected_args = signatures[value.op]

    # Find the index of the first non-tensor argument. Empty initializers
    # are skipped while searching.
    first_non_tensor = -1
    if expected_args[0] == 'Args':
        for i, arg in enumerate(actual_args):
            if arg.op == 'empty_initializer':
                continue
            if isinstance(arg, NonTensorValue):
                first_non_tensor = i
                break

        assert first_non_tensor != 0, 'Expecting at least 1 tensor ' + \
            'argument for ' + value.op

    # no non-tensor arguments
    if first_non_tensor == -1:
        return value

    # check non-tensor arguments: drop the leading 'Args' marker and the
    # leading tensor operands (the slice also copies the list, so value.args
    # is not modified by the padding below).
    expected_args = expected_args[1:]
    actual_args = actual_args[first_non_tensor:]

    # assume any missing arguments are optional
    # NOTE(review): the range starts at 1, so one fewer 'None' is appended
    # than there are missing arguments and the last expected argument is
    # never checked when omitted - confirm this is intentional before
    # changing it.
    for i in range(1, len(expected_args) - len(actual_args)):
        actual_args.append('None')

    for i, arg in enumerate(actual_args):
        # Values are compared by their operator name; padding entries are
        # already plain strings.
        if isinstance(arg, Value):
            arg = arg.op
        assert arg in expected_args[i], 'Incorrect operand ' + str(i) + \
            ' for ' + value.op + '. Got ' + arg + ', expecting ' + \
            'one of: ' + str(expected_args[i])

    return value


# Factory class for creating popArt ops. Operators are created
# on the fly based on spelling of attributes: any attribute that is not an
# explicit method below is looked up in the onnx signatures first and in the
# poptorch signatures second.
class OperatorFactory:
    def __getattr__(self, name):
        if name in onnx.signatures:

            def make_onnx_op(*args):
                return check_operator_signature(Value(name, list(args)),
                                                onnx.signatures)

            return make_onnx_op

        if name in poptorch.signatures:

            def make_poptorch_op(*args):
                return check_operator_signature(Value(name, list(args)),
                                                poptorch.signatures)

            return make_poptorch_op

        raise ValueError(name + " is not a supported operator")

    # Shared body of the explicit poptorch operators: build the value
    # without tensor braces, check it against the poptorch signatures and
    # return it.
    def _checked_unbraced(self, name, operands):
        value = no_tensor_braces(Value(name, operands))
        check_operator_signature(value, poptorch.signatures)
        return value

    def cast(self, t, ty):
        return self._checked_unbraced('cast', [t, ty])

    def internalCast(self, t, ty):
        return self._checked_unbraced('internalCast', [t, ty])

    def constantPad(self, x, l, c):
        return self._checked_unbraced('constantPad', [x, l, c])

    def edgePad(self, t, l):
        return self._checked_unbraced('edgePad', [t, l])

    def printIpuTensor(self, t, s):
        return self._checked_unbraced('printIpuTensor', [t, s])

    def callCpuOp(self, t, s, n):
        return self._checked_unbraced('callCpuOp', [t, s, n])

    # transpose is checked against the onnx signatures and keeps its tensor
    # braces; the second operand is always an empty initializer.
    def transpose(self, t):
        value = Value('transpose', [t, empty_initializer()])
        check_operator_signature(value, onnx.signatures)
        return value

    # Unlike randomUniform, randomNormal keeps its tensor braces.
    def randomNormal(self, x, shape, high, low, scalar_type=None):
        optional = [] if scalar_type is None else [scalar_type]
        value = Value('randomNormal', [x, shape, high, low] + optional)
        check_operator_signature(value, poptorch.signatures)
        return value

    def randomUniform(self, x, shape, high, low, scalar_type=None):
        optional = [] if scalar_type is None else [scalar_type]
        return self._checked_unbraced('randomUniform',
                                      [x, shape, high, low] + optional)

    def recomputationCheckpoint(self, x):
        return self._checked_unbraced('recomputationCheckpoint', [x])

    def reflectionPad(self, t, l):
        return self._checked_unbraced('reflectionPad', [t, l])

    def setAvailableMemory(self, x, y):
        return self._checked_unbraced('setAvailableMemory', [x, y])

    def setMatMulSerialization(self, x, s, a, b):
        return self._checked_unbraced('setMatMulSerialization', [x, s, a, b])

    def startForLoop(self, inputs):
        return self._checked_unbraced('startForLoop', [inputs])

    def endForLoop(self, output, inputs, trip_count):
        return self._checked_unbraced('endForLoop',
                                      [output, inputs, trip_count])

    # startIfBlock / startElseBlock build the value without running the
    # signature check.
    def startIfBlock(self, condition):
        return no_tensor_braces(Value('startIfBlock', [condition]))

    def startElseBlock(self, outputs_then):
        return no_tensor_braces(Value('startElseBlock', [outputs_then]))

    def endIfBlock(self, outputs_else, condition):
        return self._checked_unbraced('endIfBlock', [outputs_else, condition])

    def passThrough(self):
        return Value(None, [])


# Module-level factory instance.
op = OperatorFactory()


================================================
FILE: scripts/popgen/poptorch.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved

# signatures for manually added operators
#
# Maps an operator name to the list of its expected arguments. A leading
# 'Args' entry is the marker check_operator_signature() looks for before it
# validates the non-tensor arguments; every other entry is the list of
# helper names accepted at that position ('None' marks an argument that may
# be omitted).
signatures = {
    'beginIpuBlock': [['clong'], ['clong'], ['clong']],
    'cast': ['Args', ['scalar_type']],
    'internalCast': ['Args', ['cstr']],
    'constantPad': ['Args', ['clong_list'], ['cfloat']],
    'edgePad': ['Args', ['clong_list']],
    'optimizerGroup': [['clong'], ['tensor_list']],
    'printIpuTensor': ['Args', ['cstr']],
    'callCpuOp': [['tensor_list'], ['cstr'], ['node']],
    'randomNormal': [
        'Args', ['tensor_shape'], ['cfloat'], ['cfloat'],
        ['scalar_type', 'None']
    ],
    'randomUniform': [
        'Args', ['tensor_shape'], ['cfloat'], ['cfloat'],
        ['scalar_type', 'None']
    ],
    'recomputationCheckpoint': ['Args'],
    'reflectionPad': ['Args', ['clong_list']],
    'setAvailableMemory': ['Args', ['cfloat']],
    'setMatMulSerialization': ['Args', ['cstr'], ['clong'], ['cint']],
    'startForLoop': ['Args'],
    'endForLoop': ['Args', ['clong']],
    'startIfBlock': ['Args'],
    'startElseBlock': ['Args'],
    'endIfBlock': ['Args'],
}


================================================
FILE: scripts/popgen/registry.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved
import inspect
import re
from popgen import values

# simplification rules. operator_name -> (weight, pattern)
complex_ops = dict()

# forwardings. from_operator -> to_operator
forwardings = dict()

# operator handlers. operator_name -> list(value), one entry per handler
handlers = dict()


# add_handler(aten, value, arity)
#
# Register a new handler for an operator. Handlers for the same operator
# accumulate in registration order.
# Parameters:
#   aten - name of operator
#   value - root of the expansion expression
#   arity - number of unique graph nodes taken as input
def add_handler(aten, value, arity):
    registered = handlers.setdefault(aten, [])
    value.set_graph_arity(arity)
    registered.append(value)


# add_implicit_handlers(global_symbols)
#
# Inspect global namespace dictionary and register function handlers: every
# callable whose name ends in "_handler" is expanded under the name that
# precedes the suffix.
# Parameters:
#   global_symbols - dictionary of top-level globals
def add_implicit_handlers(global_symbols):
    handler_name = re.compile('(.+)_handler$')
    for name, candidate in global_symbols.items():
        if not callable(candidate):
            continue

        found = handler_name.search(name)
        if found:
            expand(found.group(1), candidate)


# clear(clear_complex_ops = False)
#
# Clears the handlers and forwardings dictionaries; the simplification
# rules are only cleared on request.
# Parameters:
#   clear_complex_ops - clear complex_ops map (default: False)
def clear(clear_complex_ops=False):
    handlers.clear()
    forwardings.clear()
    if clear_complex_ops:
        complex_ops.clear()


# expand(aten, fn)
#
# Registers an expansion rule. fn is called once with one InputValue per
# declared parameter, and the expression it returns becomes a handler whose
# arity is the number of parameters.
# Parameters:
#   aten - name of operator to be expanded
#   fn - function defining the expansion
def expand(aten, fn):
    param_names = inspect.signature(fn).parameters
    inputs = [
        values.InputValue(param, position)
        for position, param in enumerate(param_names)
    ]

    add_handler(aten, fn(*inputs), len(param_names))


================================================
FILE: scripts/popgen/transform.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved

from popgen import registry, Value, ConstantFloat
from popgen.values import InputValue


# generate_complex_ops(value)
#
# Apply simplification rules to the expression rooted at the parameter.
# New values are annotated with the applied transformation.
# Parameters:
#   value - root of an expression
# Returns: root of simplified expression
def generate_complex_ops(value):
    # munch(value, pattern)
    #
    # Attempt to match a pattern to the expression rooted at value
    # Parameters:
    #   value - root of the original expression
    #   pattern - root of the pattern
    # Returns: tuple(match, list([(idx, arg), ...])
    #   match - True / False according to whether matching was successful
    #   (idx, arg) - arguments of the new complex operator
    #       idx - index of pattern's input node
    #       arg - value that is to become an argument with said index
    def munch(value, pattern):
        # A pattern input matches any sub-expression.
        if isinstance(pattern, InputValue):
            return (True, [(pattern.num, value)])
        if not pattern.same(value):
            return (False, None)

        match = True
        new_args = []
        for i, _ in enumerate(pattern.args):
            (match, args) = munch(value.args[i], pattern.args[i])
            if not match:
                new_args = None
                break
            new_args += args

        return (match, new_args)

    # Attempt to match patterns in reverse order of weight and stop at
    # first match. Repeat process recursively. The registry stores
    # (weight, pattern) per rule; sorted() is stable, so rules of equal
    # weight are still tried in registration order.
    heaviest_first = sorted(registry.complex_ops.items(),
                            key=lambda item: item[1][0],
                            reverse=True)
    for name, (_, pattern) in heaviest_first:
        (match, pos_args) = munch(value, pattern)
        if match:
            # Place each matched sub-expression at the position of the
            # pattern input it was bound to.
            new_args = [None] * len(pos_args)
            for (pos, arg) in pos_args:
                new_args[pos] = arg

            new_value = Value(name, new_args)
            new_value.annotation = value.annotation
            new_value.annotate("// matched " + name + ": " + pattern.render())
            return generate_complex_ops(new_value)

    # No rule matched at this node: simplify the arguments instead.
    value.args = [generate_complex_ops(arg) for arg in value.args]
    return value


# generate_typed_constants(value, type_like=None)
#
# When possible, have constants inherit type information from sibling operands.
# This is achieved by attaching a sibling tensor operand as an argument to the
# constant. The emit function should then produce a creation call that borrows
# type information from the argument.
# Parameters:
#   value - root of the expression tree
#   type_like - operand inherited from the parent whose type constants
#               should borrow (default: None, i.e. no type information)
# Returns:
#   value - potentially new root node
def generate_typed_constants(value, type_like=None):
    if isinstance(value, ConstantFloat):
        if type_like is not None:
            value.args.append(type_like)
        return value

    # Candidate operands to borrow a type from: every Value argument that is
    # not itself a float constant.
    args = [
        arg for arg in value.args
        if isinstance(arg, Value) and not isinstance(arg, ConstantFloat)
    ]

    # 'where' is a nasty case where the first tensor is bool! Its second
    # candidate is used instead; when there is none (e.g. both branches are
    # constants) the type_like inherited from the parent is kept rather
    # than indexing past the end of the list.
    if args:
        if value.op != 'where':
            type_like = args[0]
        elif len(args) > 1:
            type_like = args[1]

    for (i, arg) in enumerate(value.args):
        value.args[i] = generate_typed_constants(arg, type_like)

    return value


# value_numbering(value)
#
# Perform value numbering. Any identical values will be merged into a single
# object. The tree rooted at the parameter becomes an acyclic graph.
# Arguments are rewritten in place, children before their parent.
# Parameters:
#   value - root of an expression tree
# Returns: potentially new root of an acyclic graph
def value_numbering(value):
    # value number -> first value seen with that number
    canonical = {}

    def dedupe(node):
        for position, child in enumerate(node.args):
            node.args[position] = dedupe(child)

        # The first value registered under a key wins; later identical
        # values are replaced by it.
        return canonical.setdefault(node.vn(), node)

    return dedupe(value)


# validate_forwarding(source):
#
# Ensure the forwarding of source is sane and resolve any chained rules by closure.
# Parameters:
#   source - name of operator being forwarded
def validate_forwarding(source):
    # Names already seen while following the forwarding chain. This must be
    # the one-element set {source}: set(source) would build a set of the
    # individual characters of the name, so a perfectly valid destination
    # whose name is a single character contained in `source` would be
    # reported as circular.
    visited = {source}
    dest = registry.forwardings[source]

    assert source not in registry.handlers, \
        source + " is both forwarded and handled"

    # Follow the chain until an operator with a real handler is reached.
    while dest not in registry.handlers:
        assert dest in registry.forwardings, \
            source + " forwarded but no handler found"
        assert dest not in visited, source + " has circular forwarding"
        visited.add(dest)
        dest = registry.forwardings[dest]

    # Collapse the chain: forward the source directly to the handled operator.
    registry.forwardings[source] = dest


================================================
FILE: scripts/popgen/values.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved

import sys
from popgen import PtrOrRef, Value, NonTensorValue
from popgen.operatorfactory import op


# AlphaValue(args)
#
# Represents the alpha computation required for operators that perform
# implicit scaling. Its purpose is to avoid a multiplication by unity.
# Parameters:
#   args[0] - value to be scaled
#   args[1] - scaling factor
class AlphaValue(Value):
    def __init__(self, args):
        # Operands are: the value, the scaling factor, and their product.
        # The product is built eagerly so it is available as an argument.
        unscaled, factor = args[0], args[1]
        Value.__init__(self, 'alpha',
                       [unscaled, factor, op.mul(unscaled, factor)])

    def emit(self, values, val_id, tabs, f=sys.stdout, root=False):
        # Already emitted: nothing to do.
        if self in values:
            return val_id

        val_id = self.emit_arguments(values, val_id, tabs, f)
        self.emit_annotations(tabs, f)

        name = f"t{val_id}"
        values[self] = name
        factor = values[self.args[1]]
        unscaled = values[self.args[0]]
        scaled = values[self.args[2]]
        # Select the unscaled operand when the factor is unity, otherwise the
        # product.
        f.write(f"{tabs}auto *{name} = hasUnityValue({factor}) ? "
                f"{unscaled} : {scaled};\n")
        return val_id + 1


# CastInPlace(op, args, to_type)
#
# Represents the operation of switching the scalar type of a tensor.
# It is a cast "in-place" in the sense that it doesn't generate
# any casting nodes.
# Parameters:
#   op - name of operator
#   args - tensor input
#   to_type - name of target type
class CastInPlace(Value):
    def __init__(self, op, args, to_type):
        Value.__init__(self, op, args)
        # Name of the target type, written verbatim into the generated code.
        self.to_type = to_type

    def emit(self, values, val_id, tabs, f=sys.stdout, root=False):
        if self in values:
            return val_id
        val_id = self.emit_arguments(values, val_id, tabs, f)
        self.emit_annotations(tabs, f)

        # Grab the node producing the input and retype its tensor attribute.
        node = f"t{val_id}"
        source = values[self.args[0]]
        f.write(f"{tabs}auto *{node} = {source}->node();\n")
        f.write(f"{tabs}setNodeTensorAttrValue({node}, "
                f"getNodeTensorAttrValue({node}).to({self.to_type}));\n")
        f.write(f"{tabs}{node}->output()->inferTypeFrom("
                f"getNodeTensorAttrValue({node}));\n")

        if root:
            # Root expression: return the retyped output directly.
            f.write(f"{tabs}return {node}->output();\n")
            return val_id + 1

        # Otherwise bind the output to a second temporary for later use.
        result = f"t{val_id + 1}"
        values[self] = result
        f.write(f"{tabs}auto *{result} = {node}->output();\n")
        return val_id + 2


# TensorType(t)
#
# Represents the tensor type of a tensor value.
# Parameters:
#   t - the input tensor
class TensorType(NonTensorValue):
    def __init__(self, t):
        NonTensorValue.__init__(self, 'TensorType', [t])
        # The value whose type is queried.
        self.val = t

    def emit(self, values, val_id, tabs, f=sys.stdout, root=False):
        assert not root, "TensorType cannot be a root expression"
        if self in values:
            return val_id

        val_id = self.emit_arguments(values, val_id, tabs, f)
        self.emit_annotations(tabs, f)

        name = f"t{val_id}"
        values[self] = name
        f.write(f"{tabs}auto {name} = {values[self.val]}"
                "->type()->expect<c10::TensorType>();\n")
        return val_id + 1


# Helper(op, args, method, expects_node=False, needs_graph=False)
#
# A wrapper class for helper methods that return tensors
# Parameters:
#   op - operator
#   args - arguments
#   method - generation method
#   expects_node - True if arguments should be typed Node* instead of Value*
#   needs_graph - method takes pointer to graph object
class Helper(Value):
    def __init__(self,
                 op,
                 args,
                 method,
                 expects_node=False,
                 needs_graph=False,
                 const=False,
                 ptr_or_ref=None):
        super().__init__(op, args, const, ptr_or_ref)
        self.method = method
        self.expects_node = expects_node
        self.needs_graph = needs_graph

    def emit(self, values, val_id, tabs, f=sys.stdout, root=False):
        # Already emitted: nothing to do.
        if self in values:
            return val_id

        val_id = self.emit_arguments(values, val_id, tabs, f)
        self.emit_annotations(tabs, f)

        # Operands are passed either as-is or through ->node(), depending on
        # what the helper method expects.
        suffix = "->node()" if self.expects_node else ""
        call_args = [values[arg] + suffix for arg in self.args]

        # The graph, when required, is always the first argument.
        if self.needs_graph:
            call_args.insert(0, "graph")

        val_id = self.emit_assign_return(values, val_id, root, tabs, f,
                                         self.const, self.ptr_or_ref)
        self.emit_call(self.method, call_args, ";\n", f)
        return val_id


# InputValue(name, num)
#
# Represents an input to an operator
# Parameters:
#   name - name of input
#   num - index of input
class InputValue(Value):
    def __init__(self, name, num):
        Value.__init__(self, 'input', [])
        self.name = name
        self.num = num

    def emit(self, values, val_id, tabs, f=sys.stdout, root=False):
        assert not root, "input values cannot be root expression"
        if self in values:
            return val_id

        # The input's own name is passed where other values pass a numeric
        # id, and the caller's val_id is handed back unchanged.
        self.emit_assign_return(values,
                                self.name,
                                root,
                                tabs,
                                f,
                                ptr_or_ref=PtrOrRef.PTR)
        f.write(f"node->input({self.num});\n")
        return val_id

    def vn(self):
        # Inputs are numbered by name.
        return self.name

    def same(self, other):
        # Every comparison against an input reports a match.
        return True

    def render(self):
        return self.name


# OutputValue(index)
#
# Represents the output value of an operator. This is useful for
# occasions where we need the expected shape of the output.
# Parameters:
#   index - index of output
class OutputValue(Value):
    def __init__(self, index):
        Value.__init__(self, f"output{index}", [])
        self.index = index

    def emit(self, values, val_id, tabs, f=sys.stdout, root=False):
        assert not root, "output values may not be root expressions"
        if self in values:
            return val_id

        next_id = self.emit_assign_return(values,
                                          val_id,
                                          root,
                                          tabs,
                                          f,
                                          ptr_or_ref=PtrOrRef.PTR)
        f.write(f"node->output({self.index});\n")
        # NOTE(review): the id returned by emit_assign_return is incremented
        # again here, whereas Helper.emit returns it unchanged - confirm this
        # is intended.
        return next_id + 1

    def render(self):
        return self.op


# NonTensorConstant(op, val, method)
#
# Represents a constant value that is not a tensor. Supports literals
# as well as graph constants.
# Parameters:
#   val - the constant value
#   method - helper method to be called when the value is not a literal
class NonTensorConstant(NonTensorValue):
    def __init__(self, op, val, method):
        self.val = val
        self.method = method
        # A graph constant is tracked as an argument so that it is emitted
        # before this value; a literal has no arguments.
        if isinstance(val, Value):
            NonTensorValue.__init__(self, op, [val])
        else:
            NonTensorValue.__init__(self, op, [])

        # String literals are stored already wrapped in double quotes, ready
        # to be written into the generated code.
        if isinstance(self.val, str):
            self.val = '"' + self.val + '"'

    def emit(self, values, val_id, tabs, f=sys.stdout, root=False):
        # Use self.op here: inside this method the bare name `op` resolves to
        # the operator factory imported at module level, and concatenating it
        # with a string would raise TypeError instead of reporting the
        # intended assertion message.
        assert not root, self.op + " cannot be a root expression"

        # Literals are inlined verbatim and consume no temporary.
        if not isinstance(self.val, Value):
            values[self] = str(self.val)
            return val_id
        if self in values:
            return val_id

        val_id = self.emit_arguments(values, val_id, tabs, f)
        self.emit_annotations(tabs, f)

        # Graph constants are extracted from the producing node by the helper
        # method supplied at construction.
        val_id = self.emit_assign_return(values, val_id, root, tabs, f)
        self.emit_call(self.method, [values[self.val] + "->node()"], ";\n", f)
        return val_id

    def vn(self):
        # Literals are numbered by their textual form.
        if isinstance(self.val, Value):
            return Value.vn(self)
        return str(self.val)

    def render(self):
        if isinstance(self.val, Value):
            return self.op + "(" + self.val.render() + ")"
        return str(self.val)

    def same(self, other):
        if self.op != other.op or len(self.args) != len(other.args):
            return False
        if isinstance(self.val, Value):
            return self.val.same(other.val)
        return self.render() == other.render()


# NonTensorHelper(op, args, method, expects_node=False, needs_graph=False)
#
# A wrapper class for helper methods that do not return tensors
# Parameters:
#   op - operator
#   args - arguments
#   method - generation method
#   expects_node - True if arguments should be typed Node* instead of Value*
#   needs_graph - method takes pointer to graph object
class NonTensorHelper(NonTensorValue):
    def __init__(self, op, args, method, expects_node=False,
                 needs_graph=False):
        NonTensorValue.__init__(self, op, args)
        self.method = method
        self.expects_node = expects_node
        self.needs_graph = needs_graph

    def emit(self, values, val_id, tabs, f=sys.stdout, root=False):
        # Use self.op here: the bare name `op` resolves to the operator
        # factory imported at module level, and concatenating it with a
        # string would raise TypeError instead of reporting the intended
        # assertion message.
        assert not root, self.op + " helper cannot be a root expression"
        # Emission is identical to Helper's, so its implementation is reused
        # directly on this instance (never as a root expression).
        return Helper.emit(self, values, val_id, tabs, f, False)


# EmptyInitializer()
#
# Helper class that produces an empty initializer list
class EmptyInitializer(NonTensorValue):
    def __init__(self):
        NonTensorValue.__init__(self, "empty_initializer", [])

    def emit(self, values, val_id, tabs, f=sys.stdout, root=False):
        # Nothing is written: the initializer is inlined wherever it is used,
        # so no temporary id is consumed.
        literal = self.render()
        values[self] = literal
        return val_id

    def vn(self):
        # Numbered by its rendered text.
        literal = self.render()
        return literal

    def render(self):
        return "{}"


# OriginalNode()
#
# Represents the node being handled itself; it is emitted as a variable
# named "original_node" bound to `node`.
class OriginalNode(Value):
    def __init__(self):
        Value.__init__(self, 'input', [])
        self.name = "original_node"

    def emit(self, values, val_id, tabs, f=sys.stdout, root=False):
        if self in values:
            return val_id

        # The fixed name is passed where other values pass a numeric id, and
        # the caller's val_id is handed back unchanged.
        self.emit_assign_return(values,
                                self.name,
                                root,
                                tabs,
                                f,
                                ptr_or_ref=PtrOrRef.PTR)
        f.write("node;\n")
        return val_id

    def vn(self):
        # Numbered by its fixed name.
        return self.name

    def same(self, other):
        # Every comparison reports a match.
        return True

    def render(self):
        return self.name


================================================
FILE: scripts/set_version.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import argparse
import logging
import os

from utils import _utils

logger = logging.getLogger(os.path.basename(__file__))
# Make the shared helpers log through this script's logger.
_utils.set_logger(logger)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug",
                        "-d",
                        action="store_true",
                        help="Print debug messages")
    parser.add_argument("--torch-version", type=str)
    parser.add_argument("--input-file", type=str)
    parser.add_argument("output", help="File to create")

    args = parser.parse_args()

    logging_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=logging_level)
    logger.debug("Args: %s", str(args))

    pkg_info = _utils.PkgInfo.load_from_file(must_exist=False)

    # Default template: python/__init__.py from the source tree.
    if args.input_file is None:
        args.input_file = os.path.join(_utils.sources_dir(), "python",
                                       "__init__.py")

    # Copy the content of the input file and replace the occurrences of
    # @VERSION@ / @SNAPSHOT@ with the actual version / snapshot
    # (and @TORCH_VERSION@ when --torch-version was given).
    # The input is opened first so that a missing template does not leave a
    # truncated output file behind, and both files are closed on exit.
    with open(args.input_file) as template, open(args.output, "w") as f:
        for line in template:
            line = line.replace("@VERSION@", pkg_info.version_long)
            line = line.replace("@SNAPSHOT@", pkg_info.snapshot)
            if args.torch_version:
                line = line.replace("@TORCH_VERSION@", args.torch_version)
            f.write(line)


================================================
FILE: scripts/utils/_utils.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import codecs
import enum
import fcntl
import json
import logging
import os
import platform
import re
import shutil
import subprocess

logger = logging.getLogger(__name__)


def get_nprocs():
    """Return the number of CPUs available to this process (at least 1).

    Uses the scheduler affinity mask where the platform provides it and
    falls back to the total CPU count otherwise.
    """
    try:
        return len(os.sched_getaffinity(0))
    except AttributeError:
        # os.sched_getaffinity is not available on every platform.
        return os.cpu_count() or 1


# Make the _utils functions log using the caller's logger instead of the
# default 'utils/_utils.py'
def set_logger(new_logger):
    """Rebind this module's `logger` to `new_logger`."""
    globals()["logger"] = new_logger


def rmdir_if_exists(directory):
    """Recursively delete `directory`; do nothing if it is not a directory."""
    if not os.path.isdir(directory):
        return
    shutil.rmtree(directory)


def rm_if_exists(filename):
    """Delete `filename`; do nothing if it is not an existing regular file."""
    if not os.path.isfile(filename):
        return
    os.remove(filename)


def get_first_line(filename):
    """Return the first line of `filename` with trailing whitespace removed.

    Returns an empty string for an empty file.
    """
    # Context manager so the handle is closed instead of being left to the
    # garbage collector.
    with open(filename, "r") as f:
        return f.readline().rstrip()


def sources_dir():
    """Return the directory three levels above this file's real path."""
    # ./scripts/utils/../../:
    location = os.path.realpath(__file__)
    for _ in range(3):
        location = os.path.dirname(location)
    return location


class OsType(enum.Enum):
    """Operating system families distinguished by these scripts."""
    Osx = "osx"
    Linux = "linux"
    # Anything that is neither of the above.
    Unknown = "unknown"


def get_required_torch_version():
    """Return the version from `set(TORCH_VERSION <x.y.z>)` in CMakeLists.txt.

    Only lines that start with the `set(` call are considered; the first
    match wins.

    Raises:
        RuntimeError: if no such line exists in the top-level CMakeLists.txt.
    """
    cmake_file = os.path.join(sources_dir(), "CMakeLists.txt")
    # Context manager so the file is closed even when returning early from
    # the middle of the loop.
    with open(cmake_file, "r") as f:
        for line in f:
            m = re.match(r"set\(TORCH_VERSION +([0-9.]+)\)", line)
            if m:
                return m.group(1)
    raise RuntimeError("Couldn't find TORCH_VERSION in CMakeLists.txt")


class PkgInfo:
    """Packaging metadata (version, snapshot, OS, document names...).

    Any value that is not provided (or is falsy) is detected from the
    environment. Extra keyword arguments are stored as attributes, which is
    what allows an instance saved with save_to_file() to be rebuilt by
    load_from_file().
    """
    _pkg_info_file = "pkg_info.json"

    def __init__(self,
                 version=None,
                 snapshot=None,
                 os_type=None,
                 package_os_type=None,
                 build_number=None,
                 doc_name=None,
                 project_name=None,
                 **kwargs):
        logger.debug(
            "PkgInfo: user provided version=%s snapshot=%s os_type=%s"
            " package_os_type=%s build_number=%s doc_name=%s "
            "project_name=%s", version, snapshot, os_type, package_os_type,
            build_number, doc_name, project_name)

        # The instance __dict__ is what gets logged and serialised, so the
        # attributes are assigned in a fixed order.
        self.version = version if version else _get_version()
        self.snapshot = snapshot if snapshot else _get_snapshot()

        # Store the OS type as a plain string, whether it was given as an
        # OsType member or detected.
        detected_os = os_type if os_type else get_os_type()
        if isinstance(detected_os, OsType):
            detected_os = detected_os.value
        self.os_type = detected_os

        self.package_os_type = (package_os_type if package_os_type else
                                _get_package_os_type())
        self.doc_name = doc_name if doc_name else "poptorch-user-guide"
        self.poptorch_geometric_doc_name = "poptorch-geometric-user-guide"
        self.project_name = project_name if project_name else "poptorch"
        self.version_long = self.version
        self.poptorch_hash = _get_poptorch_hash()
        # The long version carries the build number as a "+<build>" suffix.
        if build_number:
            self.version_long = self.version_long + "+" + build_number

        # Custom attributes are applied last, so they override the above.
        logger.debug("Adding custom attributes: %s", kwargs)
        self.__dict__.update(kwargs)
        logger.info("PkgInfo initialised: %s", str(self.__dict__))

    def _doc_basename(self, poptorch_geometric):
        """Pick the PopTorch Geometric or the regular document name."""
        if poptorch_geometric:
            return self.poptorch_geometric_doc_name
        return self.doc_name

    def pdf_filename(self, poptorch_geometric=False):
        """Return the file name of the PDF user guide."""
        doc_name = self._doc_basename(poptorch_geometric)
        return f"{doc_name}-{self.version}-{self.snapshot}.pdf"

    def html_filename(self, poptorch_geometric=False):
        """Return the file name of the zipped HTML user guide."""
        doc_name = self._doc_basename(poptorch_geometric)
        return f"{doc_name}-html-{self.version}-{self.snapshot}.zip"

    def prodinfo_filename(self):
        """Return the file name of the product info YAML file."""
        return f"{self.project_name}-{self.version}-{self.snapshot}.yml"

    def save_to_file(self):
        """Dump all the attributes as JSON in the current directory."""
        with open(PkgInfo._pkg_info_file, "w") as f:
            json.dump(self.__dict__, f)

    @staticmethod
    def load_from_file(must_exist=False, path="."):
        """Build a PkgInfo from the JSON file found in `path`.

        If the file is missing, a default PkgInfo() is returned unless
        `must_exist` is set, in which case FileNotFoundError is raised.
        """
        pkg_info_path = os.path.join(path, PkgInfo._pkg_info_file)
        if os.path.exists(pkg_info_path):
            logger.info("Loading packaging options from %s", pkg_info_path)
            with open(pkg_info_path, "r") as f:
                return PkgInfo(**json.load(f))

        if must_exist:
            raise FileNotFoundError(f"{pkg_info_path} not found")
        logger.info("Using default PkgInfo() options")
        return PkgInfo()


def _get_version():
    """Return "major.minor.point" as read from version.json in the sources."""
    # Context manager so the file handle is closed once parsed.
    with open(os.path.join(sources_dir(), "version.json")) as f:
        v = json.load(f)
    return f"{v['major']}.{v['minor']}.{v['point']}"


def _get_view_hash():
    """Return the short (10 characters) HEAD hash of the git repository
    located in the parent directory of the sources, or None if git fails or
    is not installed.
    """
    git_dir = os.path.join(os.path.dirname(sources_dir()), ".git")
    command = [
        "git", "--git-dir", git_dir, "rev-parse", "--short=10", "HEAD"
    ]
    try:
        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
    except (subprocess.CalledProcessError, FileNotFoundError):
        return None
    return output.decode("utf-8").strip()


def _get_poptorch_hash():
    """Return the short (10 characters) HEAD hash of the git repository
    located in the sources directory, or None if git fails or is not
    installed.
    """
    git_dir = os.path.join(sources_dir(), ".git")
    command = [
        "git", "--git-dir", git_dir, "rev-parse", "--short=10", "HEAD"
    ]
    try:
        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
    except (subprocess.CalledProcessError, FileNotFoundError):
        return None
    return output.decode("utf-8").strip()


def _get_snapshot():
    """ Use the view hash if available.
    Use the PopTorch hash as a fallback.
    Use 0000000000 if no git repository is found
    """
    # Candidates in order of preference; a getter is only called if all the
    # previous ones came back empty.
    candidates = (
        (_get_view_hash, "Using View hash %s as snapshot"),
        (_get_poptorch_hash, "Using PopTorch hash %s as snapshot"),
    )
    for getter, message in candidates:
        snapshot = getter()
        if snapshot:
            logger.debug(message, snapshot)
            return snapshot

    logger.debug("No git hash found to use as snapshot")
    return "0000000000"


def _get_package_os_type():
    """Return "<distribution>_<version>" as described by /etc/os-release.

    The distribution comes from the ID field and the version from the
    VERSION_ID field, with dots replaced by underscores and quotes removed.
    Returns "unknown_unknown" if /etc/os-release does not exist.

    Raises:
        AssertionError: if the file exists but one of the fields is missing
            or empty.
    """
    distrib = None
    version = None
    try:
        # Context manager so the file handle is closed once parsed.
        with open("/etc/os-release", "r") as f:
            for line in f:
                # Split on the first "=" only so a value containing "=" is
                # kept whole.
                if line.startswith("ID="):
                    distrib = line.split("=", 1)[1].rstrip()
                    distrib = distrib.replace('"', "")
                elif line.startswith("VERSION_ID="):
                    version = line.split("=", 1)[1]
                    version = version.replace(".", "_")
                    version = version.replace('"', "").rstrip()
    except FileNotFoundError as exc:
        logger.warning("Setting distro/version to \"unknown\" because: %s",
                       exc)
        distrib = "unknown"
        version = "unknown"
    assert distrib and version
    return f"{distrib}_{version}"


def get_arch_type():
    """Return the machine type, asserting it is "aarch64" or "x86_64"."""
    machine = platform.machine()
    supported = ["aarch64", "x86_64"]
    assert machine in supported
    return machine


def get_os_type():
    """Map the platform's system name to an OsType member."""
    known_systems = {
        "Darwin": OsType.Osx,
        "Linux": OsType.Linux,
    }
    # Any other system name is reported as Unknown.
    return known_systems.get(platform.uname().system, OsType.Unknown)


def _make_output_non_blocking(output):
    fd = output.fileno()
    fl = fcntl.fcntl(fd, fcntl.F_GETFL)
    fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
    return output


class _LinesProcessor:
    def __init__(self, printer_fn):
        self.printer_fn = printer_fn
        self.partial_line = ""

    def _is_full_line(self, line):
        return line[-1] == "\n"

    def process(self, lines, flush=False):
        """ Due to buffering we need to check if lines
        are actual lines or just fragment of lines (in which case we
        wait until we've got the whole line available to print it).
        """
        if lines is None:
            lines = ""
        else:
            lines = lines.decode("utf-8")
        lines = lines.split("\n")
        lines[0] = self.partial_line + lines[0]
        self.partial_line = lines[-1]
        for line in lines[:-1]:
            self.printer_fn(line)
        if flush and self.partial_line:
            self.printer_fn(self.partial_line)
            self.partial_line = ""


class Process:
    """Run a command through /bin/bash and forward its output line by line.

    stdin, stdout and (unless redirected to stdout) stderr are piped. The
    output pipes are made non-blocking and drained every time is_running()
    is called; each complete line is passed to the matching handler.
    """

    def __init__(
            self,
            cmd,  # NB as shell=True, shlex.quote is needed for filenames
            env=None,
            redirect_stderr=False,
            stdout_handler=None,
            stderr_handler=None,
            bufsize=-1):
        """Start the process.

        Parameters:
            cmd - command passed to subprocess.Popen with shell=True
            env - environment passed to subprocess.Popen
            redirect_stderr - merge stderr into stdout (in which case no
                              stderr_handler may be provided)
            stdout_handler - called with each line of stdout
                             (default: logger.info)
            stderr_handler - called with each line of stderr
                             (default: logger.error)
            bufsize - passed to subprocess.Popen
        """
        if redirect_stderr:
            assert stderr_handler is None, ("You can't have a stderr handler "
                                            "when it's redirected to stdout")
            stderr = subprocess.STDOUT
        else:
            stderr = subprocess.PIPE

        self.p = subprocess.Popen(cmd,
                                  shell=True,
                                  env=env,
                                  executable='/bin/bash',
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  stderr=stderr,
                                  bufsize=bufsize)
        _make_output_non_blocking(self.p.stdout)
        self.stdout = _LinesProcessor(stdout_handler or logger.info)
        # Stays None when stderr is merged into stdout.
        self.stderr = None
        self.is_alive = True
        # Only set once the process has finished and its pipes were drained.
        self._returncode = None
        if not redirect_stderr:
            _make_output_non_blocking(self.p.stderr)
            self.stderr = _LinesProcessor(stderr_handler or logger.error)

    def _read(self):
        """Forward whatever output is currently available to the handlers."""
        # If it's the last time _read is called (i.e is_alive is now False)
        # then flush the pipes and close them
        if self.stderr:
            self.stderr.process(self.p.stderr.read(), not self.is_alive)
            if not self.is_alive:
                self.p.stderr.close()
        self.stdout.process(self.p.stdout.read(), not self.is_alive)
        if not self.is_alive:
            self.p.stdout.close()
            # Keep the return code: the Popen object is dropped, so self.p
            # cannot be used (e.g. by eof() or write()) after this point.
            self._returncode = self.p.returncode
            del self.p

    def eof(self):
        """Close the stdin of the process."""
        self.p.stdin.close()

    def is_running(self):
        """Poll the process, drain its output and return whether it is
        still running."""
        # Already finished: the pipes are closed, nothing left to read.
        if not self.is_alive:
            return self.is_alive

        self.is_alive = self.p.poll() is None
        # We need to read the outputs to avoid
        # the process to hang if the output gets too long
        self._read()
        return self.is_alive

    def wait(self):
        """Poll (without sleeping) until the process has finished and
        return its return code."""
        while self.is_running():
            pass
        return self._returncode

    def write(self, s):
        """Write `s` to the stdin of the process."""
        self.p.stdin.write(s)

    def returncode(self):
        """Return the return code, or None if the process has not been
        seen to finish yet."""
        return self._returncode


def run_commands(*commands,
                 env=None,
                 stop_on_error=True,
                 stdout_handler=None,
                 stderr_handler=None):
    """Run shell commands in a single bash process and wait for completion.

    The commands are joined with ";". They are echoed by bash ("set -x")
    when debug logging is enabled, and bash stops at the first failing
    command ("set -e") when `stop_on_error` is set.

    Parameters:
        commands - the shell commands to run, in order
        env - environment for the process
        stop_on_error - make bash exit on the first failing command
        stdout_handler - called with each line of stdout
        stderr_handler - called with each line of stderr
    Raises:
        AssertionError - if the process exits with a non-zero return code
    """
    bash_flags = ""
    if logger.isEnabledFor(logging.DEBUG):
        bash_flags += "x"  # print commands
    if stop_on_error:
        bash_flags += "e"

    if bash_flags:
        bash_flags = "set -" + bash_flags + ";"

    logger.debug("Running: %s", commands)
    c = Process([bash_flags + ";".join(commands)],
                env=env,
                stdout_handler=stdout_handler,
                stderr_handler=stderr_handler)
    returncode = c.wait()

    # Raised explicitly rather than through an assert statement so the check
    # is not stripped when Python runs with -O; the exception type is the
    # one callers already get.
    if returncode != 0:
        raise AssertionError(f"Shell commands {commands} failed with "
                             f"return code {returncode}")


================================================
FILE: setup.cfg
================================================
[metadata]
license_files =
   License.txt
   poptorch_third_party_licenses.txt


================================================
FILE: setup.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import pathlib
import os
import sys
import logging
from setuptools import setup
from setuptools.dist import Distribution
from pybind11.setup_helpers import Pybind11Extension

logging.basicConfig(level=logging.INFO)

# NOTE(review): the "@...@" tokens in this file look like template
# placeholders substituted before this script is executed - confirm against
# the build scripts.

# torch{audio, vision} are added here to prevent the torch upgrade when other
# packages depend on torch{audio, vision}.
REQUIRES = [
    'tqdm', '@TORCH_DEPENDENCY@', '@TORCHAUDIO_DEPENDENCY@',
    '@TORCHVISION_DEPENDENCY@'
]
VERSION = "@VERSION@"

# Used both as the short and the long description of the package.
LONG_DESCRIPTION = (
    "PopTorch is a set of extensions for PyTorch enabling "
    "models to be trained, evaluated and used on the Graphcore IPU.")

# Glob patterns of the binary files shipped as 'poptorch' package data.
LIBS = ["*.so", "lib/*", "lib/poplar_rt/*", "lib/graphcore/lib/*.a"]


class BinaryDistribution(Distribution):
    """Distribution which always forces a binary package with platform name"""

    def has_ext_modules(self):
        """Always report extension modules, whatever the configuration."""
        return True


def get_torch_paths():
    """Return (include directories, library directories) of the installed
    torch package, or two empty lists if torch cannot be imported."""
    # setup.py is executed several times, so it's ok if torch is not always
    # available.
    try:
        import torch  # pylint: disable=import-outside-toplevel
    except ModuleNotFoundError:
        return [], []

    torch_root = str(pathlib.Path(torch.__file__).parent)
    include_root = os.path.join(torch_root, "include")
    include_dirs = [
        include_root,
        os.path.join(include_root, "torch", "csrc", "api", "include"),
    ]
    library_dirs = [os.path.join(torch_root, "lib")]
    return include_dirs, library_dirs


torch_include_dirs, torch_lib_dirs = get_torch_paths()

# NOTE: the value stored here is the LIBS list itself (not a copy), so the
# append below extends LIBS too.
package_data = {'poptorch': LIBS}

# Copy custom codelets into the package so that we can pre-compile them later.
package_data["poptorch"].append("*.inc.cpp")

core_mod = Pybind11Extension(
    "poptorch.poptorch_core",
    ["src/poptorch.cpp"],
    define_macros=[("_GLIBCXX_USE_CXX11_ABI", 0)],
    include_dirs=["include"] + torch_include_dirs,
    library_dirs=["poptorch/lib"] + torch_lib_dirs,
    extra_link_args=["-Wl,--rpath=$ORIGIN/lib:$ORIGIN"],
    libraries=[
        "poptorch",
        "popart_compiler",
        "poptorch_err",
        "poptorch_logging",
        "torch_python",
        "torch",
    ],
    language="c++",
    cxx_std="17",
)

# Same as pybind11_add_module but without stripping the symbols and setting
# the visibility to hidden.
# Source: https://pybind11.readthedocs.io/en/stable/compiling.html#advanced-interface-library-targets
#
# If the symbols are stripped then error messages will only contain symbol
# addresses instead of human readable names.
core_mod.extra_compile_args = [
    flag for flag in core_mod.extra_compile_args
    if not any(unwanted in flag for unwanted in ("visibility=hidden", "-g0"))
]

setup(
    name='poptorch',
    version=VERSION,
    description=LONG_DESCRIPTION,
    long_description=LONG_DESCRIPTION,
    long_description_content_type="text/markdown",
    url='http://graphcore.ai',
    author='Graphcore',
    author_email='contact@graphcore.ai',
    ext_modules=[core_mod],
    has_ext_modules=lambda: True,
    license='MIT License',
    license_files=('License.txt', 'poptorch_third_party_licenses.txt'),
    packages=['poptorch'],
    package_data=package_data,
    include_package_data=True,
    # Only allow the major.minor version of the Python interpreter that is
    # running this script.
    python_requires=f"=={sys.version_info.major}.{sys.version_info.minor}.*",
    platforms="@PLATFORM@",
    install_requires=REQUIRES,
    zip_safe=False,
    distclass=BinaryDistribution,
    classifiers=[
        'Intended Audience :: Developers',
        'Intended Audience :: Education',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Topic :: Scientific/Engineering',
        'Topic :: Scientific/Engineering :: Mathematics',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'Topic :: Software Development',
        'Topic :: Software Development :: Libraries',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
)


================================================
FILE: tests/.gitignore
================================================
.datasets


================================================
FILE: tests/CMakeLists.txt
================================================
# Sub-projects living next to the Python tests.
add_subdirectory(custom_ops)
add_subdirectory(cpp)

# Copy tests to the build folder if requested.
# TESTS_PATH is where the test generator below looks for the test files.
if(COPY_TESTS)
  # NOTE: Collapsing the hierarchy like this may cause conflicts.
  file(GLOB_RECURSE TEST_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.py")
  install(FILES ${TEST_FILES} DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
  set(TESTS_PATH "${CMAKE_CURRENT_BINARY_DIR}")
else()
  set(TESTS_PATH "${CMAKE_CURRENT_SOURCE_DIR}")
endif()

# Passed to the test generator below as --external-datasets-dir.
set(EXTERNAL_DATASETS_DIR "${CMAKE_BINARY_DIR}/buildenv/external_datasets")

# Generate tests.
# The generator is given the tests location and writes CTestTestfile.cmake
# in the current binary directory.
run_poptorch_install_command(
  "python3 ${CMAKE_CURRENT_SOURCE_DIR}/generate_test_file.py \
           ${TESTS_PATH} \
           ${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.cmake \
           --add-to-sys-path ${CMAKE_INSTALL_PREFIX} \
           --external-datasets-dir ${EXTERNAL_DATASETS_DIR} \
           --extra-pytest-args=\"${EXTRA_PYTEST_ARGS}\" "
  "${PROJECT_BINARY_DIR}"
  "generate_test_file.py")


================================================
FILE: tests/activations_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import pytest
import torch
import torch.nn as nn
import helpers
import poptorch

# pylint: enable=wrong-import-order

# Non-linear activations (Weighted activations)
# 'torch.nn.ELU', 'torch.nn.Hardshrink', 'torch.nn.Hardtanh', 'torch.nn.LeakyReLU', 'torch.nn.LogSigmoid', 'torch.nn.MultiheadAttention', 'torch.nn.MultiheadAttention.forward',
# 'torch.nn.PReLU', 'torch.nn.ReLU', 'torch.nn.ReLU6', 'torch.nn.RReLU', 'torch.nn.SELU', 'torch.nn.SiLU', 'torch.nn.CELU', 'torch.nn.GELU', 'torch.nn.Sigmoid', 'torch.nn.Softplus',
# 'torch.nn.Softshrink', 'torch.nn.Softsign', 'torch.nn.Tanh', 'torch.nn.Tanhshrink', 'torch.nn.Threshold',

# Non-linear activations (other)
# 'torch.nn.Softmin', 'torch.nn.Softmax', 'torch.nn.Softmax2d', 'torch.nn.LogSoftmax', 'torch.nn.AdaptiveLogSoftmaxWithLoss', 'torch.nn.AdaptiveLogSoftmaxWithLoss.log_prob',
# 'torch.nn.AdaptiveLogSoftmaxWithLoss.predict',


# A version of Softplus with non default arguments
class SoftplusWithParams(nn.Softplus):
    """Softplus constructed with explicit beta and threshold values."""
    BETA = 5.0
    THRESHOLD = 4.0

    def __init__(self):
        super().__init__(beta=self.BETA, threshold=self.THRESHOLD)


# Activation modules exercised by the parametrized tests below.
activation_functions = [
    nn.ReLU,
    nn.Tanh,
    nn.Sigmoid,
    nn.SELU,
    nn.SiLU,
    nn.ELU,
    nn.Softmax,
    nn.LogSoftmax,
    nn.Softsign,
    nn.LeakyReLU,
    nn.Hardtanh,
    nn.Softplus,
    nn.Softshrink,
    nn.Hardshrink,
    nn.CELU,
    nn.Hardsigmoid,
    nn.Hardswish,
    SoftplusWithParams,
]


@pytest.mark.parametrize("op", activation_functions)
def test_activations(op):
    """Compare CPU and IPU outputs of an activation in a training model."""
    torch.manual_seed(42)

    inputs = torch.randn([2, 20])

    # The softmax variants are constructed with an explicit dimension.
    needs_dim = op in (nn.Softmax, nn.LogSoftmax)
    activation = op(dim=1) if needs_dim else op()

    model = helpers.ModelWithWeights(activation, inputs.shape)
    model.train()

    # Run on CPU.
    native_out, _ = model((inputs, ))

    # Run on IPU.
    poptorch_model = poptorch.trainingModel(model)
    poptorch_out, _ = poptorch_model((inputs, ))

    # Check outputs
    helpers.assert_allclose(actual=poptorch_out,
                            expected=native_out,
                            rtol=1e-4,
                            atol=1e-7,
                            equal_nan=True)

    # Training test - check weights have changed
    poptorch_model.assert_weights_changed()


@pytest.mark.parametrize("approximate", ["tanh", "none"])
def test_gelu(approximate):
    """Compare CPU and IPU outputs of GELU in a training model."""
    if approximate == "none":
        pytest.skip("TODO: Implement efficient GELU_ERF")
    torch.manual_seed(42)

    inputs = torch.randn((2, 20))
    gelu = nn.GELU(approximate=approximate)

    model = helpers.ModelWithWeights(gelu, inputs.shape)
    model.train()

    # Run on CPU.
    native_out, _ = model((inputs, ))

    # Run on IPU.
    poptorch_model = poptorch.trainingModel(model)
    poptorch_out, _ = poptorch_model((inputs, ))

    # Check outputs
    helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    # Training test - check weights have changed
    poptorch_model.assert_weights_changed()


@pytest.mark.parametrize("input", [
    torch.randn((4, )),
    torch.randn((2, 2)),
    torch.randn((2, 8, 16)),
    torch.randn((2, 8, 32, 32))
])
def test_prelu(input):
    """Compare CPU and IPU outputs of PReLU for inputs of various ranks."""
    # One parameter per channel (dimension 1) when the input has at least
    # two dimensions, a single parameter otherwise.
    if input.dim() >= 2:
        num_channels = input.shape[1]
    else:
        num_channels = 1
    prelu = nn.PReLU(num_channels)

    # Run on CPU.
    native_out = prelu(input)

    # Run on IPU.
    ipu_prelu = poptorch.inferenceModel(prelu)
    poptorch_out = ipu_prelu(input)

    helpers.assert_allclose(actual=poptorch_out,
                            expected=native_out,
                            rtol=1e-4,
                            atol=1e-7,
                            equal_nan=True)


@pytest.mark.parametrize("dim", range(5))
def test_glu(dim):
    """Compare nn.GLU along each of the five input dimensions, IPU vs CPU."""
    torch.manual_seed(42)
    shape = (2, 4, 6, 8, 10)

    data = torch.randn(*shape)
    model = helpers.ModelWithWeights(nn.GLU(dim=dim), data.shape)

    # Reference run with native PyTorch.
    cpu_out, _ = model((data, ))

    # Same model through the PopTorch training wrapper.
    ipu_model = poptorch.trainingModel(model)
    ipu_out, _ = ipu_model((data, ))

    # Forward outputs must agree.
    helpers.assert_allclose(expected=cpu_out, actual=ipu_out)

    # Training check on the wrapped model's weights.
    ipu_model.assert_weights_changed()


@pytest.mark.parametrize("op", activation_functions)
def test_activation_numerics(op):
    """Check each activation on large inputs (10, 100, 1000), IPU vs CPU.

    Floating point exceptions are enabled on the IPU except for the
    activations listed below.
    """
    # These activations rely on exponentials that will overflow
    # but saturate to a linear function in the range where x >> 0,
    # so floating point exceptions are turned off for them.
    enable_exceptions = op not in (nn.SELU, nn.ELU, nn.CELU)

    if op in (nn.Softmax, nn.LogSoftmax):
        activation = op(dim=1)
    else:
        activation = op()
    x = torch.FloatTensor([[10., 100., 1000.]])
    cpu_out = activation(x)

    options = poptorch.Options()
    options.Precision.enableFloatingPointExceptions(enable_exceptions)
    ipu_model = poptorch.inferenceModel(activation, options=options)
    ipu_out = ipu_model(x)

    helpers.assert_allclose(actual=ipu_out, expected=cpu_out)


@pytest.mark.ipuHardwareRequired
@pytest.mark.filterwarnings("ignore:Trace had nondeterministic nodes")
@pytest.mark.filterwarnings("ignore:Output nr 1. of the traced function")
@pytest.mark.filterwarnings("ignore:Output nr 2. of the traced function")
def test_rrelu_training():
    """Check nn.RReLU in training mode, where negative inputs are randomised.

    Non-negative outputs are compared directly; negative outputs are only
    compared through their mean and variance, with loose tolerances.
    """
    opts = poptorch.Options().randomSeed(0)

    data = torch.randn([3000])

    model = helpers.ModelWithWeights(nn.RReLU(), data.shape)

    # In training, negative inputs are multiplied by a random parameter, so
    # only the non-negative outputs can be compared element-wise.
    cpu_out, _ = model((data, ))
    ipu_model = poptorch.trainingModel(model, options=opts)
    ipu_out, _ = ipu_model((data, ))

    cpu_non_negative = cpu_out[cpu_out >= 0]
    ipu_non_negative = ipu_out[ipu_out >= 0]
    helpers.assert_allclose(actual=ipu_non_negative,
                            expected=cpu_non_negative)

    # For the negative outputs compare distribution statistics instead.
    cpu_negative = cpu_out[cpu_out < 0]
    ipu_negative = ipu_out[ipu_out < 0]
    for statistic in (torch.mean, torch.var):
        helpers.assert_allclose(actual=statistic(ipu_negative),
                                expected=statistic(cpu_negative),
                                atol=0.1,
                                rtol=0.1)

    # Training check on the wrapped model's weights.
    ipu_model.assert_weights_changed()


def test_rrelu_inference():
    """Check nn.RReLU in eval mode, comparing IPU and CPU outputs directly."""
    torch.manual_seed(42)
    data = torch.randn([200])

    rrelu = nn.RReLU()

    # In inference there is no randomness, so results are compared directly.
    rrelu.eval()
    cpu_out = rrelu(data)
    ipu_out = poptorch.inferenceModel(rrelu)(data)
    helpers.assert_allclose(actual=ipu_out, expected=cpu_out)


================================================
FILE: tests/attach_detach_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import re
import math
import unittest.mock
import pytest
import torch
import helpers
import poptorch


@unittest.mock.patch.dict("os.environ", {"POPTORCH_WAIT_FOR_IPU": "0"})
@pytest.mark.ipuHardwareRequired
def test_attach_detach():
    """Alternate a training and an inference model on the same IPU.

    Both models are pinned to IPU 1, so the inference model can only be
    compiled/run while the training model is detached, and vice versa.
    """
    torch.manual_seed(42)

    target = torch.randint(0, 10, [1])
    target = target.expand([10])
    input = torch.randn(10, 10)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(10, 10)
            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, data, target=None):
            out = self.linear(data)

            # Without a target this is the inference path: no loss.
            if target is None:
                return out

            loss = self.loss(out, target)
            return out, loss

    model = Model()

    opts = poptorch.Options()
    # Ensure that both models use the same IPU
    opts.useIpuId(1)
    training = poptorch.trainingModel(model, options=opts)

    opts = opts.clone()
    inference = poptorch.inferenceModel(model, options=opts)

    _, initial_loss = training(input, target)

    if math.isnan(initial_loss):
        raise ValueError("original_loss is NaN")

    if poptorch.ipuHardwareIsAvailable():
        # The training model is still running on IPU 1 here, so compiling
        # the inference model is expected to fail. The message is checked
        # through `match=`: an assert placed inside the `with` block after
        # the raising call would never be executed.
        with pytest.raises(poptorch.Error, match="Failed to acquire"):
            inference.compile(torch.randn(10))

    training.detachFromDevice()

    # Detaching a second time must be reported as an error.
    error_msg = r"Device is not attached"
    with pytest.raises(poptorch.Error, match=error_msg):
        training.detachFromDevice()

    inference.compile(torch.randn(10))

    if poptorch.ipuHardwareIsAvailable():
        inference.detachFromDevice()

    assert initial_loss > 0.1

    loss = float('nan')

    for _ in range(2):
        _, loss = training(input, target)
        # Each batch should NOT report its own loss. As by default training
        # model should have a "Final" output mode.
        assert len(loss.size()) == 0

        if math.isnan(loss):
            raise ValueError("loss is NaN")

        # Hand the device over to the inference model and back again.
        training.detachFromDevice()

        inference(torch.randn(10))
        inference.detachFromDevice()


@pytest.mark.ipuHardwareRequired
def test_attach_detach_accuracy():
    """Check that training keeps making progress across detach/attach cycles.

    A small classifier is trained for 5 steps, used for inference, trained
    for another 100 steps and used for inference again; predictions and
    losses are then compared between the two phases.
    """

    class TrainingModelWithLoss(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.model = torch.nn.Linear(1, 2)
            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, args, loss_inputs=None):
            output = self.model(args)
            # No labels: inference path, return predictions only.
            if loss_inputs is None:
                return output
            return output, self.loss(output, loss_inputs)

    torch.manual_seed(42)

    input_data = torch.Tensor([[1.], [-1.]])
    labels_data = torch.Tensor([0, 1]).long()
    model_with_loss = TrainingModelWithLoss()
    sgd = poptorch.optim.SGD(model_with_loss.parameters(),
                             lr=0.1,
                             use_combined_accum=False)
    training_model = poptorch.trainingModel(model_with_loss, optimizer=sgd)
    inference_model = poptorch.inferenceModel(model_with_loss)

    # First, short training phase.
    first_losses = []
    for _ in range(5):
        _, loss = training_model(input_data, labels_data)
        print("Loss:", loss)
        first_losses.append(loss)

    training_model.detachFromDevice()
    first_predictions = inference_model(input_data)
    print("Predictions:", first_predictions)
    inference_model.detachFromDevice()

    # Second, longer training phase.
    second_losses = []
    for _ in range(100):
        _, loss = training_model(input_data, labels_data)
        print(loss)
        second_losses.append(loss)

    training_model.detachFromDevice()
    second_predictions = inference_model(input_data)
    print("Predictions:", second_predictions)

    assert not torch.allclose(first_predictions, second_predictions)
    assert not torch.allclose(second_predictions, torch.zeros(2, 2))
    assert first_losses[-1] > second_losses[-1]
    # Consecutive losses in the second phase must all differ.
    for current, following in zip(second_losses, second_losses[1:]):
        assert current != following
    assert second_losses[-1] < 0.1


@pytest.mark.ipuHardwareRequired
@unittest.mock.patch.dict("os.environ", {"POPTORCH_WAIT_FOR_IPU": "0"})
@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("TRACE")
def test_on_demand_attach(capfd):
    """Check the order of acquire/compile/attach log lines per connection type.

    Two models use ``ConnectionType.OnDemand`` and a third one uses
    ``ConnectionType.Always``; the captured log is inspected after each run.
    """
    model = torch.nn.Linear(1, 2)

    on_demand_opts = poptorch.Options()
    on_demand_opts.connectionType(poptorch.ConnectionType.OnDemand)

    data = torch.Tensor([[1.], [-1.]])

    first_model = poptorch.inferenceModel(model, on_demand_opts)
    first_model(data)
    log = helpers.LogChecker(capfd).createIterator()
    # We acquire device 0 to compile. (It's the first device with a matching target)
    log.findNext(re.escape("Acquired 1 IPU(s): running on device Id 0"))
    # Make sure we compile before we attach to the device.
    log.findNext("Finished Poplar compilation")
    # Device 0 is still free so we'll attach to it.
    log.findNext("Attached to device 0")

    second_model = poptorch.inferenceModel(model, on_demand_opts)
    second_model(data)
    log = helpers.LogChecker(capfd).createIterator()
    # We acquire device 0 to compile. (It's the first device with a matching target)
    # Note: acquiring doesn't mean attaching, it's ok if the device is not actually free.
    log.findNext(re.escape("Acquired 1 IPU(s): running on device Id 0"))
    # Make sure we compile before we attach to the device.
    log.findNext("Finished Poplar compilation")
    # Device 0 is in use by the first model so we should automatically get device 1.
    log.findNext("Attached to device 1")

    always_opts = on_demand_opts.clone()
    always_opts.connectionType(poptorch.ConnectionType.Always)
    third_model = poptorch.inferenceModel(model, always_opts)
    third_model(data)
    log = helpers.LogChecker(capfd).createIterator()
    # In Always mode we find a free IPU before the compilation and attach to it immediately.
    log.findNext(re.escape("Acquired 1 IPU(s): running on device Id 2"))
    # Devices 0 & 1 are in use so we'll get device 2.
    log.findNext("Attached to device 2")
    log.findNext("Finished Poplar compilation")


@pytest.mark.ipuHardwareRequired
def test_attach_detach_tied_weights():
    """Swap a training and an inference model with tied weights on and off the device.

    Both wrappers share one model in which two linear layers use the same
    weight parameter; the test repeatedly detaches one and attaches the other.
    """
    torch.manual_seed(42)

    data = torch.randn(10, 10)

    class Model(torch.nn.Module):
        def __init__(self, inp=10, out=100):
            super().__init__()
            self.encoder = torch.nn.Linear(inp, out, bias=False)
            self.encoder_tied = torch.nn.Linear(inp, out, bias=False)
            # Tie the second layer's weight to the first one.
            self.encoder_tied.weight = self.encoder.weight

        def forward(self, data):
            encoded = self.encoder(data) + self.encoder_tied(data)
            decoded = torch.nn.functional.linear(encoded,
                                                 self.encoder.weight.t())
            if not self.training:
                return decoded
            return decoded, poptorch.identity_loss(decoded, reduction="mean")

    model = Model()
    model.train()

    adamw = torch.optim.AdamW(model.parameters(), lr=0.001)
    training_model = poptorch.trainingModel(model,
                                            options=poptorch.Options(),
                                            optimizer=adamw)
    training_model.compile(data)
    training_model.detachFromDevice()

    model.eval()
    inference_model = poptorch.inferenceModel(model,
                                              options=poptorch.Options())
    inference_model.compile(data)

    # Hand the device back and forth between the two wrappers.
    for _ in range(5):
        inference_model.detachFromDevice()
        training_model.attachToDevice()
        training_model.detachFromDevice()
        inference_model.attachToDevice()


================================================
FILE: tests/attach_detach_wait_for_ipu_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import math
import os
import time
import unittest.mock
import pytest
import torch
import torch.multiprocessing as mp
import helpers
import poptorch


def inference_process(event):
    """Compile an inference model on IPU 1, signal *event*, hold for 12 s, then detach.

    Intended to run in a separate process; it requires the
    POPTORCH_WAIT_FOR_IPU environment variable to be set.
    """
    assert os.environ.get('POPTORCH_WAIT_FOR_IPU') is not None

    torch.manual_seed(42)

    # Not used below.
    target = torch.randint(0, 10, [1])
    target = target.expand([10])

    linear = torch.nn.Linear(10, 10)

    options = poptorch.Options()
    # Ensure that both models use the same IPU
    options.useIpuId(1)

    inference = poptorch.inferenceModel(linear, options=options)
    inference.compile(torch.randn(10))
    # Tell the waiting side that compilation has finished.
    event.set()
    time.sleep(12)
    inference.detachFromDevice()


@helpers.printCapfdOnExit
@unittest.mock.patch.dict("os.environ", {"POPTORCH_WAIT_FOR_IPU": "1"})
@pytest.mark.ipuHardwareRequired
@helpers.overridePoptorchLogLevel("TRACE")
def test_attach_detach_wait_for_ipu(capfd):
    """Check that a model waits for a busy IPU when POPTORCH_WAIT_FOR_IPU=1.

    A spawned process compiles an inference model on IPU 1 and holds it for
    a while; this process then runs a training model pinned to the same IPU
    and checks that the "sleeping" message was logged.
    """
    torch.manual_seed(42)

    target = torch.randint(0, 10, [1])
    target = target.expand([10])
    data = torch.randn(10, 10)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(10, 10)
            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, data, target):
            out = self.linear(data)
            return out, self.loss(out, target)

    model = Model()

    opts = poptorch.Options()
    # Ensure that both models use the same IPU
    opts.useIpuId(1)

    poptorch_model = poptorch.trainingModel(model, options=opts)

    spawn_ctx = mp.get_context('spawn')
    manager = mp.Manager()
    compiled_event = manager.Event()
    child = spawn_ctx.Process(target=inference_process,
                              args=(compiled_event, ))

    child.start()
    # Block until the child has signalled the event.
    compiled_event.wait()
    _, initial_loss = poptorch_model(data, target)
    child.join()

    if math.isnan(initial_loss):
        raise ValueError("original_loss is NaN")

    poptorch_model.detachFromDevice()
    helpers.LogChecker(capfd).assert_contains("No IPU available, sleeping")


================================================
FILE: tests/batching_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import torch
import pytest
import helpers
import poptorch


def test_inferenceBatching():
    """Run a Linear layer with deviceIterations(10) and compare with the CPU."""
    torch.manual_seed(42)

    linear = torch.nn.Linear(6, 20)

    # Actually batched by 100.
    data = torch.randn([10, 1, 5, 6])

    # Run pytorch native on CPU batchsize 10.
    cpu_out = linear(data)

    # Run on IPU batch size 1 * 10 popart batches.
    options = poptorch.Options().deviceIterations(10)
    ipu_out = poptorch.inferenceModel(linear, options)(data)

    # Check that inference wrapper has defaulted to "All".
    assert len(ipu_out.size()) == 4
    assert ipu_out.size()[0] == 10
    helpers.assert_allclose(expected=cpu_out, actual=ipu_out)


def test_trainingBatching():
    """Train a classifier with deviceIterations(10) until it predicts the label.

    The native model is evaluated before and after training through the
    PopTorch wrapper.
    """
    torch.manual_seed(4424242)

    # 10 Batches of 10.
    data = torch.randn(10, 10)

    # 10 batches of 1
    label = torch.randint(0, 10, [1])
    label = label.expand([10])

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(10, 10)
            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, data, target):
            out = self.linear(data)
            return out, self.loss(out, target)

    model = Model()

    # Run on IPU batch size 1 * 10 popart batches.
    options = poptorch.Options().deviceIterations(10)
    poptorch_model = poptorch.trainingModel(model, options=options)

    # Run all 10 batches as batchsize 10.
    untrained_out, _ = model(data, label)

    # Sanity check we weren't already matching the label.
    assert not torch.equal(torch.argmax(untrained_out, dim=1), label)

    for _ in range(1000):
        _, loss = poptorch_model(data, label)

        # Each batch should NOT report its own loss. As by default training model should have a "Final" output mode.
        assert len(loss.size()) == 0

    # Run with trained weights.
    trained_out, _ = model(data, label)

    # Check we are now equal with labels.
    helpers.assert_allequal(actual=torch.argmax(trained_out, dim=1),
                            expected=label)


@pytest.mark.parametrize("mode", list(poptorch.OutputMode))
def test_inferenceOutputModes(mode):
    """Check the shape and content of inference outputs for every OutputMode."""
    torch.manual_seed(42)

    linear = torch.nn.Linear(6, 20)

    # Actually batched by 100.
    data = torch.randn([10, 1, 5, 6])

    # Run pytorch native on CPU batchsize 10.
    cpu_out = linear(data)

    # Run on IPU batch size 1 * 10 popart batches. output_return_period ignored if not EVERYN
    options = poptorch.Options().deviceIterations(10)
    options.outputMode(mode, output_return_period=5)
    ipu_out = poptorch.inferenceModel(linear, options)(data)

    if mode in (poptorch.OutputMode.All, poptorch.OutputMode.Default):
        # Expect the full batch.
        assert len(ipu_out.size()) == 4
        assert ipu_out.size()[0] == 10
        helpers.assert_allclose(expected=cpu_out, actual=ipu_out)
    elif mode == poptorch.OutputMode.EveryN:
        # Otherwise we are expecting device_iterations / N
        assert len(ipu_out.size()) == 4
        assert ipu_out.size()[0] == 2

        # Check each N is the correct batch
        helpers.assert_allclose(actual=ipu_out[0], expected=cpu_out[4])
        helpers.assert_allclose(actual=ipu_out[1], expected=cpu_out[9])

    else:
        # Otherwise we are expecting just one element per batch.
        assert len(ipu_out.size()) == 4
        assert ipu_out.size()[0] == 1

        if mode == poptorch.OutputMode.Final:
            # Check we are the same as the last output.
            last_cpu_out = cpu_out[-1]
            helpers.assert_allclose(actual=ipu_out.reshape(
                last_cpu_out.shape),
                                    expected=last_cpu_out)
        elif mode == poptorch.OutputMode.Sum:
            # Check we are close to the sum of the batch dim.
            summed = torch.sum(cpu_out, dim=0, keepdim=True)
            helpers.assert_allclose(actual=ipu_out, expected=summed)
        else:
            assert False, "Unexpected output mode %s" % mode


@pytest.mark.parametrize("mode", list(poptorch.OutputMode))
def test_trainingOutputModes(mode):
    """Check the shape and content of training outputs for every OutputMode."""
    torch.manual_seed(42)

    # 1000 Batches of 10.
    data = torch.randn(1000, 10)

    # 1000 batches of 1
    label = torch.randint(0, 10, [1])
    label = label.expand([1000])

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(10, 10)
            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, data, target):
            out = self.linear(data)
            return out, self.loss(out, target)

    model = Model()

    # Run pytorch native on CPU batchsize 10.
    model(data, label)

    # Run on IPU batch size 1 * 1000 popart batches.
    options = poptorch.Options().deviceIterations(1000)
    options.outputMode(mode, output_return_period=20)

    poptorch_model = poptorch.trainingModel(model, options=options)

    poptorch_out, loss = poptorch_model(data, label)

    if mode == poptorch.OutputMode.All:
        # Expect the full batch.
        assert len(poptorch_out.size()) == 2
        assert poptorch_out.size()[0] == 1000

        assert len(loss.size()) == 1
        assert loss.size()[0] == 1000

        # Check the rolling average loss is downward sloped: the mean of
        # each window of 100 losses must be below the previous window's.
        interval = 100
        window_means = [
            torch.mean(loss[start:start + interval])
            for start in range(0, 1000, interval)
        ]
        for earlier, later in zip(window_means, window_means[1:]):
            assert later < earlier

    elif mode == poptorch.OutputMode.EveryN:
        # Otherwise we are expecting device_iterations / N
        assert len(poptorch_out.size()) == 2
        assert poptorch_out.size()[0] == 50

        # There's too much noise in the losses for us to test directly without averaging like above so just test sizes.
        assert len(loss.size()) == 1
        assert loss.size()[0] == 50
    else:
        # Otherwise we are expecting just one element per batch.
        assert len(poptorch_out.size()) == 2
        assert poptorch_out.size()[0] == 1

        assert len(loss.size()) == 0

        if mode in (poptorch.OutputMode.Final, poptorch.OutputMode.Default):
            # We just have to check the loss is small.
            # This is just relative to the previously observed loss values on this test with this seed.
            assert loss < 0.2

        elif mode == poptorch.OutputMode.Sum:
            # We just have to check that the loss is huge.
            assert loss > 500.0
        else:
            assert False, "Unexpected output mode %s" % mode


def run_gradient_accumulation_test(input, target, gradient_accumulations,
                                   accumulation_reduction_type, lr):
    """Train a small linear model for 10 steps and return its weight matrix.

    Args:
        input: Tensor passed to the model as data.
        target: Tensor passed to the L1 loss as target.
        gradient_accumulations: Value given to
            ``Options.Training.gradientAccumulation``.
        accumulation_reduction_type: Reduction type to set, or ``None`` to
            keep the PopTorch default.
        lr: Learning rate of the SGD optimizer.

    Returns:
        The ``linear.weight.data`` tensor read from the training wrapper.
    """
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(10, 10)
            self.loss = torch.nn.L1Loss(reduction="mean")

        def forward(self, data, target):
            out = self.linear(data)
            return out, self.loss(out, target)

    model = Model()

    options = poptorch.Options()
    options.outputMode(poptorch.OutputMode.All)
    options.Training.gradientAccumulation(gradient_accumulations)

    # Only override the reduction type when one was requested.
    if accumulation_reduction_type is not None:
        options.Training.accumulationAndReplicationReductionType(
            accumulation_reduction_type)

    sgd = torch.optim.SGD(model.parameters(), lr=lr)
    poptorch_model = poptorch.trainingModel(model,
                                            options=options,
                                            optimizer=sgd)

    # Run 10 training steps
    for _ in range(10):
        poptorch_model(input, target)

    # return trained weight matrix
    return poptorch_model.linear.weight.data


def test_gradient_accumulation_training():
    """Check that gradient accumulation 1 and 2 produce the same weights.

    Covers the Mean reduction, the default reduction (expected to behave
    like Mean) and the Sum reduction, where the learning rate is halved for
    the run that accumulates two gradients.
    """
    torch.manual_seed(42)

    target = torch.randn(4, 10)
    input = torch.randn(4, 10)

    def trained_weights(gradient_accumulations, reduction_type, lr):
        # Keyword arguments keep `input` and `target` bound to the helper
        # parameters of the same name (they were previously passed swapped).
        return run_gradient_accumulation_test(
            input=input,
            target=target,
            gradient_accumulations=gradient_accumulations,
            accumulation_reduction_type=reduction_type,
            lr=lr)

    # Testing gradient accumulations 1 vs 2 and Mean reduction
    w_with_1 = trained_weights(1, poptorch.ReductionType.Mean, 0.01)
    w_with_2 = trained_weights(2, poptorch.ReductionType.Mean, 0.01)
    helpers.assert_allclose(actual=w_with_1, expected=w_with_2)

    # Test the default matches as well (i.e. the default is mean)
    w_with_2 = trained_weights(2, None, 0.01)
    helpers.assert_allclose(actual=w_with_1, expected=w_with_2)

    # Testing gradient accumulations 1 vs 2 and Sum reduction (different lr)
    w_with_1 = trained_weights(1, poptorch.ReductionType.Sum, 0.02)
    w_with_2 = trained_weights(2, poptorch.ReductionType.Sum, 0.01)
    helpers.assert_allclose(actual=w_with_1, expected=w_with_2)


class FourBlockModel(torch.nn.Module):
    """Four chained Linear(1, 1) layers, each in its own poptorch.Block scope.

    Blocks "B1" to "B4" are assigned to ipu_id 0 to 3 respectively.
    """

    def __init__(self):
        super().__init__()
        self.lin1 = torch.nn.Linear(1, 1)
        self.lin2 = torch.nn.Linear(1, 1)
        self.lin3 = torch.nn.Linear(1, 1)
        self.lin4 = torch.nn.Linear(1, 1)

    def forward(self, x):
        """Apply lin1..lin4 in sequence, one block per layer."""
        with poptorch.Block("B1", ipu_id=0):
            stage1 = self.lin1(x)
        with poptorch.Block("B2", ipu_id=1):
            stage2 = self.lin2(stage1)
        with poptorch.Block("B3", ipu_id=2):
            stage3 = self.lin3(stage2)
        with poptorch.Block("B4", ipu_id=3):
            stage4 = self.lin4(stage3)

        return stage4


class FourBlockModelNoScope(torch.nn.Module):
    """Four chained Linear(1, 1) layers, split with poptorch.Block.start().

    Same layers and block names/ids as the scoped variant, but each block is
    opened with an explicit ``Block.start`` call instead of a ``with`` scope.
    """

    def __init__(self):
        super().__init__()
        self.lin1 = torch.nn.Linear(1, 1)
        self.lin2 = torch.nn.Linear(1, 1)
        self.lin3 = torch.nn.Linear(1, 1)
        self.lin4 = torch.nn.Linear(1, 1)

    def forward(self, x):
        """Apply lin1..lin4 in sequence, starting a new block before each."""
        poptorch.Block.start("B1", ipu_id=0)
        stage1 = self.lin1(x)
        poptorch.Block.start("B2", ipu_id=1)
        stage2 = self.lin2(stage1)
        poptorch.Block.start("B3", ipu_id=2)
        stage3 = self.lin3(stage2)
        poptorch.Block.start("B4", ipu_id=3)
        stage4 = self.lin4(stage3)

        return stage4


@pytest.mark.parametrize("num_grad_accums", (4, 5, 7))
@pytest.mark.parametrize("device_iterations", (1, 2))
def test_gradient_accumulation_pipelined_training(num_grad_accums,
                                                  device_iterations):
    """Check the minimum gradient accumulation for a 4-block training model.

    With 4 or 5 accumulations the run is expected to fail with an error
    mentioning 7 pipeline stages; with 7 it is expected to succeed.
    """

    class TrainingFourBlockModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.four_block = FourBlockModel()

        def forward(self, x):
            out = self.four_block(x)
            # The loss is placed in the same block as the last layer.
            with poptorch.Block("B4", ipu_id=3):
                loss = poptorch.identity_loss(out, reduction="mean")

            return out, loss

    model = TrainingFourBlockModel()
    options = poptorch.Options()
    options.deviceIterations(device_iterations)
    options.Training.gradientAccumulation(num_grad_accums)

    poptorch_model = poptorch.trainingModel(model, options=options)

    batch = torch.zeros(num_grad_accums * device_iterations)

    if num_grad_accums not in (4, 5):
        poptorch_model(batch)
        return

    err_msg = (r"poptorch\.Options\(\)\.Training\.gradientAccumulation "
               r"must be greater than or equal to the number of pipeline"
               r" stages \(7\) when using poptorch\.PipelinedExecution\. "
               r"Please note that a model with 4 pipeline stages in "
               r"PopTorch will have an additional 3 stages when training.")

    with pytest.raises(poptorch.poptorch_core.Error, match=err_msg):
        poptorch_model(batch)


@pytest.mark.parametrize("pipelined", [True, False])
@pytest.mark.parametrize("Model", [FourBlockModel, FourBlockModelNoScope])
def test_gradient_accumulation_inference(pipelined, Model):
    """Check that gradientAccumulation(2) is rejected by inferenceModel().

    The expected message gains an extra sentence when the execution
    strategy is pipelined.
    """
    model = Model()
    options = poptorch.Options()

    if not pipelined:
        options.setExecutionStrategy(poptorch.ShardedExecution())
    else:
        # Nothing to configure: the options are expected to already use
        # pipelined execution.
        # pylint: disable=protected-access
        assert isinstance(options._execution_strategy,
                          poptorch.PipelinedExecution)

    options.Training.gradientAccumulation(2)

    err_msg = (r"You must set "
               r"poptorch\.Options\(\)\.Training\.gradientAccumulation to 1 "
               r"or leave it as its default value \(1\) when running a "
               r"poptorch\.inferenceModel\(\)\.")

    if pipelined:
        # NOTE(review): the "()" after deviceIterations is not escaped, so
        # as a regex it is an empty group and this matches the text
        # "deviceIterations to" - confirm that is the intended wording.
        err_msg += (r" Use poptorch\.Options\(\)\.deviceIterations() to "
                    r"process a sufficient number of batches each run for "
                    r"pipelined execution instead.")

    with pytest.raises(poptorch.poptorch_core.Error, match=err_msg):
        poptorch.inferenceModel(model, options=options)


@pytest.mark.parametrize("pipelined", [True, False])
@pytest.mark.parametrize("device_iterations", (2, 4))
@pytest.mark.parametrize("Model", [FourBlockModel, FourBlockModelNoScope])
def test_device_iterations_inference(pipelined, device_iterations, Model):
    """Check the minimum deviceIterations for a 4-block inference model.

    Pipelined execution with only 2 device iterations is expected to raise;
    every other combination is expected to run.
    """
    model = Model()
    options = poptorch.Options()

    if not pipelined:
        options.setExecutionStrategy(poptorch.ShardedExecution())
    else:
        # Nothing to configure: the options are expected to already use
        # pipelined execution.
        # pylint: disable=protected-access
        assert isinstance(options._execution_strategy,
                          poptorch.PipelinedExecution)

    options.deviceIterations(device_iterations)

    poptorch_model = poptorch.inferenceModel(model, options=options)

    batch = torch.zeros(device_iterations)

    if not (pipelined and device_iterations == 2):
        poptorch_model(batch)
        return

    err_msg = (r"poptorch\.Options\(\)\.deviceIterations must be greater")
    with pytest.raises(poptorch.poptorch_core.Error, match=err_msg):
        poptorch_model(batch)


================================================
FILE: tests/bert_small_and_medium_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import transformers
import torch
import helpers
import poptorch


def test_bert_small():
    """Run a pretrained small BERT on IPU and compare every output with the CPU."""
    torch.manual_seed(42)

    # Bert small.
    pretrained_weights = 'mrm8488/bert-small-finetuned-squadv2'
    bert = transformers.BertModel.from_pretrained(pretrained_weights,
                                                  torchscript=True)
    tokenizer = transformers.BertTokenizer.from_pretrained(pretrained_weights)

    # It *just* fits on one IPU but if the sequence length is too big it will need two.
    input_ids = torch.tensor([tokenizer.encode("E")])

    inference_model = poptorch.inferenceModel(bert, poptorch.Options())
    ipu_outputs = inference_model(input_ids)

    cpu_outputs = bert(input_ids)

    # Outputs are compared pairwise with a loose tolerance.
    for ipu_result, cpu_result in zip(ipu_outputs, cpu_outputs):
        helpers.assert_allclose(actual=ipu_result,
                                expected=cpu_result,
                                rtol=1e-02,
                                atol=1e-02)


def test_bert_small_half():
    """Run a pretrained small BERT converted to half precision on IPU.

    Only the dtype of the first output is checked.
    """
    torch.manual_seed(42)

    # Bert small.
    pretrained_weights = 'mrm8488/bert-small-finetuned-squadv2'
    bert = transformers.BertModel.from_pretrained(pretrained_weights,
                                                  torchscript=True)
    tokenizer = transformers.BertTokenizer.from_pretrained(pretrained_weights)

    # It *just* fits on one IPU but if the sequence length is too big it will need two.
    input_ids = torch.tensor([tokenizer.encode("E")])

    bert.half()
    inference_model = poptorch.inferenceModel(bert, poptorch.Options())
    ipu_outputs = inference_model(input_ids)

    # Just check that we compile for now.
    assert ipu_outputs[0].dtype == torch.half


def test_bert_medium_result():
    """Run a pretrained BERT question-answering model on IPU and check the answer.

    The start/end scores are compared with the CPU, the predicted start and
    end positions must be the same on both, and the decoded answer must be
    'edinburgh'.
    """
    torch.manual_seed(42)

    pretrained_weights = 'mrm8488/bert-medium-finetuned-squadv2'

    # For later versions of transformers, we need to wrap the model and set
    # return_dict to False
    class WrappedModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            transformers_BFQA = transformers.BertForQuestionAnswering
            self.wrapped = transformers_BFQA.from_pretrained(
                pretrained_weights)

        def forward(self, input_ids, attention_mask):
            return self.wrapped.forward(input_ids,
                                        attention_mask,
                                        return_dict=False)

        def __getattr__(self, attr):
            # Fall back to the wrapped model for attributes the wrapper
            # itself does not have (e.g. `bert`).
            try:
                return torch.nn.Module.__getattr__(self, attr)
            except AttributeError:
                return getattr(self.wrapped, attr)

    model = WrappedModel()

    tokenizer = transformers.BertTokenizer.from_pretrained(
        pretrained_weights, return_token_type_ids=True)

    context = """Edinburgh is Scotland's compact, hilly capital."""
    question = "What is the capital of Scotland?"
    encoding = tokenizer.encode_plus(question, context)

    mask = encoding["attention_mask"]
    ins = encoding["input_ids"]
    # The same encoded question is duplicated to form two inputs.
    input_ids = torch.tensor([ins, ins])

    attention_mask = torch.tensor([mask, mask])
    start_scores_native, end_scores_native = model(
        input_ids, attention_mask=attention_mask)

    opts = poptorch.Options()
    opts.deviceIterations(2)

    model.bert.embeddings.position_embeddings = poptorch.BeginBlock(
        model.bert.embeddings.position_embeddings, ipu_id=1)

    inference_model = poptorch.inferenceModel(model, opts)
    start_score_pop, end_scores_pop = inference_model(input_ids,
                                                      attention_mask)

    # Longer sequences begin to accumulate more floating point error.
    helpers.assert_allclose(expected=start_scores_native,
                            actual=start_score_pop,
                            rtol=1e-02,
                            atol=1e-02)
    helpers.assert_allclose(expected=end_scores_native,
                            actual=end_scores_pop,
                            rtol=1e-02,
                            atol=1e-02)

    # The predicted start/end token positions must match for every input.
    # (`assert a, b` would only test the truthiness of `a`.)
    # Assumes the scores are (batch, sequence) - TODO confirm.
    helpers.assert_allequal(actual=torch.argmax(start_score_pop, dim=1),
                            expected=torch.argmax(start_scores_native, dim=1))
    helpers.assert_allequal(actual=torch.argmax(end_scores_pop, dim=1),
                            expected=torch.argmax(end_scores_native, dim=1))

    # Convert to string (Only check the first result as we've already established the two were identical)
    answer_start = torch.argmax(start_score_pop[0])
    answer_end = torch.argmax(end_scores_pop[0]) + 1
    ans_tokens = ins[answer_start:answer_end]
    answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens)

    answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)

    assert answer_tokens_to_string == 'edinburgh'


================================================
FILE: tests/blas_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import pytest
import torch
import helpers
import poptorch


def blas_op(op, input1, input2, out, atol=1e-04, rtol=1e-04):
    """Run a two-operand op on the CPU and on the IPU and compare results.

    ``op`` is invoked as ``op(x, y, out=out)``. The CPU reference run is
    skipped when either input is half precision; nothing is compared in
    that case. When ``out`` is given it is passed to both runs and is also
    compared against the CPU result.
    """

    class Model(torch.nn.Module):
        def __init__(self, op):
            super().__init__()
            self.op = op

        def forward(self, x, y, out=None):
            return self.op(x, y, out=out)

    model = Model(op)
    args = [input1, input2] if out is None else [input1, input2, out]

    # Matmul fp16 is not supported on the CPU
    uses_half = torch.half in (input1.dtype, input2.dtype)
    # Run on CPU (when possible).
    native_out = None if uses_half else model(*args)

    # Run on IPU.
    poptorch_model = poptorch.inferenceModel(model)
    poptorch_out = poptorch_model(*args)

    if native_out is None:
        # No reference available: nothing to compare against.
        return

    tolerances = dict(atol=atol, rtol=rtol, equal_nan=True)
    helpers.assert_allclose(expected=native_out,
                            actual=poptorch_out,
                            **tolerances)
    if out is not None:
        helpers.assert_allclose(expected=native_out,
                                actual=out,
                                **tolerances)


@pytest.mark.parametrize("out", [True, False])
@pytest.mark.parametrize("shapes", [([10, 200], [200, 45], [10, 45]),
                                    ([10, 200], [200], [10]),
                                    ([200], [200, 45], [1, 45]),
                                    ([200], [200], [])])
def test_matmul(out, shapes):
    """Check torch.matmul for several operand ranks, with and without out.

    ``shapes`` holds the shapes of the two operands followed by the shape
    used to allocate the optional ``out`` tensor.
    """
    torch.manual_seed(42)

    lhs_shape, rhs_shape, out_shape = shapes

    both_vectors = len(lhs_shape) == 1 and len(rhs_shape) == 1
    if both_vectors and out:
        pytest.skip(
            "TODO(T71439) No shape inference handler for aten::fill_.Tensor")

    lhs = torch.randn(lhs_shape)
    rhs = torch.randn(rhs_shape)
    out_tensor = torch.randn(out_shape) if out else None

    blas_op(torch.matmul, lhs, rhs, out_tensor)


@pytest.mark.parametrize("mode",
                         (poptorch.MatMulSerializationMode.InputChannels,
                          poptorch.MatMulSerializationMode.ReducingDim,
                          poptorch.MatMulSerializationMode.OutputChannels,
                          poptorch.MatMulSerializationMode.Disabled))
@pytest.mark.parametrize("factor", (2, 5, 10))
@pytest.mark.parametrize("keep_precision", [True, False])
def test_serializedMatMul(mode, factor, keep_precision):
    """Run poptorch.serializedMatMul for each mode / factor combination.

    With ``keep_precision`` the operands are converted to half precision
    and looser tolerances are passed to ``blas_op``.
    """
    torch.manual_seed(42)

    lhs = torch.rand(1, 10, 200)

    out_channels = 45
    if mode == poptorch.MatMulSerializationMode.OutputChannels:
        # Round down so that the value is a multiple of factor
        out_channels -= out_channels % factor

    rhs = torch.rand(200, out_channels)

    def serialised_matmul_op(input, other, out):
        assert out is None
        return poptorch.serializedMatMul(input, other, mode, factor,
                                         keep_precision)

    if not keep_precision:
        blas_op(serialised_matmul_op, lhs, rhs, None)
        return

    blas_op(serialised_matmul_op,
            lhs.half(),
            rhs.half(),
            None,
            rtol=0.01,
            atol=0.05)


@pytest.mark.parametrize("optional_out", [True, False])
def test_bmm(optional_out):
    """Check torch.bmm on a batch of 12 matrices, optionally with out."""
    batch, rows, inner, cols = 12, 10, 200, 33

    lhs = torch.randn([batch, rows, inner])
    rhs = torch.randn([batch, inner, cols])

    out = None
    if optional_out:
        out = torch.randn([batch, rows, cols])

    blas_op(torch.bmm, lhs, rhs, out)


@pytest.mark.parametrize("bias", [True, False])
def test_matmul_training(bias):
    """Train a linear + matmul model and compare the IPU and native runs.

    Each of the 400 iterations runs the PopTorch training model, then the
    native model followed by a backward pass and an optimizer step. The
    output and loss of the last iteration are compared.
    """
    batch, rows, features, num_classes = 100, 9, 7, 5

    class Net(torch.nn.Module):
        def __init__(self):
            super().__init__()
            torch.manual_seed(42)
            self.linear = torch.nn.Linear(features, features, bias=bias)
            self.softmax = torch.nn.LogSoftmax(dim=1)
            self.loss = torch.nn.L1Loss(reduction="mean")

        def forward(self, x, y, target):
            projected = torch.matmul(self.linear(x), y)
            return projected, self.loss(projected, target)

    torch.manual_seed(42)
    model = Net()

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    torch.manual_seed(42)
    poptorch_model = poptorch.trainingModel(model, optimizer=optimizer)

    x = torch.randn(batch, rows, features)
    y = torch.randn(features, features)
    target = torch.empty(batch, rows, features,
                         dtype=torch.long).random_(0, num_classes)

    for _ in range(400):
        optimizer.zero_grad()
        poptorch_out, poptorch_loss = poptorch_model(x, y, target)
        native_out, native_loss = model(x, y, target)
        native_loss.backward(retain_graph=True)
        optimizer.step()

    comparisons = (
        (poptorch_out, native_out, 1e-02),
        (poptorch_loss, native_loss, 1e-03),
    )
    for actual, expected, tolerance in comparisons:
        helpers.assert_allclose(actual=actual,
                                expected=expected,
                                rtol=tolerance,
                                atol=tolerance)


@pytest.mark.parametrize(
    "params",
    [
        # input_shape, beta, alpha
        ((3, 7), 1.0, 1.0),
        ((3, 1), 1.0, 0.75),
        ((1, 7), 0.75, 1.0),
        ((1), 0.75, 0.75),
    ])
def test_addmm(params):
    """Compare torch.addmm on the CPU and the IPU.

    The first operand takes several shapes while the matrix operands are
    fixed at (3, 5) and (5, 7).
    """
    torch.manual_seed(42)

    input_shape, beta, alpha = params

    addend = torch.randn(input_shape)
    mat1 = torch.randn(3, 5)
    mat2 = torch.randn(5, 7)

    class AddmmModel(torch.nn.Module):
        def __init__(self, beta, alpha):
            super().__init__()
            self.beta = beta
            self.alpha = alpha

        def forward(self, x1, x2, x3):
            scaling = dict(beta=self.beta, alpha=self.alpha)
            return torch.addmm(x1, x2, x3, **scaling)

    model = AddmmModel(beta, alpha)
    operands = (addend, mat1, mat2)

    cpu_result = model(*operands)
    ipu_result = poptorch.inferenceModel(model)(*operands)

    helpers.assert_allclose(expected=cpu_result, actual=ipu_result)


@pytest.mark.parametrize(
    "params",
    [
        # input_shape, beta, alpha
        ((3, 7), 1.0, 1.0),
        ((3, 1), 1.0, 0.75),
        ((1, 7), 0.75, 1.0),
        ((1), 0.75, 0.75),
    ])
def test_baddbmm(params):
    """Compare torch.baddbmm on the CPU and the IPU.

    The first operand takes several shapes while the batched matrix
    operands are fixed at (2, 3, 5) and (2, 5, 7).
    """
    torch.manual_seed(42)

    input_shape, beta, alpha = params

    addend = torch.randn(input_shape)
    batch1 = torch.randn(2, 3, 5)
    batch2 = torch.randn(2, 5, 7)

    class BaddbmmModel(torch.nn.Module):
        def __init__(self, beta, alpha):
            super().__init__()
            self.beta = beta
            self.alpha = alpha

        def forward(self, x1, x2, x3):
            scaling = dict(beta=self.beta, alpha=self.alpha)
            return torch.baddbmm(x1, x2, x3, **scaling)

    model = BaddbmmModel(beta, alpha)
    operands = (addend, batch1, batch2)

    cpu_result = model(*operands)
    ipu_result = poptorch.inferenceModel(model)(*operands)

    helpers.assert_allclose(expected=cpu_result, actual=ipu_result)


@pytest.mark.parametrize("input_shape", [(20, 10)])
@pytest.mark.parametrize("beta", [0, .5])
@pytest.mark.parametrize("alpha", [0, 1.5])
@pytest.mark.parametrize("use_out", [True, False])
def test_addmv(input_shape, beta, alpha, use_out):
    """Compare torch.addmv on the CPU and the IPU.

    NaNs are planted in the operands whose scaling factor is zero, since
    those operands should not contribute to the result.
    """
    torch.manual_seed(42)

    rows, cols = input_shape
    mat = torch.randn(input_shape)
    vec = torch.randn(cols)
    inp = torch.randn(rows)

    nan = float('nan')
    if beta == 0:
        # NaNs in input should be ignored
        inp[0] = nan
    if alpha == 0:
        # NaNs in vec or mat should be ignored
        mat[0, 0] = nan
        vec[0] = nan

    output = torch.empty(rows) if use_out else None

    class AddmvModel(torch.nn.Module):
        def __init__(self, beta, alpha):
            super().__init__()
            self.beta = beta
            self.alpha = alpha

        def forward(self, inp, mat, vec, out=None):
            result = torch.addmv(inp,
                                 mat,
                                 vec,
                                 beta=self.beta,
                                 alpha=self.alpha,
                                 out=out)
            both_zero = self.beta == 0 and self.alpha == 0
            if both_zero:
                # Avoid empty compute graph
                result += torch.zeros_like(inp)
            return result

    model = AddmvModel(beta, alpha)
    cpu_result = model(inp, mat, vec, out=output)
    ipu_result = poptorch.inferenceModel(model)(inp, mat, vec, output)

    helpers.assert_allclose(expected=cpu_result, actual=ipu_result)
    if use_out:
        helpers.assert_allclose(expected=cpu_result, actual=output)


================================================
FILE: tests/bool_support_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import torch
import pytest
import helpers
import poptorch

# Reductions exercised on bool tensors.
# Not needed for mean or logsumexp
reduce_ops = [torch.sum, torch.prod]
# Float tensors which the tests below compare element-wise with `==` to
# produce the bool tensors being reduced.
test_tensors = [
    torch.tensor([1.0, 2.0, 3.1]),
    torch.tensor([1.1, 2.0, 3.0]),
    torch.tensor([0.0, 0.0, 0.0])
]


@pytest.mark.parametrize("op", reduce_ops)
@pytest.mark.parametrize("t_1", test_tensors)
@pytest.mark.parametrize("t_2", test_tensors)
def test_reduce_two_bool_types(op, t_1, t_2):
    """Reduce the bool result of an element-wise comparison.

    The native result is expected to be int64 while the PopTorch result is
    expected to be int32.
    """

    class Model(torch.nn.Module):
        def forward(self, x, y):
            are_equal = x == y
            return op(are_equal)

    model = Model()
    poptorch_model = poptorch.inferenceModel(model)

    operands = (t_1, t_2)
    native_out = model(*operands)
    poptorch_out = poptorch_model(*operands)

    # Both results are scalars (no dims).
    helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    assert native_out.dtype == torch.int64
    assert poptorch_out.dtype == torch.int32


def test_logits():
    """Compute an accuracy-style ratio from a bool comparison on the IPU."""

    class Model(torch.nn.Module):
        def forward(self, logits, y):
            predictions = torch.argmax(logits, -1)
            num_matches = torch.sum(predictions == y)
            return num_matches / float(y.size(0))

    model = Model()

    logits = torch.tensor([[1.0, 2.0, 3.0], [3.0, 1.0, 2.0], [2.0, 3.0, 1.0]])
    y = torch.tensor([[0], [2], [1]])

    poptorch_model = poptorch.inferenceModel(model)
    native_out = model(logits, y)
    poptorch_out = poptorch_model(logits, y)

    helpers.assert_allclose(actual=poptorch_out, expected=native_out)


================================================
FILE: tests/buffers_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import torch
import pytest
import helpers
import poptorch


class ConstantBuffer(torch.nn.Module):
    """Module with an int32 buffer that forward() only reads."""

    def __init__(self):
        super().__init__()
        values = torch.tensor([1, 2, 3], dtype=torch.int32)
        self.register_buffer('stuff', values)

    def forward(self, x):
        # Adding a Python float promotes the integer buffer to floating point.
        shifted = 1.0 + self.stuff
        total = x + shifted
        return torch.sum(total)


def test_constant_buffer():
    """A model that only reads a buffer gives the expected sum on the IPU."""
    poptorch_model = poptorch.inferenceModel(ConstantBuffer())

    result = poptorch_model(torch.tensor([2]))
    assert result == 15


def test_constant_buffer_repeat():
    """Running the constant-buffer model twice gives the same result."""
    poptorch_model = poptorch.inferenceModel(ConstantBuffer())

    for _ in range(2):
        assert poptorch_model(torch.tensor([2])) == 15


class UpdatableBuffer(torch.nn.Module):
    """Module with two single-element buffers that are added to the input."""

    def __init__(self):
        super().__init__()
        for buffer_name in ("buffer_1", "buffer_2"):
            self.register_buffer(buffer_name, torch.ones(1))

    def forward(self, x):
        shifted = x + self.buffer_1 + self.buffer_2
        return torch.sum(shifted)


def test_copy_named_buffer_to_device_single_buffer():
    """Only the buffer listed in updatableNamedBuffers is refreshed.

    Updating buffer_1 changes the result after copyNamedBuffersToDevice(),
    while updating buffer_2 (not listed) leaves the result unchanged.
    """
    options = poptorch.Options()
    options.updatableNamedBuffers(['buffer_1'])
    poptorch_model = poptorch.inferenceModel(UpdatableBuffer(),
                                             options=options)
    x = torch.ones(3).float()

    def increment_and_upload(buffer):
        buffer.copy_(buffer + 1)
        poptorch_model.copyNamedBuffersToDevice()

    assert poptorch_model(x) == 9

    increment_and_upload(poptorch_model.buffer_1)
    assert poptorch_model(x) == 12

    increment_and_upload(poptorch_model.buffer_2)
    assert poptorch_model(x) == 12


def test_copy_named_buffer_to_device_two_buffers():
    """Both buffers listed in updatableNamedBuffers are refreshed.

    Each host-side update followed by copyNamedBuffersToDevice() changes
    the result of the next run.
    """
    options = poptorch.Options()
    options.updatableNamedBuffers(['buffer_1', 'buffer_2'])
    poptorch_model = poptorch.inferenceModel(UpdatableBuffer(),
                                             options=options)
    x = torch.ones(3).float()

    def increment_and_upload(buffer):
        buffer.copy_(buffer + 1)
        poptorch_model.copyNamedBuffersToDevice()

    assert poptorch_model(x) == 9

    increment_and_upload(poptorch_model.buffer_1)
    assert poptorch_model(x) == 12

    increment_and_upload(poptorch_model.buffer_2)
    assert poptorch_model(x) == 15


def test_copy_named_buffer_to_device_no_opt():
    """copyNamedBuffersToDevice() raises when no buffer was made updatable.

    The result of the model is expected to be unchanged afterwards.
    """
    options = poptorch.Options()
    poptorch_model = poptorch.inferenceModel(UpdatableBuffer(),
                                             options=options)
    x = torch.ones(3).float()

    assert poptorch_model(x) == 9

    updated = poptorch_model.buffer_1 + 1
    poptorch_model.buffer_1.copy_(updated)
    with pytest.raises(poptorch.poptorch_core.Error):
        poptorch_model.copyNamedBuffersToDevice()

    assert poptorch_model(x) == 9


def test_copy_named_buffer_to_device_invalid_opt():
    """Naming a non-existent buffer as updatable makes the first run fail."""
    options = poptorch.Options()
    options.updatableNamedBuffers(['non_existing_buffer'])

    poptorch_model = poptorch.inferenceModel(UpdatableBuffer(),
                                             options=options)
    model_input = torch.ones(3).float()

    with pytest.raises(poptorch.poptorch_core.Error):
        poptorch_model(model_input)


def test_training_then_inference():
    """Compile a training model, then an inference model, of one module.

    The module contains a BatchNorm1d layer. The test only checks that both
    compilations complete.
    """
    momentum = 0.1

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bn = torch.nn.BatchNorm1d(10, momentum=momentum)
            self.loss = torch.nn.MSELoss()

        def forward(self, x, target):
            normalised = self.bn(x)
            return normalised, self.loss(normalised, target)

    model = Model()

    shape = [4, 10]
    batch = torch.ones(shape, dtype=torch.float32)
    target = torch.ones(shape, dtype=torch.float32) + 1

    # Training first ...
    training_model = poptorch.trainingModel(model)
    training_model.compile(batch, target)

    # ... then inference on the same underlying module.
    inference_model = poptorch.inferenceModel(model)
    inference_model.compile(batch, target)


def test_buffer_implicit_copy():
    """Reading a buffer on the host reflects the value after a training step.

    The BatchNorm running mean is checked right after one step, and again
    after an explicit copyWeightsToHost().
    """
    momentum = 0.1

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bn = torch.nn.BatchNorm1d(10, momentum=momentum)
            self.loss = torch.nn.MSELoss()

        def forward(self, x, target):
            normalised = self.bn(x)
            return normalised, self.loss(normalised, target)

    model = Model()

    shape = [4, 10]
    batch = torch.ones(shape, dtype=torch.float32)
    target = torch.ones(shape, dtype=torch.float32) + 1

    poptorch_model = poptorch.trainingModel(model)
    poptorch_model(batch, target)

    def check_running_mean():
        helpers.assert_allclose(actual=model.bn.running_mean,
                                expected=batch[0, :] * momentum)

    check_running_mean()

    poptorch_model.copyWeightsToHost()
    check_running_mean()


def test_error_on_remove_buffer():
    """Deleting a registered buffer inside forward() raises poptorch.Error."""

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer('y', torch.tensor([2]))

        def forward(self, x):
            incremented = x + 1
            if 'y' in self._buffers:
                del self._buffers['y']
            return incremented

    poptorch_model = poptorch.inferenceModel(Model())

    error_msg = (r"Buffer y is removed from the model when calling the "
                 r"forward method\.")
    model_input = torch.tensor([5.0])
    with pytest.raises(poptorch.Error, match=error_msg):
        poptorch_model(model_input)


def test_error_on_redefine_buffer():
    """Rebinding a registered buffer inside forward() raises poptorch.Error."""

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer('y', torch.tensor([2]))

        def forward(self, x):
            incremented = x + 1
            # Rebinding the attribute (rather than copying into it) is the
            # unsupported pattern under test.
            # pylint: disable=attribute-defined-outside-init
            self.y = incremented

    poptorch_model = poptorch.inferenceModel(Model())

    error_msg = (r"Buffer y is reassigned within the model when calling the "
                 r"forward method\. This is not supported\. Consider using "
                 r"self\.y\.copy_\(src\) to copy data "
                 r"from a source tensor, where src is the name of the "
                 r"source tensor\.")
    model_input = torch.tensor([5.0])

    with pytest.raises(poptorch.Error, match=error_msg):
        poptorch_model(model_input)


class BufferUpdatingModel(torch.nn.Module):
    """1x1 convolution whose forward pass also accumulates into a buffer.

    On every call the first element of the convolution bias is added in
    place to ``test_buff``.
    """

    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(2, 2, 1, padding=0)

        initial_buffer = torch.zeros([2], dtype=torch.float32)
        self.register_buffer("test_buff", initial_buffer)

        self.loss = torch.nn.L1Loss()

    def forward(self, inp, target):
        conv_out = self.conv(inp)

        # The buffer update must not take part in the gradient computation.
        with torch.no_grad():
            # pylint: disable=attribute-defined-outside-init
            self.test_buff += self.conv.bias[0]

        loss = self.loss(conv_out, target)
        return conv_out, loss


@pytest.mark.parametrize("device_iterations", [1, 3, 5])
@pytest.mark.parametrize("gradient_accumulation", [1, 3, 5])
def test_buffer_update_with_param(device_iterations, gradient_accumulation):
    """Compare a buffer updated from a parameter between CPU and IPU training.

    The model is first trained natively, emulating gradient accumulation by
    scaling each loss. Its parameters and buffer are then reset and the
    same amount of training is run through PopTorch; the final bias and
    buffer must match the native ones.
    """
    model = BufferUpdatingModel()

    def set_conv_params_to_ones():
        model.conv.weight.data = torch.ones_like(model.conv.weight.data)
        model.conv.bias.data = torch.ones_like(model.conv.bias.data)

    set_conv_params_to_ones()
    native_optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    times_to_run = 10
    native_input = torch.ones([2, 2, 2, 2])
    native_target = torch.zeros_like(native_input)

    for _ in range(times_to_run * device_iterations):
        native_optimizer.zero_grad()
        for _ in range(gradient_accumulation):
            _, loss = model(native_input, native_target)

            # Match mean gradient_accumulation
            (loss / gradient_accumulation).backward()

        native_optimizer.step()

    # Snapshot the natively trained state before resetting the model.
    model_bias = model.conv.bias.clone()
    model_test_buff = model.test_buff.clone()

    # pylint: disable=attribute-defined-outside-init
    model.test_buff = torch.zeros([2], dtype=torch.float32)
    set_conv_params_to_ones()

    # Check for proper cloning
    with pytest.raises(AssertionError):
        helpers.assert_allclose(expected=model_bias, actual=model.conv.bias)
    with pytest.raises(AssertionError):
        helpers.assert_allclose(expected=model_test_buff,
                                actual=model.test_buff)

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.Training.gradientAccumulation(gradient_accumulation)

    # One PopTorch call consumes a batch for every device iteration and
    # every gradient accumulation step.
    combined_batch = 2 * device_iterations * gradient_accumulation
    dummy_input = torch.ones([combined_batch, 2, 2, 2])
    dummy_target = torch.zeros_like(dummy_input)

    ipu_optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    poptorch_model = poptorch.trainingModel(model,
                                            optimizer=ipu_optimizer,
                                            options=opts)

    for _ in range(times_to_run):
        dummy_target = torch.zeros_like(dummy_input)
        poptorch_model(dummy_input, dummy_target)

    helpers.assert_allclose(expected=model_bias,
                            actual=poptorch_model.conv.bias)
    helpers.assert_allclose(expected=model_test_buff,
                            actual=poptorch_model.test_buff)


def test_failing_on_replicas():
    """A buffer-updating model with two replicas raises poptorch.Error."""
    model = BufferUpdatingModel()

    opts = poptorch.Options()
    opts.replicationFactor(2)

    sgd = torch.optim.SGD(model.parameters(), lr=0.1)
    poptorch_model = poptorch.trainingModel(model,
                                            optimizer=sgd,
                                            options=opts)

    dummy_input = torch.ones([4, 2, 2, 2])
    dummy_target = torch.zeros_like(dummy_input)

    error_msg = (r"PopTorch does not support broadcasting buffers. "
                 r"If your model is able to tolerate buffers becoming "
                 r"out of sync between replicas, you can disable "
                 r"buffer broadcasting using "
                 r"poptorch.Options.broadcastBuffers\(False\).")

    with pytest.raises(poptorch.Error, match=error_msg):
        poptorch_model(dummy_input, dummy_target)


def test_constant_buffer_with_replicas():
    """A model that only reads its buffer runs with two replicas."""
    opts = poptorch.Options()
    opts.replicationFactor(2)

    # This should not have an error as the buffer is constant
    poptorch_model = poptorch.inferenceModel(ConstantBuffer(), opts)

    replicated_input = torch.tensor([1, 2])
    poptorch_model(replicated_input)


def test_no_input_but_one_buffer():
    """A model with no inputs keeps incrementing its buffer across calls."""

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            initial = torch.tensor([1.], dtype=torch.float)
            self.register_buffer("x", initial)

        def forward(self):
            # pylint: disable=attribute-defined-outside-init,no-member
            self.x += 1.0
            return self.x

    poptorch_model = poptorch.inferenceModel(Model())

    for expected in (2., 3., 4., 5.):
        assert poptorch_model() == expected


def test_unsynchronised_replicated_buffers():
    """Buffers evolve independently per replica without broadcasting.

    With broadcastBuffers(False) and two replicas, the assertions expect
    each replica's buffer to have accumulated its own row of the input.
    """

    class ReplicaBufferModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer("buffer", torch.zeros(1, 2))

        def forward(self, x):
            accumulated = self.buffer + x
            self.buffer.copy_(accumulated)
            return poptorch.identity_loss(self.buffer, reduction='none')

    num_replica = 2
    num_steps = 50

    torch.manual_seed(43)
    opts = poptorch.Options()
    opts.replicationFactor(num_replica)
    opts.deviceIterations(1)
    opts.broadcastBuffers(False)

    model = ReplicaBufferModel()
    model.float()
    poptorch_model = poptorch.inferenceModel(model, opts)

    x = torch.tensor([[9], [2]])

    # Every run adds the input to the buffer in place.
    for _ in range(num_steps):
        y = poptorch_model(x)

    assert y[0][-1] == x[0] * num_steps
    assert y[1][-1] == x[1] * num_steps


================================================
FILE: tests/conftest.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import random
import enum
import os.path as osp
import gc
import pytest
import torch
import helpers
import numpy as np
import poptorch


@pytest.fixture(autouse=True)
def cleanup():
    """Run a garbage collection pass as part of every test's setup.

    Explicitly cleaning up makes sure we detach from the IPU and free the
    graph before the next test starts.
    """
    gc.collect()


# Documentation about markers: https://docs.pytest.org/en/6.2.x/example/markers.html

# Queried once at import time; used by pytest_runtest_setup to skip tests
# marked "ipuHardwareRequired" when this is false.
hw_available = poptorch.ipuHardwareIsAvailable()


def pytest_make_parametrize_id(val, argname):
    """Build a readable ``argname:value`` id for a parametrized value.

    Enum members use their name, classes use ``__name__`` and None, bools,
    numbers, strings and torch dtypes use their string form. Returning None
    for anything else lets pytest fall back to its default id.
    """
    plain_types = (bool, int, str, float, torch.dtype)

    # Enums are checked first so that the member name is always used.
    if isinstance(val, enum.Enum):
        label = val.name
    elif val is None or isinstance(val, plain_types):
        label = val
    elif isinstance(val, type):
        label = val.__name__
    else:
        # Use default
        return None

    return f"{argname}:{label}"


def pytest_configure(config):
    """Register the custom markers and forward run options to ``helpers``."""
    marker_descriptions = (
        ("ipuHardwareRequired: require IPU hardware to be "
         "available on the platform"),
        ("excludeFromReducedTesting: exclude from "
         "reduced testing runs"),
        ("extendedTestingOnly: to only include "
         "in extended testing runs because it takes a "
         "long time to run"),
    )
    for description in marker_descriptions:
        config.addinivalue_line("markers", description)

    # When only collecting, the tests themselves are not going to run.
    if config.getoption("collectonly"):
        helpers.is_running_tests = False
    helpers.running_reduced_testing = config.getoption("reduced_testing")


def pytest_runtest_setup(item):
    """Skip tests that cannot run in the current configuration.

    A test is skipped when it is parametrized with a truthy ``trace_model``
    value, or when it is marked ``ipuHardwareRequired`` and no IPU hardware
    is available.
    """
    # Only parametrized tests have a callspec.
    is_parametrized = hasattr(item, 'callspec')
    if is_parametrized and item.callspec.params.get("trace_model"):
        pytest.skip("Tracing is no longer supported: skipping.")

    needs_hardware = any(item.iter_markers("ipuHardwareRequired"))
    if needs_hardware and not hw_available:
        pytest.skip("Hardware IPU needed to test this feature.")


# Source: https://raphael.codes/blog/customizing-your-pytest-test-suite-part-2/
def pytest_collection_modifyitems(session, config, items):  # pylint: disable=unused-argument
    """Select which collected tests run, based on markers and options.

    - During reduced testing, tests marked "excludeFromReducedTesting" are
      always deselected.
    - Tests marked "extendedTestingOnly" are kept if --extended-tests is
      set (even if --hw-tests-only or --no-hw-tests is set) and deselected
      otherwise.
    - if --hw-tests-only is set: only the remaining tests with a
      "ipuHardwareRequired" marker are kept.
    - if --no-hw-tests is set: only the remaining tests without that
      marker are kept.
    """

    def has_marker(item, name):
        return any(item.iter_markers(name))

    include_extended = config.getoption("extended_tests")

    hw_required, hw_not_required = [], []
    force_include, force_exclude = [], []

    for item in items:
        if helpers.running_reduced_testing and has_marker(
                item, "excludeFromReducedTesting"):
            bucket = force_exclude
        elif has_marker(item, "extendedTestingOnly"):
            bucket = force_include if include_extended else force_exclude
        elif has_marker(item, "ipuHardwareRequired"):
            bucket = hw_required
        else:
            bucket = hw_not_required
        bucket.append(item)

    if config.getoption("hw_tests_only"):
        deselected = hw_not_required + force_exclude
        selected = hw_required + force_include
    elif config.getoption("no_hw_tests"):
        deselected = hw_required + force_exclude
        selected = hw_not_required + force_include
    else:
        deselected = force_exclude
        selected = hw_required + hw_not_required + force_include

    config.hook.pytest_deselected(items=deselected)
    # Update the list in place: pytest keeps a reference to it.
    items[:] = selected


def pytest_sessionfinish(session, exitstatus):
    """Report success when no tests were collected.

    Exit status 5 means no tests were collected. In our case this is not an
    error because some files might only contain HW tests for example and
    therefore won't have any test to run if --hw-tests-only is used.
    """
    no_tests_collected = 5
    if exitstatus != no_tests_collected:
        return
    session.exitstatus = 0


def pytest_addoption(parser):
    """Register the command line options understood by this test suite."""
    # Boolean switches, all of them off by default.
    switches = (
        ("--hw-tests-only", "Only run HW tests"),
        ("--no-hw-tests", "Exclude all tests requiring HW"),
        ("--extended-tests", ("Include all tests marked with "
                              "'extendedTestingOnly' (Takes precedence over"
                              " --no-hw-tests)")),
        ("--reduced-testing", ("Run some tests with a reduced "
                               "number of parameters")),
    )
    for flag, description in switches:
        parser.addoption(flag,
                         action="store_true",
                         default=False,
                         help=description)

    parser.addoption("--seed",
                     type=int,
                     default=0,
                     help=("Set the seed for running the tests."))

    # By default datasets are stored next to this file.
    this_dir = osp.dirname(osp.abspath(__file__))
    parser.addoption("--external-datasets-dir",
                     type=str,
                     default=osp.join(this_dir, ".datasets"),
                     help=("The directory where the external datasets will be "
                           "downloaded."))


@pytest.fixture(autouse=True, scope="function")
def random_seed(pytestconfig):
    """Set the random seed for all tests in this directory. autouse=True will
    use this fixture in every test. Seed can be overridden with --seed on the
    command line to alter the seed for testing purposes. By default uses 0 for
    all tests.

    The seed is applied to NumPy, Python's ``random`` module and PyTorch.
    """
    # Command line options are not attributes of the config object: they
    # have to be read through getoption(). The default only applies if the
    # option has not been registered.
    seed = pytestconfig.getoption("seed", default=0)
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)


================================================
FILE: tests/convs_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import unittest.mock
import re
import torch
import pytest
import helpers
import poptorch

# Convolutions.

# Every convolution-like module class, including the fold ops listed as
# unsupported below.
convolutions = [
    torch.nn.Conv1d,
    torch.nn.Conv2d,
    torch.nn.Conv3d,
    torch.nn.ConvTranspose1d,
    torch.nn.ConvTranspose2d,
    torch.nn.ConvTranspose3d,
    torch.nn.Unfold,
    torch.nn.Fold,
]

# Values of the `padding_mode` argument the convolution tests are
# parametrized over.
padding_modes = ['zeros', 'reflect', 'replicate', 'circular']

# Unsupported
folds = []  # torch.nn.Unfold, torch.nn.Fold,

# Supported.
# Grouped by dimensionality: each list is the "op" parameter of the
# matching test_convND test.
conv_1D = [torch.nn.Conv1d, torch.nn.ConvTranspose1d]
conv_2D = [torch.nn.Conv2d, torch.nn.ConvTranspose2d]
conv_3D = [torch.nn.Conv3d, torch.nn.ConvTranspose3d]


def execute_and_check_wrapper(op, input, training=True, rtol=0.01, atol=0.01):
    """Run ``op`` on the IPU and natively and compare the outputs.

    The op is wrapped in a ``helpers.ModelWithWeights`` with an L1 loss.

    - ``training=True``: the model is trained for 5 steps through PopTorch,
      its weights are reset, then it is trained for 5 steps natively with
      the same optimizer. The input is replaced by ones of the same shape.
      The outputs of the last step of each run are compared.
    - ``training=False``: a single inference run of each is compared.

    The test is skipped when ``op`` has a ``padding_mode`` other than
    'zeros'. ``rtol`` / ``atol`` are forwarded to ``helpers.assert_allclose``.
    """
    if hasattr(op, 'padding_mode') and op.padding_mode != 'zeros':
        pytest.skip("TODO(T25617): PopART does not support PadGradOp when"
                    " mode is not 'constant'")

    model = helpers.ModelWithWeights(op,
                                     input.shape,
                                     loss_fn=torch.nn.L1Loss(reduction='mean'),
                                     out_fn=lambda x: (x, torch.zeros_like(x)))

    if training:
        optimizer = poptorch.optim.SGD(model.parameters(), lr=0.01)
        poptorch_model = poptorch.trainingModel(model, optimizer=optimizer)

        # Find out if the op has a parameter called 'weight'; treated as
        # "no" if looking the parameters up raises AttributeError.
        try:
            has_own_weight = any(
                n == 'weight'
                for (n, p) in poptorch_model.op.named_parameters())
        except AttributeError:
            has_own_weight = False

        # Snapshot the op's weights before the IPU training modifies them.
        if has_own_weight:
            weights_before = poptorch_model.op.weight.detach().clone()

        input = torch.ones_like(input)
        for _ in range(5):
            poptorch_out, loss = poptorch_model((input, ))

        # Restore the initial weights so the native training below starts
        # from the same state as the IPU training did.
        if has_own_weight:
            model.op.weight.data = weights_before

        # NOTE(review): _weights_before is presumably the initial value of
        # lin.weight recorded by ModelWithWeights - confirm in helpers.
        # pylint: disable=protected-access
        model.lin.weight.data = model._weights_before
        for _ in range(5):
            optimizer.zero_grad()
            native_out, loss = model((input, ))
            loss.backward()
            optimizer.step()

        # Compare the outputs of the last training step of each run.
        helpers.assert_allclose(actual=poptorch_out,
                                expected=native_out,
                                rtol=rtol,
                                atol=atol)
    else:
        poptorch_model = poptorch.inferenceModel(model)
        # Run on CPU.
        native_out, _ = model((input, ))

        # Run on IPU.
        poptorch_out, _ = poptorch_model((input, ))
        helpers.assert_allclose(actual=poptorch_out,
                                expected=native_out,
                                rtol=rtol,
                                atol=atol)


@pytest.mark.parametrize("op", conv_1D)
@pytest.mark.parametrize("padding_mode", padding_modes)
@pytest.mark.parametrize("training", [True, False])
def test_conv1D(op, padding_mode, training):
    """Check 1D (transposed) convolutions against the native implementation.

    A basic strided convolution is always checked. A second configuration
    with padding and dilation is checked for Conv1d only.
    """
    if op is torch.nn.ConvTranspose1d and padding_mode != 'zeros':
        # This combination doesn't exist in upstream Torch:
        # ValueError: Only "zeros" padding mode is supported for ConvTranspose1d
        # Report it as skipped (like test_conv2D does) rather than as a pass,
        # since nothing has been tested.
        pytest.skip('skipping unsupported padding_mode')

    torch.manual_seed(42)
    C_IN = 4
    C_OUT = 8

    input = torch.randn(1, C_IN, 10)
    # With equal stride
    model = op(C_IN, C_OUT, 3, stride=2, padding_mode=padding_mode)
    execute_and_check_wrapper(model, input, training)

    if op is torch.nn.ConvTranspose1d:
        # 'popart_exception': Non default value for dilations is not supported.
        return

    # With stride, padding and dilation
    model = op(C_IN,
               C_OUT,
               3,
               stride=2,
               padding=4,
               dilation=3,
               padding_mode=padding_mode)
    execute_and_check_wrapper(model, input, training)


@pytest.mark.parametrize("op", conv_2D)
@pytest.mark.parametrize("padding_mode", padding_modes)
@pytest.mark.parametrize("training", [True, False])
def test_conv2D(op, padding_mode, training):
    """Check 2D (transposed) convolutions via execute_and_check_wrapper.

    Covers a square kernel, a grouped convolution and, for every op except
    ConvTranspose2d, rectangular stride/padding with and without dilation.
    """
    # ConvTranspose2d is only exercised with 'zeros' padding, and 'circular'
    # padding is skipped for every op.
    if (op is torch.nn.ConvTranspose2d and padding_mode != 'zeros') or \
       padding_mode == 'circular': # TODO(T31811)
        pytest.skip('skipping unsupported padding_mode')
    torch.manual_seed(42)
    C_IN = 4
    C_OUT = 2
    input = torch.randn(1, C_IN, 8, 10)

    # With square kernels and equal stride
    model = op(C_IN, C_OUT, 3, stride=2, padding_mode=padding_mode)
    execute_and_check_wrapper(model, input, training, rtol=0.1, atol=0.1)

    # Grouped convolutions.

    model = op(C_IN,
               C_OUT, (3, 5),
               stride=2,
               groups=2,
               padding_mode=padding_mode)
    execute_and_check_wrapper(model, input, training, rtol=0.1, atol=0.1)

    # Rectangular padding/stride
    if op is not torch.nn.ConvTranspose2d:
        # non-square kernels and unequal stride and with padding
        # Note: this case passes training=False regardless of the `training`
        # parameter, and uses the op's default padding mode and the
        # wrapper's default tolerances.
        model = op(C_IN, C_OUT, (3, 5), stride=(2, 1), padding=(4, 2))
        execute_and_check_wrapper(model, input, training=False)

        # non-square kernels and unequal stride and with padding and dilation
        model = op(C_IN,
                   C_OUT, (3, 5),
                   stride=(2, 1),
                   padding=(4, 2),
                   dilation=(3),
                   padding_mode=padding_mode)
        execute_and_check_wrapper(model, input, training, rtol=0.01, atol=0.05)


@pytest.mark.parametrize("op", conv_3D)
@pytest.mark.parametrize("padding_mode", padding_modes)
@pytest.mark.parametrize("training", [True, False])
def test_conv3D(op, padding_mode, training):
    """Check 3D (transposed) convolutions via execute_and_check_wrapper.

    Covers a cubic kernel and a grouped convolution for every op, output
    padding for ConvTranspose3d, and padding / dilation for the other ops.
    """
    # Skipped combinations: ConvTranspose3d with anything but 'zeros',
    # Conv3d with 'reflect', and 'circular' for every op.
    if (op is torch.nn.ConvTranspose3d and padding_mode != 'zeros') or \
       (op is torch.nn.Conv3d and padding_mode == 'reflect') or \
       padding_mode == 'circular': # TODO(T31811)
        pytest.skip('skipping unsupported padding_mode')

    torch.manual_seed(42)
    C_IN = 4
    C_OUT = 2
    input = torch.randn(1, C_IN, 3, 5, 8)

    # With square kernels and equal stride
    model = op(C_IN, C_OUT, 3, stride=2, padding_mode=padding_mode)
    execute_and_check_wrapper(model, input, training, rtol=0.1, atol=0.1)

    # Grouped convolutions.
    model = op(C_IN, C_OUT, 3, stride=2, groups=2, padding_mode=padding_mode)
    execute_and_check_wrapper(model, input, training, rtol=0.1, atol=0.1)

    if op is torch.nn.ConvTranspose3d:
        #  test output padding
        model = op(C_IN,
                   C_OUT, (3, 2, 2),
                   stride=(2, 1, 1),
                   groups=2,
                   output_padding=[1, 0, 0],
                   padding_mode=padding_mode)
        execute_and_check_wrapper(model, input, training, rtol=0.05, atol=0.05)
    else:
        # non-square kernels and unequal stride and with padding
        model = op(C_IN,
                   C_OUT, (3, 2, 2),
                   stride=(2, 1, 1),
                   padding=(4, 2, 0),
                   padding_mode=padding_mode)

        execute_and_check_wrapper(model, input, training, rtol=0.1, atol=0.1)

        # non-square kernels and unequal stride and with padding and dilation
        # Note: padding_mode is not passed here, so the op's default is used.
        model = op(C_IN,
                   C_OUT, (3, 4, 2),
                   stride=(2, 1, 1),
                   padding=(4, 2, 0),
                   dilation=(3, 1, 1))

        execute_and_check_wrapper(model, input, training, rtol=0.1, atol=0.1)


def merge_dicts(x, y):
    """Return a copy of `x` updated with the entries of `y`.

    Neither argument is modified; where both define a key, the value from
    `y` wins.
    """
    merged = x.copy()
    merged.update(y)
    return merged


# The test is reliant on an IPU model with limited memory, so force the small model
@pytest.mark.skip(reason="TODO(AFS-343)")
@helpers.printCapfdOnExit
@unittest.mock.patch.dict("os.environ",
                          merge_dicts(helpers.forceSmallModel(),
                                      {"POPLIBS_LOG_LEVEL": "DEBUG"}))
def test_available_memory(capfd):
    """Check poptorch.set_available_memory changes the logged proportion.

    The same convolution is compiled twice, without and then with a forward
    hook that calls poptorch.set_available_memory(conv, 0.5). The
    availableMemoryProportion value is parsed from the captured log output
    each time.
    """

    def get_mem_prop_for_conv(op):
        """Compile `op` for inference and return the proportion found in
        the captured stderr."""
        torch.manual_seed(42)
        input = torch.randn(1, 4, 10, 10)

        model = helpers.ModelWithWeights(
            op,
            input.shape,
            loss_fn=torch.nn.L1Loss(reduction='mean'),
            out_fn=lambda x: (x, torch.zeros_like(x)))

        poptorch.inferenceModel(model).compile((input, ))

        # Only stderr is inspected; reading also drains the capture so the
        # next call only sees its own compilation log.
        _, log = capfd.readouterr()

        m = re.search(r"availableMemoryProportion\ +([\d.]+)", log)

        assert m

        return float(m.group(1))

    model = torch.nn.Conv2d(4, 16, 10, stride=1)
    default_prop_for_conv = get_mem_prop_for_conv(model)

    # The hook receives (module, inputs, output); only the output is used.
    model.register_forward_hook(lambda _1, _2, conv: poptorch.
                                set_available_memory(conv, 0.5))

    adjusted_prop_for_conv = get_mem_prop_for_conv(model)

    # The default value for available_memory should be more than 0.5 meaning
    # the default memory available for the convolution should be more than
    # after we adjusted the available memory
    assert default_prop_for_conv > adjusted_prop_for_conv
    assert adjusted_prop_for_conv == 0.5


@pytest.mark.parametrize("mode", poptorch.MatMulSerializationMode)
def test_matmul_serialization(mode):
    """Run poptorch.serializedMatMul in every serialization mode and compare
    the IPU result against the native one."""
    torch.manual_seed(42)

    input_channels, reducing_dim, output_channels = 6, 2, 4
    lhs = torch.randn(input_channels, reducing_dim)
    rhs = torch.randn(reducing_dim, output_channels)

    # Serialization factor used for each mode.
    modes = poptorch.MatMulSerializationMode
    factor_by_mode = {
        modes.Disabled: 0,
        modes.InputChannels: 2,
        modes.ReducingDim: 2,
        modes.OutputChannels: 4,
    }
    assert mode in factor_by_mode, "Invalid mode"
    factor = factor_by_mode[mode]

    class BasicNetwork(torch.nn.Module):
        def forward(self, x, y):
            return poptorch.serializedMatMul(x,
                                             y,
                                             mode,
                                             factor,
                                             keep_precision=True)

    # Just check we don't explode when the value is set.
    native_model = BasicNetwork()
    expected = native_model(lhs, rhs)
    ipu_model = poptorch.inferenceModel(native_model)
    actual = ipu_model(lhs, rhs)

    helpers.assert_allclose(actual=actual, expected=expected)


def test_available_memory_automatic():
    """Run a small conv + linear network for inference with per-IPU
    available memory proportions set, and compare against the native
    output."""
    torch.manual_seed(42)

    # Just check we don't explode when the value is set.
    class Network(torch.nn.Module):
        def __init__(self):
            super().__init__()

            self.layer1 = torch.nn.Sequential(torch.nn.Conv2d(1, 10, 5),
                                              torch.nn.MaxPool2d(2),
                                              torch.nn.ReLU())
            self.layer2 = torch.nn.Sequential(torch.nn.Conv2d(10, 20, 5),
                                              torch.nn.MaxPool2d(2),
                                              torch.nn.ReLU())
            self.layer3 = torch.nn.Linear(320, 256)
            self.layer3_act = torch.nn.ReLU()
            self.layer4 = torch.nn.Linear(256, 10)

            self.softmax = torch.nn.LogSoftmax(1)

        def forward(self, x):
            x = self.layer1(x)
            x = self.layer2(x)
            # Flatten to 320 features per row, the input size of layer3.
            x = x.view(-1, 320)

            x = self.layer3_act(self.layer3(x))
            x = self.layer4(x)
            x = self.softmax(x)
            return x

    model = Network()
    # Run on CPU.
    input = torch.randn(2, 1, 28, 28)
    native_out = model(input)

    # Run on IPU.
    opts = poptorch.Options()
    # Proportions are given per IPU, keyed by "IPU<n>".
    opts.setAvailableMemoryProportion(available_memory_proportion={
        "IPU0": 0.7,
        "IPU1": 0.2
    })

    poptorch_model = poptorch.inferenceModel(model, opts)
    poptorch_out = poptorch_model(input)

    helpers.assert_allclose(actual=poptorch_out, expected=native_out)


@pytest.mark.parametrize("dim", range(-3, 3))
@pytest.mark.parametrize("training", [True, False])
def test_cumsum(dim, training):
    """Check torch.cumsum along every dimension (negative and positive
    indices) of a 3D input via execute_and_check_wrapper."""
    torch.manual_seed(42)
    op = lambda x: torch.cumsum(x, dim=dim)
    input = torch.randn(1, 5, 6, dtype=torch.float32)

    execute_and_check_wrapper(op, input, training, rtol=0.02, atol=0.02)


@pytest.mark.parametrize("dim", range(-3, 3))
@pytest.mark.parametrize("training", [True, False])
def test_cumprod(dim, training):
    """Check torch.cumprod along every dimension (negative and positive
    indices) of a 3D input via execute_and_check_wrapper."""
    torch.manual_seed(42)
    input = torch.randn(1, 5, 6, dtype=torch.float32)
    op = lambda x: torch.cumprod(x, dim=dim)
    execute_and_check_wrapper(op, input, training, rtol=0.02, atol=0.02)


@pytest.mark.parametrize("src_dtype", [torch.float, torch.int])
@pytest.mark.parametrize("dest_dtype", [torch.float, torch.int])
@pytest.mark.parametrize("dim", range(-1, 1))
def test_cumsum_changing_types(src_dtype, dest_dtype, dim):
    """Check the method form of cumsum with an explicit output dtype, for
    every pairing of float and int source / destination types."""
    class Model(torch.nn.Module):
        def forward(self, inp):
            return inp.cumsum(dim=dim, dtype=dest_dtype)

    native_model = Model()
    poptorch_model = poptorch.inferenceModel(native_model)

    torch.manual_seed(42)
    source = torch.randn(1, 5, 6).to(src_dtype)

    # IPU first, then CPU: the same order in which the results are produced
    # when both calls are written inline as keyword arguments.
    ipu_result = poptorch_model(source)
    cpu_result = native_model(source)
    helpers.assert_allclose(actual=ipu_result, expected=cpu_result)


# The free-function, `out=` form of `cumsum` works a bit differently to the
# method form.
@pytest.mark.parametrize("src_dtype", [torch.float, torch.int])
@pytest.mark.parametrize("dest_dtype", [torch.float, torch.int])
@pytest.mark.parametrize("dim", range(-1, 1))
def test_cumsum_changing_types_out(src_dtype, dest_dtype, dim):
    """Check torch.cumsum(..., out=res) where `res` has the destination
    dtype, for every pairing of float and int source / destination types."""
    class Model(torch.nn.Module):
        def forward(self, inp):
            # The output dtype is determined by the pre-allocated `out`
            # tensor rather than by a `dtype=` argument.
            res = torch.empty(inp.shape).to(dest_dtype)
            return torch.cumsum(inp, dim=dim, out=res)

    cpu_model = Model()
    ipu_model = poptorch.inferenceModel(cpu_model)

    torch.manual_seed(42)
    inp = torch.randn(1, 5, 6).to(src_dtype)

    helpers.assert_allclose(actual=ipu_model(inp), expected=cpu_model(inp))


# Test that the result of `cumsum` can be passed forward without loss of tensor
# shape metadata.
@pytest.mark.parametrize("src_dtype", [torch.float, torch.int])
@pytest.mark.parametrize("dest_dtype", [torch.float, torch.int])
@pytest.mark.parametrize("dim", range(-1, 1))
def test_cumsum_can_pass_on(src_dtype, dest_dtype, dim):
    """Check that the result of cumsum can be indexed inside the model.

    Only successful execution is checked; the output values are not
    compared against a reference.
    """
    class Model(torch.nn.Module):
        def forward(self, inp):
            return inp.cumsum(dim=dim, dtype=dest_dtype)[:, -1]

    ipu_model = poptorch.inferenceModel(Model())

    torch.manual_seed(42)
    inp = torch.randn(1, 5, 6).to(src_dtype)

    # Just test it doesn't fail
    try:
        ipu_model(inp)
    except poptorch.poptorch_core.Error as err:
        # pytest.fail (unlike `assert False`) is not stripped under -O, and
        # the underlying error is included so the failure is diagnosable.
        pytest.fail(
            f"Passing the result of torch.cumsum onwards failed: {err}")


================================================
FILE: tests/cpp/CMakeLists.txt
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

# Use the shared, multithreaded Boost libraries (no static runtime) and require
# the unit_test_framework component, version 1.70 or newer.
set(Boost_USE_STATIC_LIBS OFF)
set(Boost_USE_MULTITHREADED ON)
set(Boost_USE_STATIC_RUNTIME OFF)
find_package(Boost 1.70 REQUIRED COMPONENTS unit_test_framework)

# Ensure ABI matches that of PyTorch
add_definitions(${TORCH_CXX_FLAGS})

# add_unit_test(<name> <source>...)
#
# Builds an executable called <name> from the given sources, links it against
# Boost.Test, torch and the poptorch libraries, adds the poptorch include
# directory, and registers the executable as a CTest test of the same name.
function(add_unit_test name)
  add_executable(${name} ${ARGN})

  target_link_libraries(${name} Boost::unit_test_framework torch poptorch poptorch_logging pthread)

  target_include_directories(${name} PRIVATE
    ${CMAKE_SOURCE_DIR}/poptorch/source/include/)

  # The test name and the command to run are both the target name.
  add_test(${name} ${name})

endfunction()

add_unit_test(GNNOptimizationsTest GNNOptimizationsTest.cpp)


================================================
FILE: tests/cpp/GNNOptimizationsTest.cpp
================================================
// Copyright (c) 2023 Graphcore Ltd. All rights reserved.

#define BOOST_TEST_MODULE GNNOptimizationsTest
#include <boost/test/included/unit_test.hpp>

#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/ir/irparser.h>

#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/TypeAndConstantCanonicalization.hpp"

// Checks that ":<op>" occurs in graph_str exactly as many times as the value
// of the variable named <op> in the calling scope: the second argument is
// both stringified to build the search phrase and evaluated as the expected
// count. The leading ':' means the op name is only matched directly after a
// colon, as in "popart::<op>".
#define CHECK_OPS_IN_GRAPH(graph_str, op)                                      \
  BOOST_CHECK_EQUAL(occurrences(graph_str, std::string(":").append(#op)), op);

// Counts the non-overlapping occurrences of `phrase` in `graph`.
//
// The search resumes just past each match, so overlapping matches are not
// counted twice ("aa" occurs twice in "aaaa"). An empty `phrase` yields 0:
// std::string::find would otherwise match at the current position without
// ever advancing, and the loop would never terminate.
int occurrences(const std::string &graph, const std::string &phrase) {
  if (phrase.empty()) {
    return 0;
  }
  int count = 0;
  for (std::string::size_type position = graph.find(phrase);
       position != std::string::npos;
       position = graph.find(phrase, position + phrase.length())) {
    ++count;
  }
  return count;
}

// Prints every node of `graph`, in iteration order, into a single string.
// NOTE(review): the trailing arguments to print() select its formatting
// options; their exact meaning is defined by torch::jit::Node::print and
// should be confirmed against the PyTorch headers.
std::string parseGraphToStr(torch::jit::Graph *graph) {
  std::stringstream output_ir_stream;
  for (auto *node : graph->nodes()) {
    node->print(output_ir_stream, 0, nullptr, true, false, false, false);
  }
  return output_ir_stream.str();
}

// Checks that the node producing the graph's first output prints with exactly
// one occurrence of "squeeze".
void checkIsReturnUpdated(torch::jit::Graph *graph) {
  torch::jit::Node *output = graph->outputs()[0]->node();
  std::stringstream output_ir_stream;
  output->print(output_ir_stream, 0, nullptr, true, false, false, false);
  // Return from scatterreduce should be replaced by squeeze from grouped
  // version.
  BOOST_CHECK_EQUAL(occurrences(output_ir_stream.str(), "squeeze"), 1);
}

// Runs groupScatterReduceAndGatherNodes on a graph with three scatterreduce
// and three gather nodes, where the third node of each kind takes the outputs
// of the first two scatterreduce nodes (%5, %6) as its inputs. The expected
// counts below leave one scatterreduce and one gather ungrouped.
BOOST_AUTO_TEST_CASE(GroupScatterReduceAndGatherNodes0) {
  auto graph = std::make_shared<torch::jit::Graph>();
  const std::string input =
      R"IR(
    graph():
        %1  : Float(requires_grad=0, device=cpu) = poptorch::tensor_constant()
        %2  : Int(requires_grad=0, device=cpu) = poptorch::tensor_constant()
        %3  : Float(requires_grad=0, device=cpu) = poptorch::tensor_constant()
        %4  : Int(requires_grad=0, device=cpu) = poptorch::tensor_constant()
        %5  : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::scatterreduce[axis_size=0, axis=0, reduction=0, enable_index_broadcast=1](%1, %2)
        %6  : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::scatterreduce[axis_size=0, axis=0, reduction=0, enable_index_broadcast=1](%3, %4)
        %7  : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::scatterreduce[axis_size=0, axis=0, reduction=0, enable_index_broadcast=1](%5, %6)
        %8  : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::gather[axis=0](%1, %2)
        %9  : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::gather[axis=0](%3, %4)
        %10 : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::gather[axis=0](%5, %6)
        return(%6)
  )IR";
  parseIR(input, graph.get());
  poptorch::groupScatterReduceAndGatherNodes(graph.get());
  // Expected number of occurrences of each op after the pass. The constant
  // names must match the op names exactly: CHECK_OPS_IN_GRAPH stringifies its
  // second argument to build the search phrase and also uses it as the
  // expected count.
  constexpr std::size_t tensor_constant = 4;
  constexpr std::size_t unsqueeze = 8;
  constexpr std::size_t concat = 4;
  constexpr std::size_t groupedscatterreduce = 1;
  constexpr std::size_t groupedgather = 1;
  constexpr std::size_t scatterreduce = 1;
  constexpr std::size_t gather = 1;
  constexpr std::size_t slice = 4;
  constexpr std::size_t squeeze = 4;

  std::string output_ir = parseGraphToStr(graph.get());

  CHECK_OPS_IN_GRAPH(output_ir, tensor_constant);
  CHECK_OPS_IN_GRAPH(output_ir, unsqueeze);
  CHECK_OPS_IN_GRAPH(output_ir, concat);
  CHECK_OPS_IN_GRAPH(output_ir, groupedscatterreduce);
  CHECK_OPS_IN_GRAPH(output_ir, scatterreduce);
  CHECK_OPS_IN_GRAPH(output_ir, groupedgather);
  CHECK_OPS_IN_GRAPH(output_ir, gather);
  CHECK_OPS_IN_GRAPH(output_ir, slice);
  CHECK_OPS_IN_GRAPH(output_ir, squeeze);
  checkIsReturnUpdated(graph.get());
}

// Runs groupScatterReduceAndGatherNodes on a graph with three scatterreduce
// and three gather nodes that all read constants, where the third node of
// each kind uses axis=1 while the other two use axis=0. The expected counts
// below leave one scatterreduce and one gather ungrouped.
BOOST_AUTO_TEST_CASE(GroupScatterReduceAndGatherNodes1) {
  auto graph = std::make_shared<torch::jit::Graph>();
  const std::string input =
      R"IR(
    graph():
        %1  : Float(requires_grad=0, device=cpu) = poptorch::tensor_constant()
        %2  : Int(requires_grad=0, device=cpu) = poptorch::tensor_constant()
        %3  : Float(requires_grad=0, device=cpu) = poptorch::tensor_constant()
        %4  : Int(requires_grad=0, device=cpu) = poptorch::tensor_constant()
        %5  : Float(requires_grad=0, device=cpu) = poptorch::tensor_constant()
        %6  : Int(requires_grad=0, device=cpu) = poptorch::tensor_constant()
        %7  : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::scatterreduce[axis_size=0, axis=0, reduction=0, enable_index_broadcast=1](%1, %2)
        %8  : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::scatterreduce[axis_size=0, axis=0, reduction=0, enable_index_broadcast=1](%3, %4)
        %9  : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::scatterreduce[axis_size=0, axis=1, reduction=0, enable_index_broadcast=1](%5, %6)
        %10 : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::gather[axis=0](%1, %2)
        %11 : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::gather[axis=0](%3, %4)
        %12 : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::gather[axis=1](%5, %6)
        return(%8)
  )IR";
  parseIR(input, graph.get());
  poptorch::groupScatterReduceAndGatherNodes(graph.get());
  // Expected number of occurrences of each op after the pass. The constant
  // names must match the op names exactly: CHECK_OPS_IN_GRAPH stringifies its
  // second argument to build the search phrase and also uses it as the
  // expected count.
  constexpr std::size_t tensor_constant = 6;
  constexpr std::size_t unsqueeze = 8;
  constexpr std::size_t concat = 4;
  constexpr std::size_t groupedscatterreduce = 1;
  constexpr std::size_t groupedgather = 1;
  constexpr std::size_t scatterreduce = 1;
  constexpr std::size_t gather = 1;
  constexpr std::size_t slice = 4;
  constexpr std::size_t squeeze = 4;

  std::string output_ir = parseGraphToStr(graph.get());

  CHECK_OPS_IN_GRAPH(output_ir, tensor_constant);
  CHECK_OPS_IN_GRAPH(output_ir, unsqueeze);
  CHECK_OPS_IN_GRAPH(output_ir, concat);
  CHECK_OPS_IN_GRAPH(output_ir, groupedscatterreduce);
  CHECK_OPS_IN_GRAPH(output_ir, scatterreduce);
  CHECK_OPS_IN_GRAPH(output_ir, groupedgather);
  CHECK_OPS_IN_GRAPH(output_ir, gather);
  CHECK_OPS_IN_GRAPH(output_ir, slice);
  CHECK_OPS_IN_GRAPH(output_ir, squeeze);
  checkIsReturnUpdated(graph.get());
}


================================================
FILE: tests/cpu_op_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import torch
import helpers
import poptorch


def test_simple_CPU():
    """Multiply two inputs inside a poptorch.CPU op, scale the result by 3
    in the model, and check the output for two values of the second input."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.cpu = poptorch.CPU(self.foo, "MyCPUOp")

        def foo(self, x, y):
            return x * y

        def forward(self, x, y):
            return self.cpu(x, y) * 3.0

    inference_model = poptorch.inferenceModel(Model())

    in1 = torch.randn([5, 2, 3, 5])

    # (value of the second input, expected overall scale of in1)
    for multiplier, expected_scale in ((2.0, 6.0), (4.0, 12.0)):
        out = inference_model(in1, torch.tensor([multiplier]))
        helpers.assert_allclose(actual=out,
                                expected=in1 * expected_scale,
                                equal_nan=True)


def test_simple_CPU_multiple_outputs():
    """Return a product and a sum from one poptorch.CPU op and check both
    outputs for two values of the second input."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.cpu = poptorch.CPU(self.foo, "MyCPUOp")

        def foo(self, x, y):
            return x * y, x + y

        def forward(self, x, y):
            product, total = self.cpu(x, y)
            return product * 3.0, total

    inference_model = poptorch.inferenceModel(Model())

    in1 = torch.randn([5, 2, 3, 5])

    # (value of the second input, expected overall scale of in1)
    for multiplier, expected_scale in ((2.0, 6.0), (4.0, 12.0)):
        in2 = torch.tensor([multiplier])
        out, out2 = inference_model(in1, in2)

        helpers.assert_allclose(actual=out,
                                expected=in1 * expected_scale,
                                equal_nan=True)
        helpers.assert_allclose(actual=out2,
                                expected=(in1 + in2),
                                equal_nan=True)


def test_CPU_reduce():
    """Check a poptorch.CPU op whose output (a mean over all elements) has a
    different shape from its input."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.cpu = poptorch.CPU(self.foo, "MyCPUOp")

        def foo(self, x):
            return torch.mean(x)

        def forward(self, x):
            w = self.cpu(x)
            return w

    model = Model()
    inference_model = poptorch.inferenceModel(model)

    in1 = torch.randn([5, 2, 3, 5])
    out = inference_model(in1)

    helpers.assert_allclose(actual=out,
                            expected=torch.mean(in1),
                            equal_nan=True)


def test_CPU_matmul():
    """Check a poptorch.CPU op that wraps a torch.nn.Linear layer."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()

            # NOTE(review): the layer is held in a plain list, presumably so
            # that it is not registered as a submodule of Model - confirm.
            self.matmul = [torch.nn.Linear(20, 30)]

            self.cpu = poptorch.CPU(self.matmul[0], "MatMulOnCPU")

        def forward(self, input):
            return self.cpu(input)

    model = Model()
    inference_model = poptorch.inferenceModel(model)

    input = torch.randn(128, 20)
    out = inference_model(input)

    # The reference is the same Linear layer called directly.
    helpers.assert_allclose(actual=out,
                            expected=model.matmul[0](input),
                            equal_nan=True)


def test_CPU_multiple_calls():
    """Chain three calls to the same poptorch.CPU op, each doubling its
    input, and check the result is the input scaled by 8."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.cpu = poptorch.CPU(self.foo, "MyCPUOp")

        def foo(self, x):
            # The op body must always be handed a CPU tensor.
            assert x.device.type == "cpu", x.device.type
            return x * 2.0

        def forward(self, x):
            out = x
            for _ in range(3):
                out = self.cpu(out)
            return out

    inference_model = poptorch.inferenceModel(Model())

    in1 = torch.randn([5, 2, 3, 5])
    doubled_three_times = inference_model(in1)

    helpers.assert_allclose(actual=doubled_three_times,
                            expected=in1 * 8.0,
                            equal_nan=True)


def test_CPU_multiple_calls_multiple_classes():
    """Interleave calls to two different poptorch.CPU ops three times and
    compare the IPU result with running the same model natively."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.cpu = poptorch.CPU(self.foo, "MyCPUOp")
            self.cpu2 = poptorch.CPU(self.bar, "MyCPUOp2")

        def foo(self, x):
            return x * 2.0

        def bar(self, x, y):
            return x + y

        def forward(self, x, y):
            out = x
            # Three rounds of "double, then add y".
            for _ in range(3):
                out = self.cpu2(self.cpu(out), y)
            return out

    model = Model()
    inference_model = poptorch.inferenceModel(model)

    in1 = torch.randn([5])
    in2 = torch.randn([5])

    ipu_out = inference_model(in1, in2)
    native_out = model(in1, in2)

    helpers.assert_allclose(actual=ipu_out,
                            expected=native_out,
                            equal_nan=True)


# Just test that the dispatcher is disabled in the CPU op, and re-enabled
# afterwards.
def test_poptorch_op_in_cpu_op():
    """Call poptorch.identity_loss both inside a poptorch.CPU op and directly
    in the model's forward; only successful execution is checked."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.cpu = poptorch.CPU(self.foo, "MyCPUOp")

        def foo(self, x):
            return poptorch.identity_loss(x, reduction='sum')

        def forward(self, x):
            # `foo` runs once through the CPU op and once directly.
            w = self.cpu(x)
            return w, self.foo(x)

    options = poptorch.Options()
    options.deviceIterations(2)

    dispatched_model = poptorch.inferenceModel(Model(), options)

    # Just check it doesn't crash
    dispatched_model(torch.tensor([1.0, 2.0]))


================================================
FILE: tests/ctc_decoder_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import torch
import poptorch


class SimpleModel(torch.nn.Module):
    """Thin module wrapping poptorch.ctc_beam_search_decoder."""
    def forward(self, log_probs, lengths):
        # Forwards both arguments to the decoder and returns its result.
        return poptorch.ctc_beam_search_decoder(log_probs, lengths)


def test_ctc_decoder():
    """Run the CTC beam search decoder on random inputs and check the shapes
    of its first three outputs."""
    input_size, batch_size, num_classes = 9, 3, 10

    torch.manual_seed(42)
    log_probs = torch.randn(input_size, batch_size, num_classes)
    lengths = torch.randint(5, input_size, (batch_size, ), dtype=torch.int)

    decoder = poptorch.inferenceModel(SimpleModel())
    result = decoder(log_probs, lengths)

    # note we have no reference implementation so only the most basic
    # test is possible - relying on popart/poplibs which are validated
    # against tensorflow's implementation
    expected_shapes = (
        (batch_size, 1),
        (batch_size, 1),
        (batch_size, 1, input_size),
    )
    for index, shape in enumerate(expected_shapes):
        assert result[index].shape == shape


================================================
FILE: tests/custom_loss_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import torch
import torch.optim as optim
import pytest
import helpers
import poptorch


#  Test the reductions work as expected
@pytest.mark.parametrize("reduction", ["none", "mean", "sum"])
def test_non_final_loss_reductions(reduction):
    """Check the value of an intermediate poptorch.identity_loss for each
    reduction against the same reduction computed with torch."""
    torch.manual_seed(42)

    base_model = torch.nn.Linear(10, 10)

    class CustomLoss(torch.nn.Module):
        # Mean squared error scaled.
        def forward(self, x, target):
            # The intermediate loss uses the reduction under test; the final
            # loss is always reduced with "mean".
            partial_loss = poptorch.identity_loss(x - target,
                                                  reduction=reduction)
            loss = partial_loss * partial_loss * 5
            return partial_loss, poptorch.identity_loss(loss, reduction="mean")

    loss_fn = CustomLoss()

    class ModelWithLoss(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.base_model = base_model

        def forward(self, data, target):
            # Calls the enclosing function's `base_model`, which is the same
            # object as self.base_model.
            out = base_model(data)
            loss = loss_fn(out, target)
            return out, loss

    model = ModelWithLoss()
    poptorch_model = poptorch.trainingModel(model)

    target = torch.randn(10)
    input = torch.randn(10)

    # Capture what the loss function will see before the loss changes.
    x, _ = model(input, target)
    _, (partial_loss, _) = poptorch_model(input, target)

    # Check we have actually reduced the loss
    if reduction != "none":
        assert torch.numel(partial_loss) == 1

    # Recompute the intermediate loss from the native model's output.
    if reduction == "mean":
        simulated_loss = torch.mean(x - target)
    elif reduction == "sum":
        simulated_loss = torch.sum(x - target)
    elif reduction == "none":
        simulated_loss = x - target

    helpers.assert_allclose(expected=simulated_loss.reshape_as(partial_loss),
                            actual=partial_loss,
                            rtol=1e-02,
                            atol=1e-02)


# Test custom loss by training to targets
def run_custom_loss_test(loss_fn,
                         base_model=None,
                         input=None,
                         target=None,
                         test_output_vs_target=True):
    """Train `base_model` on the IPU with `loss_fn` and check the loss drops.

    Args:
        loss_fn: module called as loss_fn(out, target) to produce the loss.
        base_model: model to train; defaults to torch.nn.Linear(10, 10).
        input: model input; defaults to torch.randn(1, 10).
        target: training target; defaults to torch.randint(0, 10, [1]).
        test_output_vs_target: when True, also check that the output is far
            from `target` before training and close to it afterwards, on
            both the IPU model and the native model.

    Returns:
        The poptorch training model, after training.
    """
    torch.manual_seed(42)

    if base_model is None:
        base_model = torch.nn.Linear(10, 10)
    if input is None:
        input = torch.randn(1, 10)
    if target is None:
        target = torch.randint(0, 10, [1])

    class ModelWithLoss(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.base_model = base_model
            self.loss_fn = loss_fn

        def forward(self, data, target):
            # Calls the enclosing function's `base_model`, which is the same
            # object as self.base_model.
            out = base_model(data)
            loss = self.loss_fn(out, target)
            return out, loss

    model = ModelWithLoss()
    poptorch_model = poptorch.trainingModel(model)

    # Pytorch native.
    native_out, loss = model(input, target)

    # Make sure the first run doesn't already pass the test.
    original, original_loss = poptorch_model(input, target)

    assert original_loss > 0.1

    if test_output_vs_target:
        assert not torch.allclose(native_out, target, rtol=1e-02, atol=1e-02)
        assert not torch.allclose(original, target, rtol=1e-02, atol=1e-02)

    # Each call to the training model runs another training step; only the
    # results of the last one are kept.
    for _ in range(0, 2500):
        out, loss = poptorch_model(input, target)

    # Check we have trained the "model"
    assert loss < 0.1

    if test_output_vs_target:
        helpers.assert_allclose(actual=out,
                                expected=target,
                                rtol=1e-02,
                                atol=1e-02)

        # Check that the pytorch native model is also returning the trained
        # value that was trained on IPU.
        out, _ = model(input, target)
        helpers.assert_allclose(actual=out,
                                expected=target,
                                rtol=1e-02,
                                atol=1e-02)

    return poptorch_model


def test_custom_loss():
    """Train with a scaled mean squared error built from two
    poptorch.identity_loss calls."""
    torch.manual_seed(42)

    class CustomLoss(torch.nn.Module):
        # Mean squared error scaled.
        def forward(self, x, target):
            loss = poptorch.identity_loss(x - target, reduction="none")
            loss = loss * loss * 5.0
            return poptorch.identity_loss(loss, reduction="mean")

    run_custom_loss_test(loss_fn=CustomLoss(),
                         input=torch.randn(10),
                         target=torch.randn(10))


def test_custom_loss_l1():
    """Train with a custom loss built on torch.nn.functional.l1_loss."""
    torch.manual_seed(42)

    class CustomLoss(torch.nn.Module):
        # Squared L1 loss, scaled by 5.
        def forward(self, x, target):
            loss = torch.nn.functional.l1_loss(x, target)
            loss = loss * loss * 5.0
            return poptorch.identity_loss(loss, reduction="mean")

    run_custom_loss_test(loss_fn=CustomLoss(),
                         input=torch.randn(10),
                         target=torch.randn(10))


def test_custom_loss_nll():
    """Train a Linear + LogSoftmax model with a scaled NLL loss and check
    the native model then predicts the target class."""
    torch.manual_seed(42)

    class CustomLoss(torch.nn.Module):
        # NLL loss, scaled by 5.
        def forward(self, x, target):
            loss = torch.nn.functional.nll_loss(x, target)
            loss = loss * 5.0
            return poptorch.identity_loss(loss, reduction="mean")

    base_model = torch.nn.Sequential(torch.nn.Linear(10, 10),
                                     torch.nn.LogSoftmax(dim=1))

    input = torch.randn(1, 10)
    target = torch.randint(0, 10, [1])

    # Note: this result is not used; `out` is reassigned after training.
    out = base_model(input)

    # The output holds log-probabilities while the target is a class index,
    # so the output-vs-target comparison is disabled.
    model = run_custom_loss_test(loss_fn=CustomLoss(),
                                 base_model=base_model,
                                 input=input,
                                 target=target,
                                 test_output_vs_target=False)
    model.copyWeightsToHost()

    # Check that the pytorch native model is also returning the trained
    # value that was trained on IPU.
    out = base_model(input)

    assert torch.argmax(out, dim=1) == target


def test_two_custom_losses():
    """Check that summing two losses without a final
    poptorch.identity_loss raises the "multiple independent losses" error."""
    torch.manual_seed(42)

    base_model = torch.nn.Sequential(torch.nn.Linear(10, 10),
                                     torch.nn.LogSoftmax(dim=1))

    class CustomLoss(torch.nn.Module):
        # Sum of two NLL losses, not wrapped in identity_loss.
        def forward(self, x, target):
            loss = torch.nn.functional.nll_loss(x, target)
            loss2 = torch.nn.functional.nll_loss(x, target) * 5.0
            return loss + loss2

    error_msg = ("Multiple independent losses found in graph. "
                 "Graph must have one final loss. "
                 "Wrap final graph loss in poptorch.identity_loss.")
    with pytest.raises(poptorch.Error, match=error_msg):
        run_custom_loss_test(loss_fn=CustomLoss(), base_model=base_model)


def test_two_custom_losses_with_id_wrapper():
    """Check that the sum of two losses trains once it is wrapped in a
    final poptorch.identity_loss."""
    torch.manual_seed(42)

    base_model = torch.nn.Sequential(torch.nn.Linear(10, 10),
                                     torch.nn.LogSoftmax(dim=1))

    class CustomLoss(torch.nn.Module):
        # Sum of two NLL losses, wrapped in identity_loss.
        def forward(self, x, target):
            loss = torch.nn.functional.nll_loss(x, target)
            loss2 = torch.nn.functional.nll_loss(x, target) * 5.0
            return poptorch.identity_loss(loss + loss2, reduction="mean")

    run_custom_loss_test(loss_fn=CustomLoss(),
                         base_model=base_model,
                         test_output_vs_target=False)


def test_no_loss():
    """Calling a training model whose outputs contain no loss must raise.

    ``forward`` only returns plain arithmetic on the network output and the
    target; the test expects ``poptorch.Error`` matching
    "Couldn't find a loss in graph" when the model is called.
    """
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.model = torch.nn.Sequential(torch.nn.Linear(10, 10),
                                             torch.nn.LogSoftmax(dim=1))

        # None of the returned tensors comes from a loss function.
        def forward(self, x, target):
            log_probs = self.model(x)
            scaled = log_probs * 12
            shifted_target = target + 1
            combined = scaled + shifted_target
            return log_probs, combined, scaled

    model = Model()
    sgd = optim.SGD(model.parameters(), lr=0.01)
    poptorch_model = poptorch.trainingModel(model, optimizer=sgd)

    # Keep the draw order (label first, then features) so the seeded random
    # stream is consumed in the same sequence.
    label = torch.randint(0, 10, [1])
    features = torch.randn(1, 10)

    with pytest.raises(poptorch.Error, match="Couldn't find a loss in graph"):
        _ = poptorch_model(features, label)


================================================
FILE: tests/custom_ops/CMakeLists.txt
================================================
# This compiles one shared object file per custom operator used by the tests
# (see the add_library() calls further down in this file).
# Run make in the custom_ops folder to build.
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(custom_cube_op)

set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Optional cache entries pointing at the Popart / Poplar installs. They are
# declared without a value; the find logic below handles the empty case.
set(POPART_DIR CACHE PATH "Path to a Popart install")
set(POPLAR_DIR CACHE PATH "Path to a Poplar install")

# Locate Poplar: use POPLAR_DIR when it is set, otherwise rely on CMake's
# search path. The variable is quoted so that an empty value still expands to
# an (empty) argument instead of disappearing from the if() condition.
if(NOT "${POPLAR_DIR}" STREQUAL "")
  list(APPEND CMAKE_PREFIX_PATH ${POPLAR_DIR})
  if(NOT poplar_FOUND)
    find_package(poplar REQUIRED)
  endif()
else()
  # Check the package is not already in CMake's path
  find_package(poplar)
  if(NOT poplar_FOUND)
    message(FATAL_ERROR "You must provide a path to a Poplar install using -DPOPLAR_DIR=/path/to/poplar/install")
  endif()
endif()

# Locate Popart: use POPART_DIR when it names an existing path, otherwise rely
# on CMake's search path. Quoted for the same reason as above.
if(NOT EXISTS "${POPART_DIR}")
  # Check the package is not already in CMake's path
  find_package(popart COMPONENTS popart-only)
  if(NOT popart_FOUND)
    message(FATAL_ERROR "You must provide a path to a Popart build using -DPOPART_DIR=/path/to/popart/build")
  endif()
else()
  list(APPEND CMAKE_PREFIX_PATH ${POPART_DIR})
  if(NOT popart_FOUND)
    find_package(popart REQUIRED COMPONENTS popart-only)
  endif()
endif()

# All C++ code in this project will be compiled as C++14
set(CMAKE_CXX_STANDARD 14)

set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)

# Each custom op lives in <name>.cpp and is built as the shared library <name>
# linked against popart-only.
foreach(custom_op IN ITEMS
        custom_cube_op
        custom_leaky_relu_op
        custom_add_scalar_op
        custom_add_scalar_vec_op
        custom_add_vec_scalar_mul_op
        custom_reduce_op
        custom_three_input_reduce_op
        custom_many_attribute_op)
  add_library(${custom_op} SHARED "${custom_op}.cpp")
  target_link_libraries(${custom_op} popart-only)
endforeach()


================================================
FILE: tests/custom_ops/custom_add_scalar_op.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.

// This tests the use of the int_64/float attributes

#include <popart/op.hpp>
#include <popart/op/identity.hpp>
#include <popart/operators.hpp>
#include <popart/opmanager.hpp>
#include <popart/popx/opx.hpp>
#include <popart/popx/opxmanager.hpp>

#include <popops/ElementWise.hpp>

// Use extern to avoid mangled names when importing to python
extern "C" {

namespace custom_operators {
// Identifier of the float "add scalar" test op: domain "test.poptorch",
// type "AddScalarFloat".
// NOTE(review): the trailing 1, 1, 1 are presumably the version and the
// number of inputs/outputs — confirm against popart::OperatorIdentifier.
const popart::OperatorIdentifier add_scalar_float = {
    "test.poptorch", "AddScalarFloat", 1, 1, 1}; // NOLINT
} // namespace custom_operators

class AddScalarFloatOp;
class AddScalarFloatOpx;

class AddScalarFloatOp : public popart::Op {
public:
  AddScalarFloatOp(const popart::OperatorIdentifier &_opid, float scalar,
                   const popart::Op::Settings &settings_)
      : popart::Op(_opid, settings_), _scalar(scalar) {}

  void setup() override { outInfo(0) = inInfo(0); }

  std::unique_ptr<popart::Op> clone() const final {
    return std::unique_ptr<popart::Op>(new AddScalarFloatOp(*this));
  }

  float getScalar() { return _scalar; }

  // An estimate of how valuable sub-graph matching will be
  float getSubgraphValue() const final { return getLowSubgraphValue(); }

private:
  float _scalar;
};

// Factory registration for AddScalarFloatOp: the lambda reads the "scalar"
// float attribute (0.0f is passed as the second argument, presumably the
// default) and builds the op from it.
popart::OpCreator<AddScalarFloatOp> add_scalar_float_op(
    {{custom_operators::add_scalar_float, {}}},
    [](const popart::OpCreatorInfo &creator_info) {
      const float scalar_attr =
          creator_info.attributes.getAttribute<popart::Attributes::Float>(
              "scalar", 0.0f);

      std::unique_ptr<popart::Op> created(new AddScalarFloatOp(
          creator_info.opid, scalar_attr, creator_info.settings));
      return created;
    },
    true);

// Poplar implementation of AddScalarFloatOp: output = input + scalar.
class AddScalarFloatOpx : public popart::popx::Opx {
public:
  AddScalarFloatOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    // verifyOp runs first; the cast below is dereferenced without a null
    // check.
    verifyOp<AddScalarFloatOp>(op, custom_operators::add_scalar_float);
    auto *scalar_op = dynamic_cast<AddScalarFloatOp *>(op);
    _scalar = scalar_op->getScalar();
  }

  void grow(poplar::program::Sequence &prog) const override {
    auto input = getInTensor(0);

    // One-element constant in the input's element type, mapped to tile 0.
    auto scalar_const =
        graph().addConstant(input.elementType(), {1}, _scalar, "scale_factor");
    graph().setTileMapping(scalar_const, 0);

    auto sum = popops::add(graph(), input, scalar_const, prog, debugContext());
    setOutTensor(0, sum);
  }

private:
  // Copied from the op when the Opx is constructed.
  float _scalar;
};

// Registers AddScalarFloatOpx as the Opx for the add_scalar_float identifier.
static popart::popx::OpxCreator<AddScalarFloatOpx>
    add_scalar_float_opx_creator(custom_operators::add_scalar_float);

namespace custom_operators {
// Identifier of the integer "add scalar" test op: domain "test.poptorch",
// type "AddScalarInt".
// NOTE(review): the trailing 1, 1, 1 are presumably the version and the
// number of inputs/outputs — confirm against popart::OperatorIdentifier.
const popart::OperatorIdentifier add_scalar_int = {
    "test.poptorch", "AddScalarInt", 1, 1, 1}; // NOLINT
} // namespace custom_operators

class AddScalarIntOp;
class AddScalarIntOpx;

class AddScalarIntOp : public popart::Op {
public:
  AddScalarIntOp(const popart::OperatorIdentifier &_opid, std::int64_t scalar,
                 const popart::Op::Settings &settings_)
      : popart::Op(_opid, settings_), _scalar(scalar) {}

  void setup() override { outInfo(0) = inInfo(0); }

  std::unique_ptr<popart::Op> clone() const final {
    return std::unique_ptr<popart::Op>(new AddScalarIntOp(*this));
  }

  std::int64_t getScalar() { return _scalar; }

  // An estimate of how valuable sub-graph matching will be
  float getSubgraphValue() const final { return getLowSubgraphValue(); }

private:
  std::int64_t _scalar;
};

// Factory registration for AddScalarIntOp: the lambda reads the "scalar"
// integer attribute (0 is passed as the second argument, presumably the
// default) and builds the op from it.
popart::OpCreator<AddScalarIntOp> add_scalar_int_op(
    {{custom_operators::add_scalar_int, {}}},
    [](const popart::OpCreatorInfo &creator_info) {
      auto scalar_attr =
          creator_info.attributes.getAttribute<popart::Attributes::Int>(
              "scalar", 0);

      std::unique_ptr<popart::Op> created(new AddScalarIntOp(
          creator_info.opid, scalar_attr, creator_info.settings));
      return created;
    },
    true);

// Poplar implementation of AddScalarIntOp: output = input + scalar.
class AddScalarIntOpx : public popart::popx::Opx {
public:
  AddScalarIntOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    // verifyOp runs first; the cast below is dereferenced without a null
    // check.
    verifyOp<AddScalarIntOp>(op, custom_operators::add_scalar_int);
    auto *scalar_op = dynamic_cast<AddScalarIntOp *>(op);
    _scalar = scalar_op->getScalar();
  }

  void grow(poplar::program::Sequence &prog) const override {
    auto input = getInTensor(0);

    // One-element constant in the input's element type, mapped to tile 0.
    auto scalar_const =
        graph().addConstant(input.elementType(), {1}, _scalar, "scale_factor");
    graph().setTileMapping(scalar_const, 0);

    auto sum = popops::add(graph(), input, scalar_const, prog, debugContext());
    setOutTensor(0, sum);
  }

private:
  // Copied from the op when the Opx is constructed.
  int64_t _scalar;
};

// Registers AddScalarIntOpx as the Opx for the add_scalar_int identifier.
static popart::popx::OpxCreator<AddScalarIntOpx>
    add_scalar_int_opx_creator(custom_operators::add_scalar_int);

} // extern "C"


================================================
FILE: tests/custom_ops/custom_add_scalar_vec_op.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.

// This tests the use of the int_64/float list attributes

#include <vector>

#include <popart/op.hpp>
#include <popart/op/identity.hpp>
#include <popart/operators.hpp>
#include <popart/opmanager.hpp>
#include <popart/popx/opx.hpp>
#include <popart/popx/opxmanager.hpp>

#include <popops/ElementWise.hpp>

// Use extern to avoid mangled names when importing to python
extern "C" {

namespace custom_operators {
// Identifier of the float-list test op: domain "test.poptorch",
// type "AddScalarVecFloat".
// NOTE(review): the trailing 1, 1, 1 are presumably the version and the
// number of inputs/outputs — confirm against popart::OperatorIdentifier.
const popart::OperatorIdentifier add_scalar_vec_float = {
    "test.poptorch", "AddScalarVecFloat", 1, 1, 1}; // NOLINT
} // namespace custom_operators

class AddScalarVecFloatOp;
class AddScalarVecFloatOpx;

class AddScalarVecFloatOp : public popart::Op {
public:
  AddScalarVecFloatOp(const popart::OperatorIdentifier &_opid,
                      std::vector<float> vec,
                      const popart::Op::Settings &settings_)
      : popart::Op(_opid, settings_), _vec(std::move(vec)) {}

  void setup() override {
    if (inInfo(0).shape().size() != 1) {
      exit(1);
    }
    if (static_cast<std::size_t>(inInfo(0).shape()[0]) != _vec.size()) {
      exit(1);
    }
    outInfo(0) = inInfo(0);
  }

  std::unique_ptr<popart::Op> clone() const final {
    return std::unique_ptr<popart::Op>(new AddScalarVecFloatOp(*this));
  }

  const std::vector<float> &getVec() { return _vec; }

  // An estimate of how valuable sub-graph matching will be
  float getSubgraphValue() const final { return getLowSubgraphValue(); }

private:
  std::vector<float> _vec;
};

// Factory registration for AddScalarVecFloatOp: the lambda reads the "vec"
// float-list attribute and builds the op from it.
popart::OpCreator<AddScalarVecFloatOp> add_scalar_vec_float_op(
    {{custom_operators::add_scalar_vec_float, {}}},
    [](const popart::OpCreatorInfo &creator_info) {
      const std::vector<float> values =
          creator_info.attributes.getAttribute<popart::Attributes::Floats>(
              "vec");

      std::unique_ptr<popart::Op> created(new AddScalarVecFloatOp(
          creator_info.opid, values, creator_info.settings));
      return created;
    },
    true);

// Poplar implementation of AddScalarVecFloatOp: output = input + vec.
class AddScalarVecFloatOpx : public popart::popx::Opx {
public:
  AddScalarVecFloatOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    // verifyOp runs first; the cast below is dereferenced without a null
    // check.
    verifyOp<AddScalarVecFloatOp>(op, custom_operators::add_scalar_vec_float);
    auto *vec_op = dynamic_cast<AddScalarVecFloatOp *>(op);
    _vec = vec_op->getVec();
  }

  void grow(poplar::program::Sequence &prog) const override {
    auto input = getInTensor(0);

    // FLOAT constant with one element per list entry, mapped to tile 0.
    auto vec_const = graph().addConstant(
        poplar::FLOAT, {_vec.size()},
        poplar::ArrayRef<float>(_vec.data(), _vec.size()), "vec");
    graph().setTileMapping(vec_const, 0);

    auto sum = popops::add(graph(), input, vec_const, prog, debugContext());
    setOutTensor(0, sum);
  }

private:
  // Copied from the op when the Opx is constructed.
  std::vector<float> _vec;
};

// Registers AddScalarVecFloatOpx as the Opx for add_scalar_vec_float.
static popart::popx::OpxCreator<AddScalarVecFloatOpx>
    add_scalar_vec_float_opx_creator(custom_operators::add_scalar_vec_float);

namespace custom_operators {
// Identifier of the integer-list test op: domain "test.poptorch",
// type "AddScalarVecInt".
// NOTE(review): the trailing 1, 1, 1 are presumably the version and the
// number of inputs/outputs — confirm against popart::OperatorIdentifier.
const popart::OperatorIdentifier add_scalar_vec_int = {
    "test.poptorch", "AddScalarVecInt", 1, 1, 1}; // NOLINT
} // namespace custom_operators

class AddScalarVecIntOp;
class AddScalarVecIntOpx;

class AddScalarVecIntOp : public popart::Op {
public:
  AddScalarVecIntOp(const popart::OperatorIdentifier &_opid,
                    std::vector<int64_t> vec,
                    const popart::Op::Settings &settings_)
      : popart::Op(_opid, settings_), _vec(std::move(vec)) {}

  void setup() override {
    if (inInfo(0).shape().size() != 1) {
      exit(1);
    }
    if (static_cast<std::size_t>(inInfo(0).shape()[0]) != _vec.size()) {
      exit(1);
    }
    outInfo(0) = inInfo(0);
  }

  std::unique_ptr<popart::Op> clone() const final {
    return std::unique_ptr<popart::Op>(new AddScalarVecIntOp(*this));
  }

  const std::vector<int64_t> &getVec() { return _vec; }

  // An estimate of how valuable sub-graph matching will be
  float getSubgraphValue() const final { return getLowSubgraphValue(); }

private:
  std::vector<int64_t> _vec;
};

// Factory registration for AddScalarVecIntOp: the lambda reads the "vec"
// integer-list attribute and builds the op from it.
popart::OpCreator<AddScalarVecIntOp> add_scalar_vec_int_op(
    {{custom_operators::add_scalar_vec_int, {}}},
    [](const popart::OpCreatorInfo &creator_info) {
      const std::vector<int64_t> values =
          creator_info.attributes.getAttribute<popart::Attributes::Ints>(
              "vec");

      std::unique_ptr<popart::Op> created(new AddScalarVecIntOp(
          creator_info.opid, values, creator_info.settings));
      return created;
    },
    true);

// Poplar implementation of AddScalarVecIntOp: output = input + vec.
class AddScalarVecIntOpx : public popart::popx::Opx {
public:
  AddScalarVecIntOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    // verifyOp runs first; the cast below is dereferenced without a null
    // check.
    verifyOp<AddScalarVecIntOp>(op, custom_operators::add_scalar_vec_int);
    auto *vec_op = dynamic_cast<AddScalarVecIntOp *>(op);
    _vec = vec_op->getVec();
  }

  void grow(poplar::program::Sequence &prog) const override {
    auto input = getInTensor(0);

    // INT constant with one element per list entry, mapped to tile 0. The
    // values are supplied as 64-bit integers.
    auto vec_const = graph().addConstant(
        poplar::INT, {_vec.size()},
        poplar::ArrayRef<int64_t>(_vec.data(), _vec.size()), "vec");
    graph().setTileMapping(vec_const, 0);

    auto sum = popops::add(graph(), input, vec_const, prog, debugContext());
    setOutTensor(0, sum);
  }

private:
  // Copied from the op when the Opx is constructed.
  std::vector<int64_t> _vec;
};

// Registers AddScalarVecIntOpx as the Opx for add_scalar_vec_int.
static popart::popx::OpxCreator<AddScalarVecIntOpx>
    add_scalar_vec_int_opx_creator(custom_operators::add_scalar_vec_int);

} // extern "C"


================================================
FILE: tests/custom_ops/custom_add_vec_scalar_mul_op.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.

// This tests the combined use of a float attribute and a float list attribute

#include <popart/op.hpp>
#include <popart/op/identity.hpp>
#include <popart/operators.hpp>
#include <popart/opmanager.hpp>
#include <popart/popx/opx.hpp>
#include <popart/popx/opxmanager.hpp>

#include <popops/ElementWise.hpp>

// Use extern to avoid mangled names when importing to python
extern "C" {

namespace custom_operators {
// Identifier of the "add vec then multiply by scalar" test op: domain
// "test.poptorch", type "AddVecScalarMulFloat".
// NOTE(review): the trailing 1, 1, 1 are presumably the version and the
// number of inputs/outputs — confirm against popart::OperatorIdentifier.
const popart::OperatorIdentifier add_vec_scalar_mul_float = {
    "test.poptorch", "AddVecScalarMulFloat", 1, 1, 1}; // NOLINT
} // namespace custom_operators

class AddVecScalarMulFloatOp;
class AddVecScalarMulFloatOpx;

// Add the vec and multiply by the scalar
class AddVecScalarMulFloatOp : public popart::Op {
public:
  AddVecScalarMulFloatOp(const popart::OperatorIdentifier &_opid, float scalar,
                         std::vector<float> vec,
                         const popart::Op::Settings &settings_)
      : popart::Op(_opid, settings_), _scalar(scalar), _vec(std::move(vec)) {}

  void setup() override {
    if (inInfo(0).shape().size() != 1) {
      exit(1);
    }
    if (static_cast<std::size_t>(inInfo(0).shape()[0]) != _vec.size()) {
      exit(1);
    }
    outInfo(0) = inInfo(0);
  }

  std::unique_ptr<popart::Op> clone() const final {
    return std::unique_ptr<popart::Op>(new AddVecScalarMulFloatOp(*this));
  }

  float getScalar() { return _scalar; }

  const std::vector<float> &getVec() { return _vec; }

  // An estimate of how valuable sub-graph matching will be
  float getSubgraphValue() const final { return getLowSubgraphValue(); }

private:
  float _scalar;
  std::vector<float> _vec;
};

// Factory registration for AddVecScalarMulFloatOp: the lambda reads the
// "scalar" float attribute (0.0f is passed as the second argument, presumably
// the default) and the "vec" float-list attribute, then builds the op.
popart::OpCreator<AddVecScalarMulFloatOp> add_vec_scalar_mul_float_op(
    {{custom_operators::add_vec_scalar_mul_float, {}}},
    [](const popart::OpCreatorInfo &creator_info) {
      const float scalar_attr =
          creator_info.attributes.getAttribute<popart::Attributes::Float>(
              "scalar", 0.0f);
      const std::vector<float> values =
          creator_info.attributes.getAttribute<popart::Attributes::Floats>(
              "vec");

      std::unique_ptr<popart::Op> created(new AddVecScalarMulFloatOp(
          creator_info.opid, scalar_attr, values, creator_info.settings));
      return created;
    },
    true);

// Poplar implementation of AddVecScalarMulFloatOp:
// output = (input + vec) * scalar.
class AddVecScalarMulFloatOpx : public popart::popx::Opx {
public:
  AddVecScalarMulFloatOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp<AddVecScalarMulFloatOp>(
        op, custom_operators::add_vec_scalar_mul_float);
    // Cast once and read both attributes from the same pointer.
    auto *mul_op = dynamic_cast<AddVecScalarMulFloatOp *>(op);
    _scalar = mul_op->getScalar();
    _vec = mul_op->getVec();
  }

  void grow(poplar::program::Sequence &prog) const override {
    auto in_tensor = getInTensor(0);

    // FLOAT constant with one element per list entry, mapped to tile 0.
    auto vec_tensor = graph().addConstant(
        poplar::FLOAT, {_vec.size()},
        poplar::ArrayRef<float>(_vec.data(), _vec.size()), "vec");
    graph().setTileMapping(vec_tensor, 0);

    auto added_tensor =
        popops::add(graph(), in_tensor, vec_tensor, prog, debugContext());

    // One-element FLOAT constant holding the scalar, mapped to tile 0.
    auto scalar_tensor =
        graph().addConstant(poplar::FLOAT, {1}, _scalar, "scale_factor");
    graph().setTileMapping(scalar_tensor, 0);

    auto out_tensor =
        popops::mul(graph(), added_tensor, scalar_tensor, prog, debugContext());
    setOutTensor(0, out_tensor);
  }

private:
  // Both values are copied from the op when the Opx is constructed.
  float _scalar;
  std::vector<float> _vec;
};

// Registers AddVecScalarMulFloatOpx for add_vec_scalar_mul_float. The
// variable is named after this op rather than the add-scalar op it was
// copied from; it has internal linkage and is not referenced elsewhere.
static popart::popx::OpxCreator<AddVecScalarMulFloatOpx>
    add_vec_scalar_mul_float_opx_creator(
        custom_operators::add_vec_scalar_mul_float);

} // extern "C"


================================================
FILE: tests/custom_ops/custom_cube_op.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

// This file has been lifted directly from the PopART examples. See file there
// for usage. Modified to take in and return two tensors.

#include <memory>
#include <popart/builder.hpp>
#include <popart/devicemanager.hpp>

#include <popart/ir.hpp>

#include <popart/logging.hpp>
#include <popart/ndarraywrapper.hpp>
#include <popart/op.hpp>
#include <popart/op/l1.hpp>
#include <popart/opmanager.hpp>
#include <popart/opserialiser.hpp>
#include <popart/optimizer.hpp>
#include <popart/patterns/pattern.hpp>
#include <popart/popx/opx.hpp>
#include <popart/popx/opxmanager.hpp>
#include <popart/session.hpp>
#include <popart/shapeinference.hpp>
#include <popart/tensordata.hpp>
#include <popart/tensorinfo.hpp>
#include <popart/tensornames.hpp>

#include <poprand/RandomGen.hpp>

#include <popops/ElementWise.hpp>

#include <popart/names.hpp>
#include <popart/operators.hpp>

namespace {

// Local stand-in for std::make_unique (kept for C++11 compatibility):
// constructs a T from the forwarded arguments and wraps it in a unique_ptr.
template <typename T, typename... Args>
std::unique_ptr<T> make_unique(Args &&...args) {
  T *raw = new T(std::forward<Args>(args)...);
  return std::unique_ptr<T>(raw);
}

} // namespace

// Use extern to avoid mangled names when importing to python
extern "C" {

namespace CustomOperators {
// Identifier of the forward Cube op: domain "com.acme", type "Cube".
// NOTE(review): 1 is presumably the version and {2, 2} the allowed input
// count range — confirm against popart::OperatorIdentifier.
const popart::OperatorIdentifier Cube = {
    "com.acme", "Cube", 1, {2, 2}}; // NOLINT
} // namespace CustomOperators
namespace CustomGradOperators {
// Identifier of the gradient op: domain "com.acme", type "CubeGrad".
const static popart::OperatorIdentifier CubeGrad = { // NOLINT
    "com.acme",
    "CubeGrad",
    1,
    {2, 2}};
} // namespace CustomGradOperators

// For training with a custom Op, four classes need to be implemented,
// one for each of:
// {forward, gradient} x {Op, Opx}.
//
// If only inference is required, then two classes need to be implemented:
// {forward} x {Op, Opx}.
//
// The Op is a poplar/hardware agnostic description of the computation.
// the Opx is the poplar implementation of the Op.
//
// We do training in this example, so the four classes implemented are:
//
class CubeOp;
class CubeGradOp;
class CubeOpx;
class CubeGradOpx;

// The forward Op

class CubeOp : public popart::Op {
public:
  CubeOp(const popart::OperatorIdentifier &_opid,
         const popart::Op::Settings &settings_)
      : popart::Op(_opid, settings_) {}

  // Configure the output popart Tensor
  void setup() override {
    outInfo(0) = inInfo(0);
    outInfo(1) = inInfo(1);
  }

  std::unique_ptr<Op> clone() const final { return make_unique<CubeOp>(*this); }
  std::vector<std::unique_ptr<Op>> getGradOps() override {
    std::vector<std::unique_ptr<Op>> upops;
    upops.emplace_back(make_unique<CubeGradOp>(*this));
    return upops;
  }

  // An estimate of how valuable sub-graph matching will be
  float getSubgraphValue() const final { return getLowSubgraphValue(); }
};

// Registers CubeOp for the Cube identifier; no custom factory lambda is
// supplied here, unlike the attribute-carrying test ops.
static popart::OpCreator<CubeOp> cubeOpCreator({{CustomOperators::Cube, {}}},
                                               true);

// The forward Opx (poplar implementation of the forward Op)

// Poplar implementation of CubeOp.
//   output 0 = in0 * in0 * in0 + in1
//   output 1 = in0 * in0 * in0
class CubeOpx : public popart::popx::Opx {
public:
  CubeOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    // Not strictly necessary, we check that op is castable to a CubeOp *.
    verifyOp<CubeOp>(op, CustomOperators::Cube);
  }

  void grow(poplar::program::Sequence &prog) const override {
    namespace pe = popops::expr;

    // _1 is the first tensor handed to map(), _2 the second.
    auto cubed = pe::Mul(pe::Mul(pe::_1, pe::_1), pe::_1);

    auto cube_plus_second =
        popops::map(graph(), pe::Add(cubed, pe::_2),
                    {getInTensor(0), getInTensor(1)}, prog, debugContext());
    setOutTensor(0, cube_plus_second);

    auto cube_only =
        popops::map(graph(), cubed, {getInTensor(0)}, prog, debugContext());
    setOutTensor(1, cube_only);
  }
};

// The gradient Op
class CubeGradOp : public popart::Op {
public:
  explicit CubeGradOp(const popart::Op &fwdOp)
      : popart::Op(CustomGradOperators::CubeGrad, fwdOp.getSettings()) {}

  std::unique_ptr<Op> clone() const final {
    return make_unique<CubeGradOp>(*this);
  }

  // same comment as for CubeOp, for running shape/type inference "statically"
  void setup() override { outInfo(0) = inInfo(0); }

  // function describing the inputs and output(s) of CubeGradOp
  // The Gradient Op which we are implementing (CubeGradOp) has 2 inputs.
  // The input at index 0 is:
  // the gradient of the 0'th output Tensor of the CubeOp.
  // The input at index 1 is :
  // the 0'th output Tensor of the CubeOp.
  // Supposing the CubeOp has input Tensor T0 and output Tensor T1,
  //
  //  input at index 0 (T0)
  //         |
  //       CubeOp
  //         |
  //  output at index 0 (T1)
  //
  // Then the picture described by the map below looks like,
  //
  //
  //   input at index 0 (gradient of T1)
  //        |   input at index 1 (T1)
  //        |     |
  //        |     |
  //       CubeGradOp
  //           |
  //           |
  //  output at index 0 (gradient of T0)
  //
  const std::vector<popart::GradInOutMapper> &gradInputInfo() const override {
    static const std::vector<popart::GradInOutMapper> inInfo = {
        {0, 0, popart::GradOpInType::GradOut},
        {1, 1, popart::GradOpInType::Out}};
    return inInfo;
  }
  const std::map<int, int> &gradOutToNonGradIn() const override {
    static const std::map<int, int> outInfo = {{0, 0}};
    return outInfo;
  }

  // an estimate of how valuable sub-graph matching will be
  float getSubgraphValue() const final { return getLowSubgraphValue(); }
};

// Poplar implementation of CubeGradOp.
class CubeGradOpx : public popart::popx::Opx {
public:
  CubeGradOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp<CubeGradOp>(op, CustomGradOperators::CubeGrad);
  }

  // Writes 3 * in0 * in0 * in1 to output 0, where in0 / in1 are this Opx's
  // inputs 0 and 1.
  // NOTE(review): the intended formula is presumably
  // 3 * input_to_cube**2 * gradient_of_cube_output, but
  // CubeGradOp::gradInputInfo() feeds GradOut as input 0 and a forward output
  // as input 1 — confirm which tensor is meant to be squared.
  void grow(poplar::program::Sequence &prog) const final {
    namespace pe = popops::expr;

    auto gradient_expr =
        pe::Mul(pe::Const(3), pe::Mul(pe::Mul(pe::_1, pe::_1), pe::_2));

    auto gradient = popops::map(graph(), gradient_expr,
                                {getInTensor(0), getInTensor(1)}, prog,
                                debugContext());
    insert(outId(0), gradient);
  }
};

// Register the Poplar implementations for the forward and gradient
// identifiers.
static popart::popx::OpxCreator<CubeOpx> cubeOpxCreator(CustomOperators::Cube);
static popart::popx::OpxCreator<CubeGradOpx>
    cubeGradOpxCreator(CustomGradOperators::CubeGrad);
}

// Shape inference for Cube: output 0 takes the tensor info of input 0.
// NOTE(review): CubeOp::setup() also sets output 1 from input 1, but this
// function only fills in output 0 — confirm whether that is intentional.
static popart::RegisterShapeInferenceFunction
    cubeOpShapeInference(CustomOperators::Cube,
                         [](auto &ctx) { ctx.outInfo(0) = ctx.inInfo(0); });


================================================
FILE: tests/custom_ops/custom_leaky_relu_op.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.

// This file is based on the example in the PopART User Guide:
// https://docs.sourcevertex.net/files/popart-popart-user-guide-latest/custom_ops.html

#include <memory>

#include <popart/op.hpp>

#include <popart/opmanager.hpp>
#include <popart/opserialiser.hpp>
#include <popart/popx/opx.hpp>
#include <popart/popx/opxmanager.hpp>

#include <popops/ElementWise.hpp>

namespace {

// Local stand-in for std::make_unique (kept for C++11 compatibility):
// constructs a T from the forwarded arguments and wraps it in a unique_ptr.
template <typename T, typename... Args>
std::unique_ptr<T> makeUnique(Args &&...args) {
  T *raw = new T(std::forward<Args>(args)...);
  return std::unique_ptr<T>(raw);
}

} // namespace

// Use extern to avoid mangled names when importing to python
extern "C" {

namespace custom_operators {
// Identifier of the forward op: domain "com.acme", type "LeakyRelu".
// NOTE(review): 1 is presumably the version and {1, 1} the allowed input
// count range — confirm against popart::OperatorIdentifier.
const popart::OperatorIdentifier leaky_relu = {
    "com.acme", "LeakyRelu", 1, {1, 1}}; // NOLINT
} // namespace custom_operators

namespace custom_grad_operators {
// Identifier of the gradient op: domain "com.acme", type "LeakyReluGrad".
const static popart::OperatorIdentifier LeakyReluGrad = { // NOLINT
    "com.acme",
    "LeakyReluGrad",
    1,
    {1, 1}};
} // namespace custom_grad_operators

class LeakyReluGradOp;

// Forward leaky-ReLU op; stores the slope ("alpha") applied to negative
// values by LeakyReluOpx.
class LeakyReluOp : public popart::Op {
public:
  LeakyReluOp(const popart::OperatorIdentifier &_opid, float alpha_,
              const popart::Op::Settings &settings_)
      : popart::Op(_opid, settings_), _alpha(alpha_) {}

  // Attributes
  float getAlpha() const { return _alpha; }

  std::unique_ptr<Op> clone() const final {
    return makeUnique<LeakyReluOp>(*this);
  }

  // The output takes the tensor info of the input unchanged.
  void setup() final { outInfo(0) = inInfo(0); }

  // Both serialisers append "alpha" on top of the base-class attributes.
  void appendAttributes(popart::OpSerialiserBase &os) const override {
    Op::appendAttributes(os);
    os.appendAttribute("alpha", getAlpha());
  }

  void appendOutlineAttributes(popart::OpSerialiserBase &os) const override {
    Op::appendOutlineAttributes(os);
    os.appendAttribute("alpha", getAlpha());
  }

  // Returns a single LeakyReluGradOp built from this op.
  std::vector<std::unique_ptr<popart::Op>> getGradOps() override {
    std::vector<std::unique_ptr<Op>> grad_ops;
    grad_ops.emplace_back(makeUnique<LeakyReluGradOp>(*this));
    return grad_ops;
  }

  float getSubgraphValue() const final { return getHighSubgraphValue(); }

  bool requiresRandomSeed() const override { return false; }

private:
  float _alpha;
};

// Data types listed for the op's input and output in the definition below.
static popart::OpDefinition::DataTypes t = {popart::DataType::FLOAT16,
                                            popart::DataType::FLOAT};

// Op definition: one input named "input", one output named "output" (both
// using the types in t) and one attribute named "alpha".
static popart::OpDefinition
    leaky_relu_op_def({popart::OpDefinition::Inputs({{"input", t}}),
                       popart::OpDefinition::Outputs({{"output", t}}),
                       popart::OpDefinition::Attributes({{"alpha", {"*"}}})});

// Factory registration for LeakyReluOp: the lambda reads the "alpha" float
// attribute and builds the op from it.
static popart::OpCreator<LeakyReluOp> leaky_relu_op_creator(
    popart::OpDefinitions({{custom_operators::leaky_relu, leaky_relu_op_def}}),
    [](const popart::OpCreatorInfo &info) {
      // 1e-2f (i.e. 10**(-2)) is passed as the fallback value for alpha.
      float alpha = info.attributes.getAttribute<popart::Attributes::Float>(
          "alpha", 1e-2f);

      return makeUnique<LeakyReluOp>(info.opid, alpha, info.settings);
    },
    true);

// Poplar implementation of LeakyReluOp: output = x < 0 ? alpha * x : x.
class LeakyReluOpx : public popart::popx::Opx {
public:
  LeakyReluOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp<LeakyReluOp>(op, {custom_operators::leaky_relu});
  }

  void grow(poplar::program::Sequence &prog) const final {
    // Bind by reference: only the attribute is needed, no copy of the Op.
    const auto &op = getOp<LeakyReluOp>();

    poplar::Tensor input = getInTensor(0);

    const float alpha = op.getAlpha();

    // x < 0.0f ? alpha * x : x
    auto expression = popops::expr::Select(
        popops::expr::Mul(popops::expr::Const(alpha), popops::expr::_1),
        popops::expr::_1,
        popops::expr::Lt(popops::expr::_1, popops::expr::Const(0.0f)));

    // Write the result to a new tensor. LeakyReluOp does not declare itself
    // as modifying its input, and LeakyReluGradOp consumes that same input,
    // so the input tensor must not be overwritten here.
    poplar::Tensor output =
        popops::map(graph(), expression, {input}, prog,
                    debugContext("LeakyRelu"), poplar::OptionFlags());

    setOutTensor(0, output);
  }
};

// Registers LeakyReluOpx as the Opx for the LeakyRelu identifier. The
// variable has internal linkage and is not referenced elsewhere, so it is
// named after this op rather than the add-scalar op it was copied from.
static popart::popx::OpxCreator<LeakyReluOpx>
    leaky_relu_fwd_opx_creator(custom_operators::leaky_relu);

class LeakyReluGradOp : public popart::Op {
public:
  explicit LeakyReluGradOp(const LeakyReluOp &fwdOp)
      : popart::Op(custom_grad_operators::LeakyReluGrad, fwdOp.settings),
        _alpha(fwdOp.getAlpha()) {}

  std::unique_ptr<popart::Op> clone() const final {
    return std::make_unique<LeakyReluGradOp>(*this);
  }
  void setup() final { outInfo(0) = inInfo(0); };

  const std::vector<popart::GradInOutMapper> &gradInputInfo() const override {
    static const std::vector<popart::GradInOutMapper> in_info = {
        {0, 0, popart::GradOpInType::GradOut},
        {1, 0, popart::GradOpInType::In}};
    return in_info;
  }

  // The Grad Op has 1 output, which is the gradient of the only input
  const std::map<int, int> &gradOutToNonGradIn() const override {
    static const std::map<int, int> out_info = {{0, 0}};
    return out_info;
  }

  bool requiresRandomSeed() const override { return false; }

  // an estimate of how valuable sub-graph matching will be
  float getSubgraphValue() const final { return getHighSubgraphValue(); }

  float getAlpha() const { return _alpha; }

  // Implementation defined below
  void appendAttributes(popart::OpSerialiserBase &os) const override {
    Op::appendAttributes(os);
    os.appendAttribute("alpha", getAlpha());
  }

  // Implementation defined below
  void appendOutlineAttributes(popart::OpSerialiserBase &os) const override {
    Op::appendOutlineAttributes(os);
    os.appendAttribute("alpha", getAlpha());
  }

private:
  float _alpha;
};

// Poplar implementation of LeakyReluGradOp:
// output = grad * (x < 0 ? alpha : 1).
class LeakyReluGradOpx : public popart::popx::Opx {
public:
  LeakyReluGradOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp<LeakyReluGradOp>(op, {custom_grad_operators::LeakyReluGrad});
  }

  void grow(poplar::program::Sequence &prog) const final {
    // Bind by reference: only the attribute is needed, no copy of the Op.
    const auto &op = getOp<LeakyReluGradOp>();

    // Input 0 is used as the incoming gradient, input 1 as the value whose
    // sign selects the factor.
    poplar::Tensor grad = getInTensor(0);
    poplar::Tensor input = getInTensor(1);

    const float alpha = op.getAlpha();

    // (grad * (x < 0.0f ? alpha : 1))
    auto expression = popops::expr::Mul(
        popops::expr::Select(
            popops::expr::Const(alpha), popops::expr::Const(1.0f),
            popops::expr::Lt(popops::expr::_2, popops::expr::Const(0.0f))),
        popops::expr::_1);

    auto output =
        popops::map(graph(), expression, {grad, input}, prog,
                    debugContext("LeakyReluGrad"), poplar::OptionFlags());

    setOutTensor(0, output);
  }
};

// LeakyReluOpx is registered directly after its class definition, so only the
// gradient Opx is registered here.
static popart::popx::OpxCreator<LeakyReluGradOpx>
    leaky_relu_grad_opx_creator(custom_grad_operators::LeakyReluGrad);

} // extern "C"


================================================
FILE: tests/custom_ops/custom_many_attribute_op.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.

// This tests the use of many attributes

#include <popart/op.hpp>
#include <popart/operators.hpp>
#include <popart/opmanager.hpp>
#include <popart/popx/opx.hpp>
#include <popart/popx/opxmanager.hpp>

#include <popops/ElementWise.hpp>

// Use extern to avoid mangled names when importing to python
extern "C" {

namespace custom_operators {
// Identifier of the many-attribute test op: domain "test.poptorch",
// type "ManyAttributeOp".
// NOTE(review): the trailing 1, 1, 1 are presumably the version and the
// number of inputs/outputs — confirm against popart::OperatorIdentifier.
const popart::OperatorIdentifier many_attribute = {
    "test.poptorch", "ManyAttributeOp", 1, 1, 1}; // NOLINT
} // namespace custom_operators

class ManyAttributeOp;
class ManyAttributeOpx;

// Adds one if all attributes in the creator were correct, otherwise acts
// as an identity function
class ManyAttributeOp : public popart::Op {
public:
  ManyAttributeOp(const popart::OperatorIdentifier &_opid, bool all_passed,
                  const popart::Op::Settings &settings_)
      : popart::Op(_opid, settings_), _all_passed(all_passed) {}

  void setup() override { outInfo(0) = inInfo(0); }

  std::unique_ptr<popart::Op> clone() const final {
    return std::unique_ptr<popart::Op>(new ManyAttributeOp(*this));
  }

  bool allPassed() { return _all_passed; }

  // An estimate of how valuable sub-graph matching will be
  float getSubgraphValue() const final { return getLowSubgraphValue(); }

private:
  bool _all_passed;
};

// Factory for ManyAttributeOp. It compares every attribute with the value the
// test is expected to send and records the verdict in the created op. The
// checks short-circuit in the order written, so a later attribute is only
// read once all earlier ones have matched.
popart::OpCreator<ManyAttributeOp> many_attribute_op(
    {{custom_operators::many_attribute, {}}},
    [](const popart::OpCreatorInfo &info) {
      using Attrs = popart::Attributes;
      const auto &attrs = info.attributes;

      // Have 2 of each kind of attribute
      const bool correct =
          attrs.getAttribute<Attrs::Float>("float_one") == 1.0 &&
          attrs.getAttribute<Attrs::Float>("float_minus_two") == -2.0 &&
          attrs.getAttribute<Attrs::Int>("int_zero") == 0 &&
          attrs.getAttribute<Attrs::Int>("int_minus_five") == -5 &&
          attrs.getAttribute<Attrs::Floats>("floats_one_two_three") ==
              std::vector<float>{1.0, 2.0, 3.0} &&
          attrs.getAttribute<Attrs::Floats>("floats_minus_one_two_three") ==
              std::vector<float>{-1.0, -2.0, -3.0} &&
          attrs.getAttribute<Attrs::Ints>("ints_one_two_three") ==
              std::vector<int64_t>{1, 2, 3} &&
          attrs.getAttribute<Attrs::Ints>("ints_minus_one_two_three") ==
              std::vector<int64_t>{-1, -2, -3} &&
          attrs.getAttribute<Attrs::String>("a_string") ==
              "string with quotes and slash \" ' \\ end" &&
          attrs.getAttribute<Attrs::Strings>("strs") ==
              std::vector<std::string>{"\x01", "\x02", "\x03"};

      return std::unique_ptr<popart::Op>(
          new ManyAttributeOp(info.opid, correct, info.settings));
    },
    true);

// Poplar lowering of ManyAttributeOp: input plus one when the op reports that
// all attributes passed, the unmodified input otherwise.
class ManyAttributeOpx : public popart::popx::Opx {
public:
  ManyAttributeOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp<ManyAttributeOp>(op, custom_operators::many_attribute);
    auto *typed_op = dynamic_cast<ManyAttributeOp *>(op);
    _all_passed = typed_op->allPassed();
  }

  void grow(poplar::program::Sequence &prog) const override {
    auto input = getInTensor(0);

    // The constant is added to the graph on both paths, mapped to tile 0.
    auto one = graph().addConstant(input.elementType(), {1}, 1, "one");
    graph().setTileMapping(one, 0);

    if (!_all_passed) {
      // Identity: forward the input untouched.
      setOutTensor(0, input);
      return;
    }

    auto incremented = popops::add(graph(), input, one, prog, debugContext());
    setOutTensor(0, incremented);
  }

private:
  // Copied from the op at construction time.
  bool _all_passed;
};

// File-scope OpxCreator pairing the ManyAttributeOp identifier with
// ManyAttributeOpx.
// NOTE(review): registration presumably happens when this object is
// constructed at library load time — confirm against
// popart/popx/opxmanager.hpp.
static popart::popx::OpxCreator<ManyAttributeOpx>
    many_attributes_opx_creator(custom_operators::many_attribute);

} // extern "C"


================================================
FILE: tests/custom_ops/custom_reduce_op.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.

// This tests the use of the string attribute

#include <popart/op.hpp>
#include <popart/op/identity.hpp>
#include <popart/operators.hpp>
#include <popart/opmanager.hpp>
#include <popart/popx/opx.hpp>
#include <popart/popx/opxmanager.hpp>

#include <popops/Reduce.hpp>

// Use extern to avoid mangled names when importing to python
extern "C" {

// Identifier of the test op. The Python tests request it as
// poptorch.custom_op(..., "ReduceOp", "test.poptorch", 1, ...).
// NOTE(review): the two trailing integers look like the input and output
// counts — confirm against popart::OperatorIdentifier.
namespace custom_operators {
const popart::OperatorIdentifier reduce = {"test.poptorch", "ReduceOp", 1, 1,
                                           1}; // NOLINT
} // namespace custom_operators

// Forward declarations of the op and of its Poplar lowering.
class ReduceOp;
class ReduceOpx;

class ReduceOp : public popart::Op {
public:
  ReduceOp(const popart::OperatorIdentifier &_opid, std::string reduction,
           const popart::Op::Settings &settings_)
      : popart::Op(_opid, settings_), _reduction(std::move(reduction)) {}

  void setup() override {
    auto in_tensor = inInfo(0);
    popart::Shape out_shape({});
    outInfo(0).set(in_tensor.dataType(), out_shape);
  }

  std::unique_ptr<popart::Op> clone() const final {
    return std::unique_ptr<popart::Op>(new ReduceOp(*this));
  }

  std::string getReduction() { return _reduction; }

  // An estimate of how valuable sub-graph matching will be
  float getSubgraphValue() const final { return getLowSubgraphValue(); }

private:
  std::string _reduction;
};

// Factory for ReduceOp. The "reduction" string attribute is read with "mean"
// as the fallback value and handed to the op unchecked.
popart::OpCreator<ReduceOp> reduce_op(
    {{custom_operators::reduce, {}}},
    [](const popart::OpCreatorInfo &info) -> std::unique_ptr<popart::Op> {
      const auto &attrs = info.attributes;
      return std::make_unique<ReduceOp>(
          info.opid,
          attrs.getAttribute<popart::Attributes::String>("reduction", "mean"),
          info.settings);
    },
    true);

// Poplar lowering of ReduceOp: flattens the input and reduces it to a single
// value, either the sum or the mean of all elements.
class ReduceOpx : public popart::popx::Opx {
public:
  // Throws popart::error when the op carries a reduction other than "mean" or
  // "sum", instead of terminating the whole process without a diagnostic.
  ReduceOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp<ReduceOp>(op, custom_operators::reduce);

    // getOp performs the checked downcast, so the op is cast once and never
    // dereferenced through a null pointer.
    const std::string reduction = getOp<ReduceOp>().getReduction();

    if (reduction == "mean") {
      _mean = true;
    } else if (reduction == "sum") {
      _mean = false;
    } else {
      throw popart::error(
          "Unsupported reduction '{}' for ReduceOp: expected 'mean' or 'sum'",
          reduction);
    }
  }

  // Appends the reduction to `prog` and publishes the result as output 0.
  void grow(poplar::program::Sequence &prog) const override {
    const poplar::Tensor &in_tensor(getInTensor(0));
    auto in_tensor_1_d = in_tensor.flatten();

    // A mean is computed as a sum scaled by 1 / number of elements.
    double scale = 1.0;
    if (_mean) {
      scale /= in_tensor_1_d.dim(0);
    }

    auto scale_tensor = graph().addConstant(poplar::FLOAT, {}, scale, "scale");
    graph().setTileMapping(scale_tensor, 0);
    auto out_tensor =
        popops::reduce(graph(), in_tensor_1_d, {0},
                       {popops::Operation::ADD, false, scale_tensor}, prog,
                       debugContext("reduce"));

    setOutTensor(0, out_tensor);
  }

private:
  // Mean if true, otherwise sum
  bool _mean;
};

// File-scope OpxCreator pairing the ReduceOp identifier with ReduceOpx.
// NOTE(review): registration presumably happens when this object is
// constructed at library load time — confirm against
// popart/popx/opxmanager.hpp.
static popart::popx::OpxCreator<ReduceOpx>
    reduce_opx_creator(custom_operators::reduce);

} // extern "C"


================================================
FILE: tests/custom_ops/custom_three_input_reduce_op.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.

// This tests the use of the list of strings attribute

#include <popart/op.hpp>
#include <popart/operators.hpp>
#include <popart/opmanager.hpp>
#include <popart/popx/opx.hpp>
#include <popart/popx/opxmanager.hpp>

#include <popops/Reduce.hpp>

// Use extern to avoid mangled names when importing to python
extern "C" {

// Identifier of the test op. The Python tests request it as
// poptorch.custom_op(..., "ThreeReduceOp", "test.poptorch", 1, ...).
// NOTE(review): the two trailing integers look like the input and output
// counts (three each) — confirm against popart::OperatorIdentifier.
namespace custom_operators {
const popart::OperatorIdentifier three_reduce = {
    "test.poptorch", "ThreeReduceOp", 1, 3, 3}; // NOLINT
} // namespace custom_operators

// Forward declarations of the op and of its Poplar lowering.
class ThreeReduceOp;
class ThreeReduceOpx;

class ThreeReduceOp : public popart::Op {
public:
  ThreeReduceOp(const popart::OperatorIdentifier &_opid,
                std::vector<std::string> reductions,
                const popart::Op::Settings &settings_)
      : popart::Op(_opid, settings_), _reductions(std::move(reductions)) {}

  void setup() override {
    for (unsigned int i = 0; i < 3; i++) {
      auto in_tensor = inInfo(i);
      popart::Shape out_shape({});

      outInfo(i).set(in_tensor.dataType(), out_shape);
    }
  }

  std::unique_ptr<popart::Op> clone() const final {
    return std::unique_ptr<popart::Op>(new ThreeReduceOp(*this));
  }

  const std::vector<std::string> &getReductions() { return _reductions; }

  // An estimate of how valuable sub-graph matching will be
  float getSubgraphValue() const final { return getLowSubgraphValue(); }

private:
  const std::vector<std::string> _reductions;
};

// Factory for ThreeReduceOp. The "reductions" string-list attribute is read
// without a fallback value and handed to the op unchecked.
popart::OpCreator<ThreeReduceOp> three_reduce_op(
    {{custom_operators::three_reduce, {}}},
    [](const popart::OpCreatorInfo &info) -> std::unique_ptr<popart::Op> {
      const auto &attrs = info.attributes;
      auto reduction_names =
          attrs.getAttribute<popart::Attributes::Strings>("reductions");

      return std::make_unique<ThreeReduceOp>(info.opid, reduction_names,
                                             info.settings);
    },
    true);

// Poplar lowering of ThreeReduceOp: each of the three inputs is flattened and
// reduced to a single value, either its sum or its mean.
class ThreeReduceOpx : public popart::popx::Opx {
public:
  // Throws popart::error when fewer reductions than inputs were supplied
  // (grow() would otherwise read past the end of `_mean`) or when a reduction
  // is neither "mean" nor "sum". Surplus entries are accepted and ignored.
  ThreeReduceOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp<ThreeReduceOp>(op, custom_operators::three_reduce);

    // getOp performs the checked downcast; binding by reference avoids
    // copying the vector of names.
    const std::vector<std::string> &reductions =
        getOp<ThreeReduceOp>().getReductions();

    if (reductions.size() < num_inputs) {
      throw popart::error(
          "ThreeReduceOp needs {} reductions but only {} were provided",
          num_inputs, reductions.size());
    }

    _mean.reserve(reductions.size());
    for (const auto &reduction : reductions) {
      if (reduction == "mean") {
        _mean.push_back(true);
      } else if (reduction == "sum") {
        _mean.push_back(false);
      } else {
        throw popart::error("Unsupported reduction '{}' for ThreeReduceOp: "
                            "expected 'mean' or 'sum'",
                            reduction);
      }
    }
  }

  // Appends one reduction per input to `prog`; output i is the reduction of
  // input i.
  void grow(poplar::program::Sequence &prog) const override {
    for (unsigned int input_num = 0; input_num < num_inputs; input_num++) {
      const poplar::Tensor &in_tensor(getInTensor(input_num));
      auto in_tensor_1_d = in_tensor.flatten();

      // A mean is computed as a sum scaled by 1 / number of elements.
      double scale = 1.0;
      if (_mean[input_num]) {
        scale /= in_tensor_1_d.dim(0);
      }

      auto scale_tensor =
          graph().addConstant(poplar::FLOAT, {}, scale, "scale");
      graph().setTileMapping(scale_tensor, 0);
      auto out_tensor =
          popops::reduce(graph(), in_tensor_1_d, {0},
                         {popops::Operation::ADD, false, scale_tensor}, prog,
                         debugContext("thee_reduce"));

      setOutTensor(input_num, out_tensor);
    }
  }

private:
  // Number of inputs (and outputs) handled by grow().
  static constexpr unsigned int num_inputs = 3;

  // Mean if true, otherwise sum
  std::vector<bool> _mean;
};

// File-scope OpxCreator pairing the ThreeReduceOp identifier with
// ThreeReduceOpx.
// NOTE(review): registration presumably happens when this object is
// constructed at library load time — confirm against
// popart/popx/opxmanager.hpp.
static popart::popx::OpxCreator<ThreeReduceOpx>
    reduce_opx_creator(custom_operators::three_reduce);

} // extern "C"


================================================
FILE: tests/custom_ops_attributes_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import collections
import ctypes
import pathlib
import random
import sys

import pytest
import torch
import helpers
import poptorch

# Load every custom-op shared library found below "tests" at import time; the
# tests in this file request the ops by name through poptorch.custom_op. The
# search path is relative, so it is resolved against the current working
# directory.
myso = list(pathlib.Path("tests").rglob("libcustom_*.*"))
assert myso, "Failed to find libcustom_* libraries"
for single_so in myso:
    ctypes.cdll.LoadLibrary(single_so)


def test_float_attribute():
    """A float scalar attribute reaches the custom op.

    With ``scalar=3.5`` the test expects an input of 5.0 to come back as 8.5.
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            return poptorch.custom_op(
                [x],
                "AddScalarFloat",
                "test.poptorch",
                1,
                example_outputs=[x],
                attributes={"scalar": 3.5},
            )

    ipu_model = poptorch.inferenceModel(Model())
    result = ipu_model(torch.tensor([5.0]))

    want = torch.tensor([8.5])
    helpers.assert_allclose(actual=result[0], expected=want)


def test_float_attribute_too_low():
    """The most negative Python float is rejected as a float attribute."""
    lowest = -sys.float_info.max

    class Model(torch.nn.Module):
        def forward(self, x):
            return poptorch.custom_op(
                [x],
                "AddScalarFloat",
                "test.poptorch",
                1,
                example_outputs=[x],
                attributes={"scalar": lowest},
            )

    ipu_model = poptorch.inferenceModel(Model())
    sample = torch.tensor([5.0])

    # The error is only expected once the model is actually run.
    expected_msg = (r"-1\.79769e\+308 is too low for a Popart float "
                    r"attribute\.")
    with pytest.raises(poptorch.Error, match=expected_msg):
        ipu_model(sample)


def test_float_attribute_too_high():
    """The largest Python float is rejected as a float attribute."""
    highest = sys.float_info.max

    class Model(torch.nn.Module):
        def forward(self, x):
            return poptorch.custom_op(
                [x],
                "AddScalarFloat",
                "test.poptorch",
                1,
                example_outputs=[x],
                attributes={"scalar": highest},
            )

    ipu_model = poptorch.inferenceModel(Model())
    sample = torch.tensor([5.0])

    # The error is only expected once the model is actually run.
    expected_msg = (r"1\.79769e\+308 is too high for a Popart float "
                    r"attribute\.")
    with pytest.raises(poptorch.Error, match=expected_msg):
        ipu_model(sample)


def test_int_attribute():
    """An integer scalar attribute reaches the custom op.

    With ``scalar=3`` the test expects an input of 5 to come back as 8, as an
    int32 tensor.
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            return poptorch.custom_op(
                [x],
                "AddScalarInt",
                "test.poptorch",
                1,
                example_outputs=[x],
                attributes={"scalar": 3},
            )

    ipu_model = poptorch.inferenceModel(Model())
    result = ipu_model(torch.tensor([5]))

    want = torch.tensor([8], dtype=torch.int32)
    helpers.assert_allequal(actual=result[0], expected=want)


def test_float_list_attribute():
    """A list of floats is accepted as an attribute.

    With ``vec=[1.0, 2.0, 3.0]`` the test expects [3, 4, 5] to come back as
    [4, 6, 8].
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            return poptorch.custom_op(
                [x],
                "AddScalarVecFloat",
                "test.poptorch",
                1,
                example_outputs=[x],
                attributes={"vec": [1.0, 2.0, 3.0]},
            )

    ipu_model = poptorch.inferenceModel(Model())
    result = ipu_model(torch.tensor([3.0, 4.0, 5.0]))

    want = torch.tensor([4.0, 6.0, 8.0])
    helpers.assert_allclose(actual=result[0], expected=want)


def test_float_list_attribute_too_low():
    """A float list holding the most negative Python float is rejected."""
    lowest = -sys.float_info.max

    class Model(torch.nn.Module):
        def forward(self, x):
            return poptorch.custom_op(
                [x],
                "AddScalarVecFloat",
                "test.poptorch",
                1,
                example_outputs=[x],
                attributes={"vec": [1.0, 2.0, lowest]},
            )

    ipu_model = poptorch.inferenceModel(Model())
    sample = torch.tensor([3.0, 4.0, 5.0])

    # The out-of-range value is the last element of the list.
    expected_msg = (r"-1\.79769e\+308 is too low for a Popart float "
                    r"attribute\.")
    with pytest.raises(poptorch.Error, match=expected_msg):
        ipu_model(sample)


def test_float_list_attribute_too_high():
    """A float list holding the largest Python float is rejected."""
    highest = sys.float_info.max

    class Model(torch.nn.Module):
        def forward(self, x):
            return poptorch.custom_op(
                [x],
                "AddScalarVecFloat",
                "test.poptorch",
                1,
                example_outputs=[x],
                attributes={"vec": [highest, 2.0, 3.0]},
            )

    ipu_model = poptorch.inferenceModel(Model())
    sample = torch.tensor([3.0, 4.0, 5.0])

    # The out-of-range value is the first element of the list.
    expected_msg = (r"1\.79769e\+308 is too high for a Popart float "
                    r"attribute\.")
    with pytest.raises(poptorch.Error, match=expected_msg):
        ipu_model(sample)


def test_float_tuple_attribute():
    """A tuple of floats is accepted wherever a list of floats is.

    With ``vec=(1.0, 2.0, 3.0)`` the test expects [3, 4, 5] to come back as
    [4, 6, 8].
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            x = poptorch.custom_op([x],
                                   "AddScalarVecFloat",
                                   "test.poptorch",
                                   1,
                                   example_outputs=[x],
                                   attributes={"vec": (1.0, 2.0, 3.0)})
            return x

    model = Model()

    x = torch.tensor([3.0, 4.0, 5.0])

    inference_model = poptorch.inferenceModel(model)
    out = inference_model(x)

    # The model output is the value under test, so it is passed as `actual`
    # and the reference tensor as `expected`, like in the sibling tests.
    helpers.assert_allclose(actual=out[0],
                            expected=torch.tensor([4.0, 6.0, 8.0]))


def test_int_list_attribute():
    """A list of integers is accepted as an attribute.

    With ``vec=[1, 2, 3]`` the test expects [3, 4, 5] to come back as
    [4, 6, 8], as an int32 tensor.
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            return poptorch.custom_op(
                [x],
                "AddScalarVecInt",
                "test.poptorch",
                1,
                example_outputs=[x],
                attributes={"vec": [1, 2, 3]},
            )

    ipu_model = poptorch.inferenceModel(Model())
    result = ipu_model(torch.tensor([3, 4, 5]))

    want = torch.tensor([4, 6, 8], dtype=torch.int32)
    helpers.assert_allequal(actual=result[0], expected=want)


def test_float_combined_attributes():
    """A float list and a float scalar can be passed to the same op.

    With ``vec=[1.0, 2.0, 3.0]`` and ``scalar=2.0`` the test expects
    [3, 4, 5] to come back as [8, 12, 16].
    """
    op_attributes = {"vec": [1.0, 2.0, 3.0], "scalar": 2.0}

    class Model(torch.nn.Module):
        def forward(self, x):
            return poptorch.custom_op(
                [x],
                "AddVecScalarMulFloat",
                "test.poptorch",
                1,
                example_outputs=[x],
                attributes=op_attributes,
            )

    ipu_model = poptorch.inferenceModel(Model())
    result = ipu_model(torch.tensor([3.0, 4.0, 5.0]))

    want = torch.tensor([8.0, 12.0, 16.0])
    helpers.assert_allequal(actual=result[0], expected=want)


def test_int_two_attributes():
    """Two instances of the same op keep their own attribute values.

    The ops are chained with ``scalar=3`` then ``scalar=2``; the test expects
    an input of 5 to come back as 10, as an int32 tensor.
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            first = poptorch.custom_op(
                [x],
                "AddScalarInt",
                "test.poptorch",
                1,
                example_outputs=[x],
                attributes={"scalar": 3},
            )
            # The value returned by the first op is used directly as both the
            # inputs and the example outputs of the second one.
            return poptorch.custom_op(
                first,
                "AddScalarInt",
                "test.poptorch",
                1,
                example_outputs=first,
                attributes={"scalar": 2},
            )

    ipu_model = poptorch.inferenceModel(Model())
    result = ipu_model(torch.tensor([5]))

    want = torch.tensor([10], dtype=torch.int32)
    helpers.assert_allequal(actual=result[0], expected=want)


@pytest.mark.parametrize("attr", ("sum", "mean"))
def test_string_attribute(attr):
    """A string attribute selects the reduction applied by ``ReduceOp``.

    For the input [5, 6, 7] the test expects 6 for "mean" and 18 otherwise.
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            return poptorch.custom_op(
                [x],
                "ReduceOp",
                "test.poptorch",
                1,
                example_outputs=[x],
                attributes={"reduction": attr},
            )

    ipu_model = poptorch.inferenceModel(Model())
    result = ipu_model(torch.tensor([5.0, 6.0, 7.0]))

    want = torch.tensor(6.0 if attr == "mean" else 18.0)
    helpers.assert_allclose(actual=result[0], expected=want)


def test_non_ascii_string_attribute():
    """A string attribute with a non-ASCII character raises ``ValueError``."""

    class Model(torch.nn.Module):
        def forward(self, x):
            return poptorch.custom_op(
                [x],
                "ReduceOp",
                "test.poptorch",
                1,
                example_outputs=[x],
                attributes={"reduction": "a\u1f00b"},
            )

    ipu_model = poptorch.inferenceModel(Model())
    sample = torch.tensor([5.0, 6.0, 7.0])

    # The error is only expected once the model is actually run.
    expected_msg = "a\u1f00b contains non-ASCII characters."
    with pytest.raises(ValueError, match=expected_msg):
        ipu_model(sample)


def test_string_list_attribute():
    """A list of strings selects one reduction per input of ``ThreeReduceOp``.

    The reductions are "mean", "sum", "mean"; the test expects 2, 9 and 4 for
    the inputs [1, 2, 3], [2, 3, 4] and [3, 4, 5].
    """

    class Model(torch.nn.Module):
        def forward(self, x, y, z):
            return poptorch.custom_op(
                [x, y, z],
                "ThreeReduceOp",
                "test.poptorch",
                1,
                example_outputs=[x, y, z],
                attributes={"reductions": ["mean", "sum", "mean"]},
            )

    ipu_model = poptorch.inferenceModel(Model())
    results = ipu_model(
        torch.tensor([1.0, 2.0, 3.0]),
        torch.tensor([2.0, 3.0, 4.0]),
        torch.tensor([3.0, 4.0, 5.0]),
    )

    for position, want in enumerate((2.0, 9.0, 4.0)):
        helpers.assert_allequal(actual=results[position],
                                expected=torch.tensor(want))


def test_non_asciistring_list_attribute():
    """A string list with a non-ASCII entry raises ``ValueError``."""

    class Model(torch.nn.Module):
        def forward(self, x, y, z):
            return poptorch.custom_op(
                [x, y, z],
                "ThreeReduceOp",
                "test.poptorch",
                1,
                example_outputs=[x, y, z],
                attributes={"reductions": ["a\u1f00b", "sum", "mean"]},
            )

    ipu_model = poptorch.inferenceModel(Model())
    samples = (
        torch.tensor([1.0, 2.0, 3.0]),
        torch.tensor([2.0, 3.0, 4.0]),
        torch.tensor([3.0, 4.0, 5.0]),
    )

    # The offending string is the first entry of the list.
    expected_msg = "a\u1f00b contains non-ASCII characters."
    with pytest.raises(ValueError, match=expected_msg):
        ipu_model(*samples)


# Attribute names and values covering two entries for each of the scalar and
# list kinds of float and int, plus one string and one list of strings. These
# are the values the ManyAttributeOp creator in
# tests/custom_ops/custom_many_attribute_op.cpp compares against; the op only
# adds one to its input when all of them match.
ALL_ATTRIBUTES = {
    "float_one": 1.0,
    "float_minus_two": -2.0,
    "int_zero": 0,
    "int_minus_five": -5,
    "floats_one_two_three": [1.0, 2.0, 3.0],
    "floats_minus_one_two_three": [-1.0, -2.0, -3.0],
    "ints_one_two_three": [1, 2, 3],
    "ints_minus_one_two_three": [-1, -2, -3],
    "a_string": "string with quotes and slash \" ' \\ end",
    "strs": ["\x01", "\x02", "\x03"]
}


@pytest.mark.parametrize("seed", range(10))
def test_many_attributes(seed):
    """All attributes arrive intact whatever order they are supplied in.

    The attribute order is shuffled deterministically from ``seed``. The test
    expects an input of 0.0 to come back as 1.0.
    """
    shuffled_keys = list(ALL_ATTRIBUTES)
    random.seed(seed)
    random.shuffle(shuffled_keys)
    shuffled_attrs = collections.OrderedDict(
        (key, ALL_ATTRIBUTES[key]) for key in shuffled_keys)

    class Model(torch.nn.Module):
        def forward(self, x):
            return poptorch.custom_op(
                [x],
                "ManyAttributeOp",
                "test.poptorch",
                1,
                example_outputs=[x],
                attributes=shuffled_attrs,
            )

    ipu_model = poptorch.inferenceModel(Model())
    result = ipu_model(torch.tensor([0.0]))

    want = torch.tensor(1.0).reshape((1, ))
    helpers.assert_allequal(actual=result[0], expected=want)


@pytest.mark.parametrize("seed", range(3))
def test_many_attributes_one_wrong(seed):
    """A single wrong attribute value is detected by the op.

    The attribute order is shuffled deterministically from ``seed`` and
    ``a_string`` is then overwritten. The test expects an input of 0.0 to come
    back unchanged.
    """
    shuffled_keys = list(ALL_ATTRIBUTES)
    random.seed(seed)
    random.shuffle(shuffled_keys)
    shuffled_attrs = collections.OrderedDict(
        (key, ALL_ATTRIBUTES[key]) for key in shuffled_keys)
    # Overwriting an existing key keeps its shuffled position.
    shuffled_attrs["a_string"] = "Very wrong"

    class Model(torch.nn.Module):
        def forward(self, x):
            return poptorch.custom_op(
                [x],
                "ManyAttributeOp",
                "test.poptorch",
                1,
                example_outputs=[x],
                attributes=shuffled_attrs,
            )

    ipu_model = poptorch.inferenceModel(Model())
    result = ipu_model(torch.tensor([0.0]))

    want = torch.tensor(0.0).reshape((1, ))
    helpers.assert_allequal(actual=result[0], expected=want)


#many_attribtes_examples_start
def test_many_attributes_examples():
    class Model(torch.nn.Module):
        def forward(self, x):
            attributes = {
                "float_one": 1.0,
                "float_minus_two": -2.0,
                "int_zero": 0,
                "int_minus_five": -5,
                "floats_one_two_three": [1.0, 2.0, 3.0],
                "floats_minus_one_two_three": [-1.0, -2.0, -3.0],
                "ints_one_two_three": [1, 2, 3],
                "ints_minus_one_two_three": [-1, -2, -3],
                "a_string": "string with quotes and slash \" ' \\ end",
                "strs": ["abc", "def", "ghi"]
            }

            x = poptorch.custom_op([x],
                                   "ManyAttributeOp",
                                   "test.poptorch",
                                   1,
                                   example_outputs=[x],
                                   attributes=attributes)
            #many_attribtes_examples_end
            return x

    model = Model()

    x = torch.tensor([0.0])

    # Smoke test only: the model must compile and run, the output is not
    # checked. The code between the "many_attribtes_examples" markers is kept
    # verbatim because the markers delimit a snippet.
    inference_model = poptorch.inferenceModel(model)
    inference_model(x)


================================================
FILE: tests/custom_ops_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import ctypes
import pathlib

import torch
import torch.nn as nn
import helpers
import poptorch

# The custom ops are registered by loading their shared libraries at import
# time. The search path is relative, so it is resolved against the current
# working directory.
#loading_library_start
myso = list(pathlib.Path("tests").rglob("libcustom_cube_op.*"))
assert myso, "Failed to find libcustom_cube_op"
myop = ctypes.cdll.LoadLibrary(myso[0])

#loading_library_end

# Same again for the library providing the "LeakyRelu" op; `myso` and `myop`
# are simply rebound.
myso = list(pathlib.Path("tests").rglob("libcustom_leaky_relu_op.*"))
assert myso, "Failed to find libcustom_leaky_relu_op"
myop = ctypes.cdll.LoadLibrary(myso[0])


#inference_start
def test_inference():
    class BasicNetwork(nn.Module):
        def forward(self, x, bias):
            x, y = poptorch.custom_op([x, bias],
                                      "Cube",
                                      "com.acme",
                                      1,
                                      example_outputs=[x, x])
            return x, y

    #inference_end

    model = BasicNetwork()

    # Constant inputs: every element of x is 2 and every element of bias is 4.
    x = torch.full((1, 8), 2.0)
    bias = torch.full((1, 8), 4.0)

    inference_model = poptorch.inferenceModel(model)
    out = inference_model(x, bias)

    # The expected outputs are consistent with x**3 + bias (= 12) for the
    # first output and x**3 (= 8) for the second.
    expected = (torch.full((1, 8), 12.0), torch.full((1, 8), 8.0))
    helpers.assert_allclose(actual=out[0], expected=expected[0])
    helpers.assert_allclose(actual=out[1], expected=expected[1])


def test_training():
    """Train a network whose first layer is the ``Cube`` custom op.

    After 100 steps on random inputs with the constant label 42, the first
    model output is expected to have its maximum at index 42.
    """

    def custom_loss(model_out, labels):
        l1 = torch.nn.functional.nll_loss(model_out[0], labels)
        # Popart errors if this is unused.
        l2 = torch.sum(model_out[1]) * 0.0001

        return l1 + l2

    class TrainingNetwork(nn.Module):
        def __init__(self):
            super().__init__()
            self.ln = torch.nn.Linear(100, 100)
            self.softmax = nn.Softmax(1)

        def forward(self, t, target):
            x = t[0]
            bias = t[1]
            x, y = poptorch.custom_op([x, bias],
                                      "Cube",
                                      "com.acme",
                                      1,
                                      example_outputs=[x, x])
            x = self.ln(x)
            x = self.softmax(x)

            out = (x, y)
            loss = custom_loss(out, target)
            return out, loss

    model = TrainingNetwork()

    bias = torch.full((1, 100), 2.0)

    y = torch.full([1], 42, dtype=torch.long)

    poptorch_model = poptorch.trainingModel(model)

    # A fresh random input is drawn for every step; only the output of the
    # last step is checked.
    for _ in range(100):
        x = torch.rand((1, 100))
        out, _ = poptorch_model((x, bias), y)

    assert torch.argmax(out[0]) == 42


# Check that the custom op not only trains but also propagates the gradient backwards.
def test_training_both_sides():
    """Train a network with linear layers on both sides of the custom op.

    The weights of the layer in front of the op must change during training,
    and the first model output is expected to have its maximum at index 42
    after 100 steps.
    """

    def custom_loss(model_out, labels):
        l1 = torch.nn.functional.nll_loss(model_out[0], labels)
        # Popart errors if this is unused.
        l2 = torch.sum(model_out[1]) * 0.0001
        return l1 + l2

    class TrainingNetwork(nn.Module):
        def __init__(self):
            super().__init__()
            self.ln1 = torch.nn.Linear(100, 100)
            self.ln2 = torch.nn.Linear(100, 100)
            self.softmax = nn.Softmax(1)

        def forward(self, t, target):
            x = self.ln1(t[0])
            bias = t[1]
            x, y = poptorch.custom_op([x, bias],
                                      "Cube",
                                      "com.acme",
                                      1,
                                      example_outputs=[x, x])
            x = self.ln2(x)
            x = self.softmax(x)

            out = (x, y)

            loss = custom_loss(out, target)
            return out, loss

    model = TrainingNetwork()

    bias = torch.full((1, 100), 2.0)

    y = torch.full([1], 42, dtype=torch.long)

    # Snapshot taken before training so that the update of ln1 can be checked.
    weights_before = model.ln1.weight.clone()

    poptorch_model = poptorch.trainingModel(model)

    # A fresh random input is drawn for every step; only the output of the
    # last step is checked.
    for _ in range(100):
        x = torch.rand((1, 100))
        out, _ = poptorch_model((x, bias), y)

    assert not torch.allclose(weights_before, model.ln1.weight)

    assert torch.argmax(out[0]) == 42


def test_inference_with_an_attribute():
    """Run the ``LeakyRelu`` custom op with ``alpha=0.02`` in inference.

    The test expects negative inputs to be scaled by 0.02 and the other
    inputs to be returned unchanged.
    """
    #inference_with_attribute_start
    class Model(torch.nn.Module):
        def forward(self, x):
            x = poptorch.custom_op([x],
                                   "LeakyRelu",
                                   "com.acme",
                                   1,
                                   example_outputs=[x],
                                   attributes={"alpha": 0.02})
            return x[0]

    #inference_with_attribute_end

    model = Model()

    x = torch.tensor([-1.0, -0.5, 0.0, 0.5, 1.0])

    inference_model = poptorch.inferenceModel(model)
    out = inference_model(x)

    # forward() already unpacks the single output, hence no indexing here.
    helpers.assert_allclose(actual=out,
                            expected=torch.tensor(
                                [-0.02, -0.01, 0.0, 0.5, 1.0]))


================================================
FILE: tests/dataloader_test.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import itertools
import functools
import math
import random
import time
import subprocess
import marshal
import re
import os
import sys
import signal
import torch
import pytest
import numpy
import helpers
import poptorch


class BrokenDataset(torch.utils.data.Dataset):
    """Dataset reporting ``length`` items, none of which can be fetched."""

    def __init__(self, length):
        super().__init__()
        self._length = length

    def __getitem__(self, index):
        # Fetching any item fails on purpose.
        assert False, "Broken dataset"

    def __len__(self):
        return self._length


class IncrementDataset(torch.utils.data.Dataset):
    """Map-style dataset whose item ``i`` is a tensor filled with ``i``.

    Every item has the given ``shape`` and ``dtype``. Asking for an index at
    or beyond ``length`` raises ``StopIteration``.
    """

    def __init__(self, shape, length, dtype=torch.float32):
        super().__init__()
        self._shape, self._length, self._dtype = shape, length, dtype

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        if index < self._length:
            return torch.full(self._shape, index, dtype=self._dtype)
        raise StopIteration


class IncrementIterableDataset(torch.utils.data.IterableDataset):
    """Iterable dataset yielding ``length`` tensors of increasing fill value.

    The n-th tensor yielded (counting from zero) has the given ``shape`` and
    ``dtype`` and is filled with ``start + n``.
    """

    def __init__(self, shape, length, start=0, dtype=torch.float32):
        super().__init__()
        self._shape = shape
        self._dtype = dtype
        self.length = length
        self.start = start

    def __iter__(self):
        for offset in range(self.length):
            fill_value = self.start + offset
            yield torch.full(self._shape, fill_value, dtype=self._dtype)

    def __getitem__(self, index):
        raise NotImplementedError("No __getitem__ for iterable datasets")


class IncrementIterableDatasetWithLen(IncrementIterableDataset):
    """Iterable dataset which additionally reports its length via ``len()``."""

    def __getitem__(self, index):
        raise NotImplementedError("No __getitem__ for iterable datasets")

    def __len__(self):
        return self.length


class IncrementDatasetWithLabels(torch.utils.data.Dataset):
    """Map-style dataset returning ``(data, label)`` tuples.

    For index ``i``, ``data`` is a float32 tensor of the given shape filled
    with ``i`` and ``label`` is an int64 tensor of shape ``(1,)`` holding
    ``i``.
    """

    def __init__(self, shape, length):
        super().__init__()
        self._shape, self._length = shape, length

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        data = torch.full(self._shape, index, dtype=torch.float32)
        label = torch.full((1, ), index, dtype=torch.long)
        return (data, label)


class IncrementDatasetWithLabelsDict(torch.utils.data.Dataset):
    """Map-style dataset returning ``{"data": ..., "label": ...}`` dicts.

    For index ``i``, ``"data"`` is a float32 tensor of the given shape
    filled with ``i`` and ``"label"`` is an int64 tensor of shape ``(1,)``
    holding ``i``.
    """

    def __init__(self, shape, length):
        super().__init__()
        self._shape, self._length = shape, length

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        data = torch.full(self._shape, index, dtype=torch.float32)
        label = torch.full((1, ), index, dtype=torch.long)
        return {"data": data, "label": label}


class CheckOrderModel(torch.nn.Module):
    """Model returning the sum of the element-wise difference of its inputs."""

    def forward(self, data, expected):
        # The result is 0 when data and expected hold the same values.
        difference = data - expected
        return torch.sum(difference)


class DoubleData(torch.nn.Module):
    """Model returning its input multiplied by two."""

    def forward(self, data):
        doubled = data * 2
        return doubled


class DoubleDataLabel(torch.nn.Module):
    """Model returning both of its inputs multiplied by two."""

    def forward(self, data, label):
        doubled_data = data * 2
        doubled_label = label * 2
        return doubled_data, doubled_label


def _run_test(shape=None,
              num_tensors=100,
              batch_size=1,
              num_workers=0,
              device_iterations=1,
              replication_factor=1):
    """Check that a poptorch.DataLoader returns an IncrementDataset in order.

    Every batch produced by the loader is compared, through an inference
    CheckOrderModel, with the tensors expected at that position.

    Args:
        shape: shape of each dataset tensor (defaults to ``[2, 3]``).
        num_tensors: number of tensors in the dataset.
        batch_size: batch size given to the loader.
        num_workers: number of loader workers.
        device_iterations: value passed to ``Options.deviceIterations``.
        replication_factor: value passed to ``Options.replicationFactor``.
    """
    shape = shape or [2, 3]

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    data = poptorch.DataLoader(opts,
                               IncrementDataset(shape, num_tensors),
                               batch_size=batch_size,
                               num_workers=num_workers)

    assert len(data) == num_tensors // (device_iterations * batch_size *
                                        replication_factor)
    model = poptorch.inferenceModel(CheckOrderModel(), opts)
    for it, d in enumerate(data):
        first = data.combinedBatchSize * it
        expected = torch.from_numpy(
            numpy.stack([
                numpy.full(shape, i, dtype=numpy.float32)
                for i in range(first, first + data.combinedBatchSize)
            ]))
        diff = torch.sum(model(d, expected))

        # Verify every batch: checking only after the loop would validate
        # nothing but the last batch.
        numpy.testing.assert_array_equal(diff.numpy(), [0.])


def test_simple():
    """Run the ordering check with every default setting."""
    _run_test()


def test_batch():
    """Run the ordering check with a batch size of 4."""
    batch_size = 4
    _run_test(batch_size=batch_size)


def test_workers():
    """Run the ordering check with 8 loader workers."""
    num_workers = 8
    _run_test(num_workers=num_workers)


def test_device_iterations():
    """Run the ordering check with 4 device iterations."""
    device_iterations = 4
    _run_test(device_iterations=device_iterations)


@pytest.mark.ipuHardwareRequired
def test_replica():
    """Run the ordering check with a replication factor of 4."""
    replication_factor = 4
    _run_test(replication_factor=replication_factor)


@pytest.mark.ipuHardwareRequired
def test_combined():
    """Run the ordering check with batching, device iterations, replication
    and workers all enabled at once."""
    settings = {
        "batch_size": 2,
        "device_iterations": 5,
        "replication_factor": 2,
        "num_workers": 4,
    }
    _run_test(**settings)


def _run_process_test(shape=None,
                      num_tensors=100,
                      batch_size=1,
                      num_workers=0,
                      device_iterations=1,
                      replication_factor=1,
                      num_runs=1):
    """Feed an Async-mode DataLoader into a DoubleData inference model.

    For each of the ``num_runs`` passes over the loader, every model output
    is compared with twice the values expected at that position of the
    IncrementDataset.
    """
    if not shape:
        shape = [2, 3]

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    dataset = IncrementDataset(shape, num_tensors)
    loader = poptorch.DataLoader(opts,
                                 dataset,
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 mode=poptorch.DataLoaderMode.Async)

    samples_per_step = device_iterations * batch_size * replication_factor
    assert len(loader) == num_tensors // samples_per_step

    model = poptorch.inferenceModel(DoubleData(), opts)

    for _ in range(num_runs):
        for step, batch in enumerate(loader):
            out = model(batch)

            # Indices of the dataset items making up this step's batch.
            first = loader.combinedBatchSize * step
            last = loader.combinedBatchSize * (step + 1)
            doubled = [
                torch.full(shape, index * 2, dtype=torch.float32)
                for index in range(first, last)
            ]
            expected = torch.stack(doubled)

            helpers.assert_allequal(actual=out, expected=expected)


def test_multithreaded1():
    """Async loader check: batch size 2, one device iteration."""
    _run_process_test(num_workers=0,
                      replication_factor=1,
                      device_iterations=1,
                      batch_size=2,
                      num_tensors=100)


def test_multithreaded2():
    """Async loader check: batch size 2, ten device iterations."""
    _run_process_test(num_workers=0,
                      replication_factor=1,
                      device_iterations=10,
                      batch_size=2,
                      num_tensors=100)


@pytest.mark.ipuHardwareRequired
def test_multithreaded3():
    """Async loader check: batch size 2, replication factor 4."""
    _run_process_test(num_workers=0,
                      replication_factor=4,
                      device_iterations=1,
                      batch_size=2,
                      num_tensors=10)


def _run_process_label_test(shape=None,
                            num_tensors=100,
                            batch_size=1,
                            num_workers=0,
                            device_iterations=1,
                            replication_factor=1):
    """Feed ``(data, label)`` batches from an Async-mode DataLoader into a
    DoubleDataLabel inference model.

    The doubled data and labels are accumulated over the whole epoch and
    compared with ``2 * (0 + 1 + ... + (num_tensors - 1))``.
    """
    if not shape:
        shape = [2, 3]

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    dataset = IncrementDatasetWithLabels(shape, num_tensors)
    loader = poptorch.DataLoader(opts,
                                 dataset,
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 mode=poptorch.DataLoaderMode.Async)

    samples_per_step = device_iterations * batch_size * replication_factor
    assert len(loader) == num_tensors // samples_per_step

    model = poptorch.inferenceModel(DoubleDataLabel(), opts)

    total = torch.zeros(shape)
    label_out = torch.zeros(1, dtype=torch.int)
    for data, label in loader:
        doubled_data, doubled_label = model(data, label)
        # Collapse the batch dimension before accumulating.
        total += torch.sum(doubled_data, dim=0)
        label_out += torch.sum(doubled_label, dim=0)

    expected = sum(index * 2 for index in range(num_tensors))

    numpy.testing.assert_array_equal(total[0][0].numpy(), [expected])
    numpy.testing.assert_array_equal(label_out[0].item(), [expected])


def test_multithreaded4():
    """Async loader check with labelled data: batch size 2, ten device
    iterations."""
    _run_process_label_test(num_workers=0,
                            replication_factor=1,
                            device_iterations=10,
                            batch_size=2,
                            num_tensors=60)


def _run_subdataset_test(num_tensors=100,
                         batch_size=1,
                         num_workers=0,
                         device_iterations=1,
                         replication_factor=1,
                         num_hosts=1):
    """Iterate the same dataset once per simulated host, without shuffling.

    One Async-mode loader is created for each process id in
    ``range(num_hosts)``. Read host after host, the values must form one
    continuous increasing sequence starting at 0.
    """
    dataset = IncrementDataset([2, 3], num_tensors)

    next_expected = 0
    combined_batch_size = 0
    for host_id in range(num_hosts):
        opts = poptorch.Options()
        opts.deviceIterations(device_iterations)
        opts.replicationFactor(replication_factor)
        opts.Distributed.configureProcessId(host_id, num_hosts)

        loader = poptorch.DataLoader(opts,
                                     dataset,
                                     batch_size=batch_size,
                                     num_workers=num_workers,
                                     mode=poptorch.DataLoaderMode.Async)

        combined_batch_size = loader.combinedBatchSize
        assert combined_batch_size == (device_iterations * batch_size *
                                       replication_factor)
        assert len(loader) == num_tensors // (combined_batch_size * num_hosts)
        for batch in loader:
            for sample in batch:
                # Every element of a sample holds its dataset index.
                assert int(sample[0][0].item()) == next_expected
                next_expected += 1

    # Number of processes shouldn't change how many tensors are returned
    batches_per_host = num_tensors // (combined_batch_size * num_hosts)
    assert next_expected == (num_hosts * combined_batch_size *
                             batches_per_host)


def _run_shuffle_subdataset_test(num_tensors=100,
                                 batch_size=1,
                                 num_workers=0,
                                 device_iterations=1,
                                 replication_factor=1,
                                 num_hosts=1):
    """Iterate a shuffled dataset twice per simulated host.

    For every process id, the first epoch must not return any tensor twice
    and must return the expected number of tensors. After two epochs on
    every host, each tensor of the dataset must have been returned at least
    once.
    """
    dataset = IncrementDataset([2, 3], num_tensors)

    seen_overall = [False] * num_tensors
    for host_id in range(num_hosts):
        opts = poptorch.Options()
        opts.deviceIterations(device_iterations)
        opts.replicationFactor(replication_factor)
        opts.randomSeed(42)
        opts.Distributed.configureProcessId(host_id, num_hosts)

        loader = poptorch.DataLoader(opts,
                                     dataset,
                                     batch_size=batch_size,
                                     shuffle=True,
                                     num_workers=num_workers,
                                     mode=poptorch.DataLoaderMode.Async)

        combined_batch_size = loader.combinedBatchSize
        assert combined_batch_size == (device_iterations * batch_size *
                                       replication_factor)
        batches_per_host = num_tensors // (combined_batch_size * num_hosts)
        assert len(loader) == batches_per_host

        # First epoch: no duplicates allowed within this host.
        seen_by_host = [False] * num_tensors
        for batch in loader:
            for sample in batch:
                index = int(sample[0][0].item())
                assert not seen_by_host[index]
                seen_by_host[index] = True
                seen_overall[index] = True
        assert sum(seen_by_host) == combined_batch_size * batches_per_host

        # Iterate a second time to make sure the left over tensors get used too.
        for batch in loader:
            for sample in batch:
                seen_overall[int(sample[0][0].item())] = True

    # If we iterate twice in all the processes then all the tensors should be used.
    assert sum(seen_overall) == num_tensors


def test_subdataset():
    """Sub-dataset check: three hosts, batch size 4."""
    _run_subdataset_test(num_hosts=3, batch_size=4)


def test_subdataset2():
    """Sub-dataset check: two hosts, batch size 2, three workers."""
    _run_subdataset_test(num_workers=3, num_hosts=2, batch_size=2)


def test_shuffle_subdataset():
    """Shuffled sub-dataset check: three hosts, batch size 4."""
    _run_shuffle_subdataset_test(num_hosts=3, batch_size=4)


def test_shuffle_subdataset2():
    """Shuffled sub-dataset check: two hosts, batch size 2, three workers."""
    _run_shuffle_subdataset_test(num_workers=3, num_hosts=2, batch_size=2)


@pytest.mark.parametrize("num_processes", [2, 3, 4, 5])
@pytest.mark.parametrize("num_workers", [0, 1, 3])
def test_global_shuffle_each_epoch(num_processes, num_workers):
    """Check that a shuffling loader sees different data in every epoch.

    For each simulated process, five epochs are read and the sorted values
    of each epoch recorded; no two epochs of the same process may contain
    the same set of values.
    """
    num_epochs = 5
    each_process_data = []
    for process_id in range(num_processes):
        opts = poptorch.Options()
        opts.randomSeed(0)
        opts.Distributed.configureProcessId(process_id, num_processes)
        dataloader = poptorch.DataLoader(
            opts,
            IncrementDataset((), 100),
            batch_size=16,
            shuffle=True,
            num_workers=num_workers,
        )
        epochs = []
        for _ in range(num_epochs):
            epoch_values = []
            for batch in dataloader:
                epoch_values.extend(batch.tolist())
            # Sorted so that only the content, not the order, is compared.
            epochs.append(sorted(epoch_values))
        each_process_data.append(epochs)

    # Make sure data between epochs differs within the same process
    # for all processes.
    for epochs in each_process_data:
        for first, second in itertools.combinations(epochs, 2):
            assert first != second


def test_interrupt_async_loader():
    """Make sure the worker processes are stopped cleanly even when the end of
    the dataset is not reached."""
    num_tensors = 100
    dataset = IncrementDataset([2, 3], num_tensors)

    loader = poptorch.DataLoader(poptorch.Options(),
                                 dataset,
                                 batch_size=1,
                                 num_workers=1,
                                 mode=poptorch.DataLoaderMode.Async)

    assert len(loader) == num_tensors

    # Fetch a single batch and abandon the iteration.
    for _ in loader:
        break


def test_single_epoch():
    """Read one full epoch from an Async-mode loader created with 32
    workers."""
    num_tensors = 100
    dataset = IncrementDataset([2, 3], num_tensors)

    loader = poptorch.DataLoader(poptorch.Options(),
                                 dataset,
                                 batch_size=1,
                                 num_workers=32,
                                 mode=poptorch.DataLoaderMode.Async)

    assert len(loader) == num_tensors

    # Drain the loader; the content is not checked here.
    for _ in loader:
        pass


def test_iterable_dataset():
    """Wrap an iterable dataset directly in an AsynchronousDataAccessor and
    read it for two epochs."""
    dataset = IncrementIterableDataset([2, 3], 100)
    loader = poptorch.AsynchronousDataAccessor(dataset)

    # Make sure it works for more than 1 epoch
    for _epoch in range(2):
        for _ in loader:
            pass


def test_iterable_dataloader():
    """Read an iterable dataset through an Async-mode DataLoader for two
    epochs, checking the batch shape during the first one."""
    dataset = IncrementIterableDataset([2, 3], 100)

    loader = poptorch.DataLoader(poptorch.Options(),
                                 dataset,
                                 batch_size=1,
                                 num_workers=1,
                                 mode=poptorch.DataLoaderMode.Async)

    expected_shape = torch.Size([1, 2, 3])
    for batch in loader:
        assert batch.shape == expected_shape

    # Make sure it works for more than 1 epoch
    for _ in loader:
        pass


# A list (not a set) keeps the parametrization order explicit.
@pytest.mark.parametrize("persistent_workers", [True, False])
def test_iterable_dataloader_reset(persistent_workers):
    """Check that an interrupted epoch does not leak into the next one.

    The first iteration over the Async-mode loader is abandoned after five
    batches; the second iteration must start again from the first tensor and
    return all of them.
    """
    shape = [2, 3]
    num_tensors = 10

    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 IncrementDataset(shape, num_tensors),
                                 persistent_workers=persistent_workers,
                                 batch_size=1,
                                 num_workers=1,
                                 mode=poptorch.DataLoaderMode.Async)

    # Interrupt the first iteration
    for i, t in enumerate(loader):
        assert t.shape == torch.Size([1, 2, 3])
        assert t[0][0][0] == i
        if i == 4:
            print(f"Last tensor first iteration {t}")
            break

    print("Second iterator")
    # Make sure the second iteration returns all the tensors
    for i, t in enumerate(loader):
        assert t[0][0][0] == i
    assert i == (num_tensors - 1)


def test_early_preload():
    """Compare two Async-mode loaders built with and without
    ``early_preload``.

    After a short wait and without iterating either loader, the sum of the
    worker's ``indices_mem`` must equal the buffer size for the preloading
    loader and 1 for the other.
    """
    num_buffers = 5
    opts = poptorch.Options()
    data = IncrementDataset([2, 3], 10)

    def make_loader(early_preload):
        return poptorch.DataLoader(options=opts,
                                   dataset=data,
                                   batch_size=1,
                                   num_workers=1,
                                   mode=poptorch.DataLoaderMode.Async,
                                   async_options={
                                       'early_preload': early_preload,
                                       'buffer_size': num_buffers
                                   })

    preload = make_loader(True)
    no_preload = make_loader(False)

    time.sleep(2)  # Give time for the worker to fill the buffer

    assert sum(no_preload._accessor._worker._data_buffers.indices_mem) == 1  # pylint: disable=protected-access, no-member
    assert sum(
        preload._accessor._worker._data_buffers.indices_mem) == num_buffers  # pylint: disable=protected-access, no-member


def test_batch_size_None():
    """Read an iterable dataset with ``batch_size=None`` for two epochs.

    With no batch size, the tensors must keep the dataset's own shape.
    """
    shape = [2, 3]
    dataset = IncrementIterableDataset(shape, 10)

    loader = poptorch.DataLoader(poptorch.Options(),
                                 dataset,
                                 batch_size=None,
                                 drop_last=False,
                                 num_workers=1,
                                 mode=poptorch.DataLoaderMode.Async)

    expected_shape = torch.Size([2, 3])
    for tensor in loader:
        assert tensor.shape == expected_shape

    # Make sure it works for more than 1 epoch
    for _ in loader:
        pass


def test_iterable_dataset_len():
    """Check ``len()`` on loaders wrapping iterable datasets.

    A loader over a dataset without ``__len__`` must raise a TypeError
    naming the dataset class; a loader over a dataset defining ``__len__``
    must not raise.
    """
    shape = [2, 3]
    num_tensors = 10

    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 IncrementIterableDataset(shape, num_tensors),
                                 batch_size=None,
                                 drop_last=False,
                                 num_workers=1,
                                 mode=poptorch.DataLoaderMode.Async)

    # ``match`` is a regular expression: escape the message so that the
    # literal "()" is checked instead of being read as an empty group.
    with pytest.raises(
            TypeError,
            match=re.escape("'IncrementIterableDataset' has no len()")):
        len(loader)
    loader = poptorch.DataLoader(opts,
                                 IncrementIterableDatasetWithLen(
                                     shape, num_tensors),
                                 batch_size=None,
                                 drop_last=False,
                                 num_workers=1,
                                 mode=poptorch.DataLoaderMode.Async)

    # Only checks that no exception is raised.
    len(loader)


def test_broken_dataset():
    """Check that wrapping a loader over a BrokenDataset in an
    AsynchronousDataAccessor raises a poptorch.Error."""
    dataset = BrokenDataset(100)

    loader = poptorch.DataLoader(poptorch.Options(),
                                 dataset,
                                 batch_size=1,
                                 num_workers=32)

    with pytest.raises(poptorch.Error, match="worker thread failed to start"):
        poptorch.AsynchronousDataAccessor(loader)


def test_subprocess_async_loader():
    """Run ``test_single_epoch`` from this file in a child pytest process and
    print its output.

    ``subprocess.check_output`` raises if the child exits with a non-zero
    status.
    """
    # Use the interpreter running this test rather than whichever "python3"
    # is first on PATH, so the child sees the same installed packages.
    output = subprocess.check_output(
        [sys.executable, "-m", "pytest", __file__, "-k", "test_single_epoch"],
        stderr=subprocess.STDOUT)
    print(output.decode('utf-8'), flush=True)


def test_subprocess_broken_dataset():
    """Run ``test_broken_dataset`` in a child pytest process and check that
    the dataset's own failure message appears in the captured output."""
    # Use the interpreter running this test rather than whichever "python3"
    # is first on PATH, so the child sees the same installed packages.
    command = [
        sys.executable, "-m", "pytest", __file__, "-k", "test_broken_dataset",
        "-s"
    ]
    stdout = subprocess.check_output(command,
                                     stderr=subprocess.STDOUT).decode('utf-8')
    print(stdout)
    assert "AssertionError: Broken dataset" in stdout, (
        "Couldn't find failure "
        "reason in stdout")


@pytest.mark.parametrize("DatasetType",
                         [IncrementDataset, IncrementIterableDataset])
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
def test_reuse_workers(DatasetType, dtype):
    """Iterate four epochs with and without ``persistent_workers=False``.

    Epoch timings are printed for information only; the assertion is that
    both loaders returned the same total number of batches.
    """
    shape = [2, 3]
    dataset_len = 10

    opts = poptorch.Options()

    def make_loader(**extra):
        return poptorch.DataLoader(opts,
                                   DatasetType(shape, dataset_len,
                                               dtype=dtype),
                                   batch_size=1,
                                   num_workers=2,
                                   mode=poptorch.DataLoaderMode.Async,
                                   **extra)

    loader = make_loader()
    loader_no_reuse = make_loader(persistent_workers=False)

    # Workers are created when the AsynchronousDataAccessor is instantiated
    # So the first iteration should be fast
    num_tensors = 0
    start = time.perf_counter()
    num_tensors += sum(1 for _ in loader_no_reuse)
    end = time.perf_counter()
    print(f"First epoch no reuse: {end - start} {num_tensors}")

    # subsequent iterations will join and create new workers
    # when a new iterator is created.
    for _ in range(3):
        start = time.perf_counter()
        num_tensors += sum(1 for _ in loader_no_reuse)
        end = time.perf_counter()
        print(f"Other epoch no reuse: {end - start}  {num_tensors}")

    start = time.perf_counter()
    num_tensors_reuse = sum(1 for _ in loader)
    end = time.perf_counter()
    print(f"First epoch: {end - start} {num_tensors_reuse}")

    for _ in range(3):
        start = time.perf_counter()
        num_tensors_reuse += sum(1 for _ in loader)
        end = time.perf_counter()
        print(f"Other epoch: {end - start} {num_tensors_reuse}")

    assert num_tensors_reuse == num_tensors


# Select a subset of the dataset for each worker
def _worker_init_fn(worker_id):
    worker_info = torch.utils.data.get_worker_info()
    dataset = worker_info.dataset
    total_len = dataset.length
    per_worker = math.ceil(dataset.length / worker_info.num_workers)
    dataset.start = per_worker * worker_id
    if worker_id == worker_info.num_workers - 1:
        dataset.length = total_len - (per_worker *
                                      (worker_info.num_workers - 1))
    else:
        dataset.length = per_worker


# Modes are listed (not put in a set) so the parametrization order is
# explicit and identical on every run.
@pytest.mark.parametrize("mode", [
    poptorch.DataLoaderMode.Async, poptorch.DataLoaderMode.AsyncRebatched,
    poptorch.DataLoaderMode.Sync
])
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
def test_iterable_dataloader_drop_last(mode, dtype):
    """Check ``drop_last=True`` on an iterable dataset split across workers.

    Every batch must be full, no value may be returned twice within an
    epoch, and the number of distinct values per epoch must match the
    expectation for ``mode``. Two epochs are checked.
    """
    shape = [2, 3]
    num_tensors = 101
    num_workers = 7
    batch_size = 4
    if mode != poptorch.DataLoaderMode.AsyncRebatched:
        # Expected tensors
        # tensors per worker = ceil(101/7) = 15
        # last worker = 101 - 6 * 15 = 11 tensors
        # batch size = 4
        # Total = 6 * floor(15 / 4) + floor(11 / 4)
        #       = 6 * 3 + 2 = 20 batches
        # Unused tensors = 101 - 20 * 4 = 21
        num_expected = 20 * batch_size
    else:
        # Best case expected: floor(101/4) = 25 -> unused = 1
        num_expected = math.floor(num_tensors / batch_size) * batch_size

    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 IncrementIterableDataset(shape,
                                                          num_tensors,
                                                          dtype=dtype),
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 mode=mode,
                                 drop_last=True,
                                 worker_init_fn=_worker_init_fn)

    def collect_epoch():
        # Read one epoch, checking the batch shape and value uniqueness.
        values = set()
        for t in loader:
            assert t.shape == torch.Size([4, 2, 3])
            for b in t:
                v = int(b[0][0])
                assert v not in values
                values.add(v)
        return values

    values = collect_epoch()
    assert len(values) == num_expected
    print("Missing tensors:")
    for i in range(num_tensors):
        if i not in values:
            print(i)

    # Make sure it works for more than 1 epoch
    assert len(collect_epoch()) == num_expected


# Modes are listed (not put in a set) so the parametrization order is
# explicit and identical on every run.
@pytest.mark.parametrize("mode", [
    poptorch.DataLoaderMode.Async, poptorch.DataLoaderMode.AsyncRebatched,
    poptorch.DataLoaderMode.Sync
])
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
def test_indexable_dataloader_drop_last(mode, dtype):
    """Check ``drop_last=True`` on a map-style dataset.

    Every batch must be full, no value may be returned twice within an
    epoch, and exactly 100 of the 101 tensors must be returned per epoch.
    Two epochs are checked.
    """
    shape = [2, 3]
    num_tensors = 101
    num_workers = 7
    batch_size = 4
    # Expected tensors
    # Best case expected: floor(101/4) = 25 -> unused = 1
    num_expected = 100

    opts = poptorch.Options()
    # drop_last is passed explicitly, as in the iterable variant of this
    # test, so the expectation does not depend on the loader's default.
    loader = poptorch.DataLoader(opts,
                                 IncrementDataset(shape,
                                                  num_tensors,
                                                  dtype=dtype),
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 mode=mode,
                                 drop_last=True)

    def collect_epoch():
        # Read one epoch, checking the batch shape and value uniqueness.
        values = set()
        for t in loader:
            assert t.shape == torch.Size([4, 2, 3])
            for b in t:
                v = int(b[0][0])
                assert v not in values
                values.add(v)
        return values

    values = collect_epoch()
    assert len(values) == num_expected
    print("Missing tensors:")
    for i in range(num_tensors):
        if i not in values:
            print(i)

    # Make sure it works for more than 1 epoch
    assert len(collect_epoch()) == num_expected


# Modes are listed (not put in a set) so the parametrization order is
# explicit and identical on every run.
@pytest.mark.parametrize("mode", [
    poptorch.DataLoaderMode.Async, poptorch.DataLoaderMode.AsyncRebatched,
    poptorch.DataLoaderMode.Sync
])
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
def test_indexable_dataloader_len(mode, dtype):
    """Check that ``len()`` matches the number of items actually iterated,
    for a map-style dataset, its loader and manually created accessors."""
    shape = [2, 3]
    num_tensors = 101
    num_workers = 7
    batch_size = 4
    ds = IncrementDataset(shape, num_tensors, dtype=dtype)
    assert len(ds) == num_tensors
    n = 0
    for n, _ in enumerate(ds):
        pass
    assert n + 1 == num_tensors
    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 ds,
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 mode=mode)
    if mode == poptorch.DataLoaderMode.Sync:
        # Make sure the user can still manually create the
        # data accessor. (This can only be tested in Sync
        # mode as otherwise the loader already contains
        # a data accessor).
        accessor = poptorch.AsynchronousDataAccessor(loader)
        assert len(loader) == num_tensors // batch_size
        for n, _ in enumerate(accessor):
            pass
        assert n + 1 == num_tensors // batch_size
        # An accessor built directly on the dataset yields single items.
        accessor = poptorch.AsynchronousDataAccessor(ds)
        assert len(accessor) == num_tensors
        for n, _ in enumerate(accessor):
            pass
        assert n + 1 == num_tensors

    assert len(loader) == num_tensors // batch_size
    for n, _ in enumerate(loader):
        pass
    assert n + 1 == num_tensors // batch_size


# Modes are listed (not put in a set) so the parametrization order is
# explicit and identical on every run.
@pytest.mark.parametrize("mode", [
    poptorch.DataLoaderMode.Async, poptorch.DataLoaderMode.AsyncRebatched,
    poptorch.DataLoaderMode.Sync
])
def test_dictionary_dataset(mode):
    """Check that a dataset returning dicts yields batched dicts.

    Each batch must be a dict with exactly the keys ``"data"`` and
    ``"label"``, both with ``combinedBatchSize`` as leading dimension, and
    the number of batches must be ``num_tensors // combinedBatchSize``.
    """
    shape = [2, 3]
    num_tensors = 500

    opts = poptorch.Options()
    opts.deviceIterations(2)
    opts.replicationFactor(3)

    loader = poptorch.DataLoader(opts,
                                 IncrementDatasetWithLabelsDict(
                                     shape, num_tensors),
                                 num_workers=3,
                                 mode=mode)
    shape_with_batch = [loader.combinedBatchSize] + shape
    it = 0
    for d in loader:
        assert isinstance(d, dict)
        assert len(d) == 2
        assert "data" in d
        assert "label" in d
        assert d["data"].shape == torch.Size(shape_with_batch)
        assert d["label"].shape == torch.Size([loader.combinedBatchSize, 1])
        it += 1

    assert it == num_tensors // loader.combinedBatchSize


# Modes are listed (not put in a set) so the parametrization order is
# explicit and identical on every run.
@pytest.mark.parametrize("mode", [
    poptorch.DataLoaderMode.Async, poptorch.DataLoaderMode.AsyncRebatched,
    poptorch.DataLoaderMode.Sync
])
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
def test_iterable_dataloader_len(mode, dtype):
    """Compare ``len()`` with the number of batches actually produced for an
    iterable dataset that defines ``__len__`` and is split across workers.

    ``len(loader)`` is expected to be the theoretical length, while the
    number of iterations depends on ``mode``.
    """
    shape = [2, 3]
    num_tensors = 101
    num_workers = 7
    batch_size = 4
    # Note: Upstream torch returns the theoretical length
    # it doesn't take into account the items lost per worker.
    expected_len = math.floor(num_tensors / batch_size)
    if mode != poptorch.DataLoaderMode.AsyncRebatched:
        # Expected tensors
        # tensors per worker = ceil(101/7) = 15
        # last worker = 101 - 6 * 15 = 11 tensors
        # batch size = 4
        # Total = 6 * floor(15 / 4) + floor(11 / 4)
        #       = 6 * 3 + 2 = 20
        # Unused tensors = 101 - num_iterations_expected * 4 = 21
        num_iterations_expected = 20
    else:
        # Best case expected: floor(101/4) = 25 -> unused = 1
        num_iterations_expected = expected_len
    ds = IncrementIterableDatasetWithLen(shape, num_tensors, dtype=dtype)
    assert len(ds) == num_tensors
    n = 0
    for n, _ in enumerate(ds):
        pass
    assert n + 1 == num_tensors
    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 ds,
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 worker_init_fn=_worker_init_fn,
                                 mode=mode)
    if mode == poptorch.DataLoaderMode.Sync:
        # In Sync mode the accessors are created by hand.
        accessor = poptorch.AsynchronousDataAccessor(loader)
        assert len(loader) == expected_len
        for n, _ in enumerate(accessor):
            pass
        assert n + 1 == num_iterations_expected
        # An accessor built directly on the dataset yields single items.
        accessor = poptorch.AsynchronousDataAccessor(ds)
        assert len(accessor) == num_tensors
        for n, _ in enumerate(accessor):
            pass
        assert n + 1 == num_tensors

    assert len(loader) == expected_len
    for n, _ in enumerate(loader):
        pass
    assert n + 1 == num_iterations_expected


# Modes are listed (not put in a set) so the parametrization order is
# explicit and identical on every run.
@pytest.mark.parametrize(
    "mode",
    [poptorch.DataLoaderMode.AsyncRebatched, poptorch.DataLoaderMode.Sync])
@pytest.mark.parametrize("DatasetType",
                         [IncrementDataset, IncrementIterableDatasetWithLen])
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
def test_leftover(mode, DatasetType, dtype):
    """Check the partial batches returned when ``drop_last=False``.

    Over two epochs, the loader must return the expected number of full
    batches plus exactly the expected partial ("left over") batches, in any
    order.
    """
    shape = [2, 3]
    num_tensors = 101
    num_workers = 7
    batch_size = 6
    # Note: Upstream torch returns the theoretical length
    # it doesn't take into account the items lost per worker.
    expected_len = math.ceil(num_tensors / batch_size)

    ds = DatasetType(shape, num_tensors, dtype=dtype)
    is_iterable = isinstance(ds, torch.utils.data.IterableDataset)
    if is_iterable and mode != poptorch.DataLoaderMode.AsyncRebatched:
        # Expected tensors
        # tensors per worker = ceil(101/7) = 15
        # last worker = 11 tensor
        # batch size = 6
        # Total = 6 * floor(15 / 6) + floor(11/6)
        #       = 6 * 2 + 1 = 13
        # Left over per worker: 3 (15 % 6), 5 (11 % 6) for the last one
        num_full_iterations_expected = 13
        left_over_batches = [5] + [3] * 6
    else:
        # Best case expected: floor(101/6) = 16 -> left over = 5
        num_full_iterations_expected = 16
        left_over_batches = [5]
    assert len(ds) == num_tensors
    n = 0
    for n, d in enumerate(ds):
        assert d.shape == torch.Size(shape)

    assert n + 1 == num_tensors
    opts = poptorch.Options()
    # Only iterable datasets need to be split between the workers by hand.
    worker_init_fn = _worker_init_fn if is_iterable else None
    loader = poptorch.DataLoader(opts,
                                 ds,
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 worker_init_fn=worker_init_fn,
                                 drop_last=False,
                                 mode=mode)

    assert len(loader) == expected_len
    num_iterations_expected = num_full_iterations_expected + len(
        left_over_batches)
    for _ in range(2):
        # There is no guarantee about the order in which
        # the full vs partial batches will be returned
        # so we need to keep track of which ones we've seen so far
        # and assert at the end.
        full_iterations_left = num_full_iterations_expected
        left_overs_left = left_over_batches.copy()

        for n, d in enumerate(loader):
            print("Dequeued tensor shape ", d.shape)
            if d.shape[0] == batch_size:
                full_iterations_left -= 1
            else:
                assert d.shape[0] in left_overs_left
                left_overs_left.remove(d.shape[0])

        assert full_iterations_left == 0
        assert not left_overs_left
        assert n + 1 == num_iterations_expected


@pytest.mark.parametrize("DatasetType",
                         [IncrementDataset, IncrementIterableDatasetWithLen])
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
@pytest.mark.parametrize("drop_last", [True, False])
@pytest.mark.parametrize("rebatched_worker_size", [1, 2, 3, 4, None])
def test_rebatched_worker_size(DatasetType, dtype, drop_last,
                               rebatched_worker_size):
    """Check AsyncRebatched loading returns every tensor exactly once.

    The loader is iterated for two epochs; in each epoch every dequeued
    element must be unique and the number of distinct elements must match
    the expected count (all tensors, or only the full batches when
    ``drop_last`` is set).
    """
    shape = [2, 3]
    num_tensors = 101
    num_workers = 7
    batch_size = 4
    ds = DatasetType(shape, num_tensors, dtype=dtype)
    worker_init_fn = (_worker_init_fn if isinstance(
        ds, torch.utils.data.IterableDataset) else None)

    # Best case expected when dropping: floor(101/4) = 25 -> unused = 1
    num_expected = (math.floor(num_tensors / batch_size) *
                    batch_size if drop_last else num_tensors)

    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 ds,
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 mode=poptorch.DataLoaderMode.AsyncRebatched,
                                 drop_last=drop_last,
                                 rebatched_worker_size=rebatched_worker_size,
                                 worker_init_fn=worker_init_fn)

    def collect_epoch():
        # Iterate the loader once and return the set of values seen,
        # checking along the way that no value is produced twice.
        seen = set()
        for batch in loader:
            if drop_last:
                assert batch.shape == torch.Size([4, 2, 3])
            for element in batch:
                value = int(element[0][0])
                assert value not in seen
                seen.add(value)
        return seen

    values = collect_epoch()
    assert len(values) == num_expected
    print("Missing tensors:")
    for missing in (i for i in range(num_tensors) if i not in values):
        print(missing)

    # Make sure it works for more than 1 epoch
    values = collect_epoch()
    assert len(values) == num_expected


def process_to_kill_asyncdataloader(iterate_over_data: bool):
    """Body of the child process interrupted by
    ``test_KeyboardInterrupt_in_async_data_accessor``.

    Creates an asynchronous dataloader, optionally iterates over it once,
    prints a ``[control]`` marker line and then sleeps, waiting to receive a
    ``KeyboardInterrupt``.

    :param iterate_over_data: When True, iterate once over the whole
        dataloader before printing the marker line.
    :raises RuntimeError: If no ``KeyboardInterrupt`` arrives during the
        final 30 second sleep.
    """
    # The imports are local because the test above ships only this
    # function's code object to a fresh interpreter.
    # pylint: disable=import-outside-toplevel
    # pylint: disable=reimported
    import time
    import poptorch
    import torch

    opts = poptorch.Options()
    opts.deviceIterations(2)
    opts.replicationFactor(1)
    features = torch.randn([100, 1, 128, 128])
    labels = torch.empty([100], dtype=torch.long).random_(10)
    dataset = torch.utils.data.TensorDataset(features, labels)
    training_data = poptorch.DataLoader(
        opts,
        dataset=dataset,
        batch_size=16,
        shuffle=True,
        drop_last=True,
        num_workers=2,
        mode=poptorch.DataLoaderMode.Async,
    )
    # Empty iteration through the data alters the state of the accessor
    if iterate_over_data:
        for _, _ in training_data:
            pass
    # Needed as a cooldown after the iteration, otherwise the accessor
    # may be in an unsafe state; this is representative of interactive
    # environments.
    time.sleep(1)
    # The parent test waits for this exact line before sending SIGINT.
    print("[control] Dataloader prepared, waiting for sigint.")

    # Expect the parent process to be force closed in the next 30 seconds
    try:
        time.sleep(30)
        raise RuntimeError(
            "We should not reach this point, we should receive SIGINT before")
    except KeyboardInterrupt:
        print("[control] KeyboardInterrupt received in parent exiting.")


@pytest.mark.parametrize("iterate_over_data", [True, False])
def test_KeyboardInterrupt_in_async_data_accessor(iterate_over_data: bool):
    """ Reproduces an error seen in Jupyter notebooks where dataloader
    Asynchronous Accessors get closed before their controller. Leading
    to error messages being spawned to the notebook command line.

    The test starts a child interpreter running
    ``process_to_kill_asyncdataloader``, reads its output until the worker
    PID and the "ready" marker have been seen, sends SIGINT to both the
    worker and the child, and finally asserts that every line printed by the
    child is either a ``[debug]`` or a ``[control]`` line.

    :args: iterate_over_data: Argument passed to
        ``process_to_kill_asyncdataloader``. Indicates whether to iterate over
        the data or not.
    :raises RuntimeError: If the ready marker or the worker PID could not be
        found in the child's output.
    """
    print("Starting subprocess")
    parent = subprocess.Popen(
        [
            sys.executable,
            "-u",  # needed to ensure messages are sent to stdout immediately
            "-c",
            f"""
import os
# Needed to capture the PID of the AsynchronousDataAccessor
os.environ["POPTORCH_LOG_LEVEL"] = "DEBUG"
import marshal, types
code = marshal.loads({marshal.dumps(process_to_kill_asyncdataloader.__code__)})
fn = types.FunctionType(code, globals(), "kill_this_process")
fn({iterate_over_data})
            """,
        ],
        universal_newlines=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    print("Subprocess started - waiting for signal")

    lines = []
    worker_pid = None
    kill_worker = False
    try:
        # Capture the PID of AsynchronousDataAccessor and wait for the signal
        # that the dataloader is ready.
        for line in parent.stdout:
            lines.append(line)
            print("Child - {}".format(line.strip("\n")))
            find_pid = re.match(
                r".*AsynchronousDataAccessor worker process: (\d+)", line)
            if find_pid:
                worker_pid = int(find_pid.group(1))
            if re.match(
                    r"\[control\] Dataloader prepared, waiting for sigint\.",
                    line):
                kill_worker = True
                break

        # Check that both the PID and the signal were caught
        if not kill_worker:
            parent.send_signal(signal.SIGINT)
            raise RuntimeError("The termination signal for the worker process "
                               "was not received.")
        if worker_pid is None:
            parent.send_signal(signal.SIGINT)
            raise RuntimeError(
                "Could not kill the AsynchronousDataAccessor, its "
                "PID could not be captured from the standard output.")

        print("Sending SIGINT to ", worker_pid)
        os.kill(worker_pid, signal.SIGINT)
        parent.send_signal(signal.SIGINT)

        # Drain whatever the child prints while shutting down.
        for line in parent.stdout:
            lines.append(line)
            print("Child - {}".format(line.strip("\n")))
    finally:
        # Always release the pipe and reap the child, on success and on
        # failure alike, so the test leaves neither an open file descriptor
        # nor a zombie process behind.
        parent.stdout.close()
        try:
            parent.wait(timeout=60)
        except subprocess.TimeoutExpired:
            # The child did not exit by itself: force it so the test
            # session cannot hang here.
            parent.kill()
            parent.wait()

    unexpected_lines = [
        line for line in lines
        if "[debug]" not in line and "[control]" not in line
    ]
    assert not unexpected_lines, "Unexpected lines in output:\n%s" % "".join(
        unexpected_lines)


class DynamicBatchSampler(torch.utils.data.Sampler):
    """Batch sampler yielding batches of cyclically growing size.

    Batches contain 1, 2, ..., ``batch_size`` indices, after which the size
    starts again from 1. The last batch holds whatever indices are left.
    """

    def __init__(self, sampler, batch_size):
        super().__init__(None)
        self.sampler = sampler
        self.batch_size = batch_size

    def __iter__(self):
        current = []
        # Number of indices the batch under construction must reach.
        target = 1
        for sample in self.sampler:
            if len(current) == target:
                yield current
                current = []
                # Grow the next batch by one, wrapping after batch_size.
                target = 1 if target == self.batch_size else target + 1
            current.append(sample)

        if current:
            yield current

    @functools.lru_cache(None)
    def __len__(self):
        # One full cycle is made of batch_size batches holding
        # 1 + 2 + ... + batch_size indices.
        cycle_elems = ((self.batch_size + 1) * self.batch_size) // 2
        full_cycles, remainder = divmod(len(self.sampler), cycle_elems)
        count = full_cycles * self.batch_size

        if remainder == 0:
            return count

        # Walk down the partial sums 1 + ... + bin_size to find how many
        # batches the remaining indices fill (plus a partial one if any).
        filled = cycle_elems - self.batch_size
        for bin_size in range(self.batch_size - 1, 0, -1):
            if remainder == filled:
                return count + bin_size
            if remainder > filled:
                return count + bin_size + 1
            filled -= bin_size

        return count


class DynamicRandomBatchSampler(torch.utils.data.Sampler):
    """Batch sampler yielding batches of pseudo-random size.

    Each batch holds between 1 and ``batch_size`` consecutive indices from
    the wrapped sampler. The global ``random`` generator is re-seeded with
    ``batch_size`` at the start of every iteration.
    """

    def __init__(self, sampler, batch_size):
        super().__init__(None)
        self.sampler = sampler
        self.batch_size = batch_size

    def __iter__(self):
        random.seed(self.batch_size)
        remaining = len(self.sampler)
        source = iter(self.sampler)

        while remaining:
            # Never ask for more indices than the sampler still holds.
            upper_bound = min(remaining, self.batch_size)
            batch_length = random.randint(1, upper_bound)
            yield [next(source) for _ in range(batch_length)]
            remaining -= batch_length


class CustomBatch:
    """Plain container pairing a batch of data with its labels."""

    def __init__(self, data, label):
        self.data, self.label = data, label


class CustomBatchParser(poptorch.ICustomArgParser):
    """Argument parser flattening a ``CustomBatch`` to tensors and back."""

    def yieldTensors(self, struct):
        # Order matters: reconstruct() feeds the tensors back positionally.
        for field in ("data", "label"):
            yield getattr(struct, field)

    def reconstruct(self, original_structure, tensor_iterator):
        structure_type = type(original_structure)
        return structure_type(*tensor_iterator)


# Register the parser for CustomBatch objects.
poptorch.registerCustomArgParser(CustomBatch, CustomBatchParser())


class DynamicPadCollateFunction():
    """Collate function padding every batch up to ``batch_size`` samples.

    Only samples which are tuples of tensors are supported. Missing samples
    are replaced by zero-filled float32 tensors shaped like the
    corresponding element of the first sample.
    """

    def __init__(self, batch_size, return_type=None):
        self.batch_size = batch_size
        self.return_type = return_type

    def __call__(self, collate_data_list):
        first_sample = collate_data_list[0]
        if not isinstance(first_sample, tuple):
            raise NotImplementedError()

        num_padding = self.batch_size - len(collate_data_list)
        columns = []
        for position, reference in enumerate(first_sample):
            # Gather the position-th element of every sample, then pad.
            column = [sample[position] for sample in collate_data_list]
            column += [
                torch.full(reference.shape, 0, dtype=torch.float32)
                for _ in range(num_padding)
            ]
            columns.append(torch.stack(column))

        if self.return_type in (None, tuple):
            return tuple(columns)
        return self.return_type(*columns)


@pytest.mark.parametrize("batch_size", [2, 3])
@pytest.mark.parametrize("device_iteration", [1, 4, 5])
@pytest.mark.parametrize("drop_last", [True, False])
@pytest.mark.parametrize("num_workers", [1, 10])
def test_batch_sampler_basic(batch_size, device_iteration, drop_last,
                             num_workers):
    """Check the content of the batches produced with a ``BatchSampler``.

    Every batch returned by the loader is compared against the tensor
    expected for a sequentially sampled ``IncrementDataset`` combined over
    ``device_iteration`` sampler batches.
    """
    shape = [2, 3]
    dataset_size = 100
    dtype = torch.float32
    combined_batch_size = batch_size * device_iteration

    num_complete, tail_size = divmod(dataset_size, combined_batch_size)
    has_partial_tail = tail_size != 0 and not drop_last
    expected_num_batches = num_complete + (1 if has_partial_tail else 0)

    dataset = IncrementDataset(shape, dataset_size, dtype)
    simple_batch_sampler = torch.utils.data.BatchSampler(
        torch.utils.data.SequentialSampler(dataset),
        batch_size=batch_size,
        drop_last=drop_last)
    opts = poptorch.Options().deviceIterations(device_iteration)
    loader = poptorch.DataLoader(opts,
                                 dataset,
                                 batch_sampler=simple_batch_sampler,
                                 drop_last=drop_last,
                                 num_workers=num_workers)

    def expected_batch(batch_id, expected_batch_size):
        # Tensor number k of the dataset is expected to be filled with k.
        first_value = batch_id * combined_batch_size
        rows = [
            torch.full(shape, first_value + offset, dtype=dtype)
            for offset in range(expected_batch_size)
        ]
        return torch.stack(rows)

    batches = list(loader)
    assert len(batches) == expected_num_batches

    full_batches = batches[:-1] if has_partial_tail else batches

    batch_id = -1
    for batch_id, batch in enumerate(full_batches):
        assert torch.equal(batch, expected_batch(batch_id,
                                                 combined_batch_size))

    if has_partial_tail:
        assert torch.equal(batches[-1],
                           expected_batch(batch_id + 1, tail_size))


def get_item(batch, item, return_type):
    """Return the ``"data"`` or ``"label"`` part of a batch.

    For tuple batches the data is element 0 and anything else maps to
    element 1; for ``CustomBatch`` batches ``item`` is used as the attribute
    name. ``None`` is returned for any other ``return_type``.
    """
    if return_type == tuple:
        return batch[0] if item == "data" else batch[1]

    if return_type == CustomBatch:
        return getattr(batch, item)

    return None


@pytest.mark.parametrize("batch_size", [4, 11])
@pytest.mark.parametrize("device_iteration", [1, 5])
@pytest.mark.parametrize("num_workers", [0, 3])
@pytest.mark.parametrize("return_type", [tuple, CustomBatch])
@pytest.mark.parametrize("drop_last", [True, False])
@pytest.mark.parametrize("mode", [
    poptorch.DataLoaderMode.Sync, poptorch.DataLoaderMode.Async,
    poptorch.DataLoaderMode.AsyncRebatched
])
def test_custom_batch_sampler(batch_size, device_iteration, num_workers,
                              return_type, drop_last, mode):
    """Check batch count and shapes when using ``DynamicBatchSampler``.

    The sampler produces variable sized batches which the collate function
    pads to ``batch_size``; the loader then combines ``device_iteration`` of
    them into each returned batch.
    """
    shape = [3, 1]
    dataset_size = 149

    dataset = IncrementDatasetWithLabels(shape, dataset_size)
    dynamic_batch_sampler = DynamicBatchSampler(
        torch.utils.data.SequentialSampler(dataset), batch_size=batch_size)

    expected_num_batches, incomplete_batches = divmod(
        len(dynamic_batch_sampler), device_iteration)

    last_incomplete = not drop_last and incomplete_batches != 0
    if last_incomplete:
        expected_num_batches += 1

    collate_fn = DynamicPadCollateFunction(batch_size, return_type)
    opts = poptorch.Options().deviceIterations(device_iteration)

    loader = poptorch.DataLoader(opts,
                                 dataset,
                                 batch_sampler=dynamic_batch_sampler,
                                 collate_fn=collate_fn,
                                 num_workers=num_workers,
                                 drop_last=drop_last,
                                 mode=mode)

    batches = list(loader)
    assert len(batches) == expected_num_batches

    def check_shapes(batch, leading_dim):
        # Data keeps the per-sample shape, labels are one value per sample.
        data = get_item(batch, "data", return_type)
        assert data.shape == torch.Size([leading_dim] + shape)
        label = get_item(batch, "label", return_type)
        assert label.shape == torch.Size([leading_dim, 1])

    combined_batch_size = batch_size * device_iteration
    full_batches = batches[:-1] if last_incomplete else batches
    for batch in full_batches:
        check_shapes(batch, combined_batch_size)

    if last_incomplete:
        check_shapes(batches[-1], incomplete_batches * batch_size)


@pytest.mark.parametrize("batch_size", [1, 4])
@pytest.mark.parametrize("drop_last", [True, False])
def test_default_batch_sampler(batch_size, drop_last):
    """Check batch count and shapes of an Async loader using the default
    batch sampler together with a padding collate function."""
    device_iteration = 1
    num_workers = 4
    return_type = tuple
    mode = poptorch.DataLoaderMode.Async

    shape = [3, 1]

    # pseudo random value for number of expected batches produced by dataloader
    expected_num_batches = batch_size + device_iteration + num_workers
    incomplete_batches = 3 % device_iteration

    # Size the dataset so that it yields exactly the batches expected above.
    combined_batch_size = batch_size * device_iteration
    dataset_size = (expected_num_batches * combined_batch_size +
                    incomplete_batches * batch_size)

    last_incomplete = not drop_last and incomplete_batches != 0
    if last_incomplete:
        expected_num_batches += 1

    collate_fn = DynamicPadCollateFunction(batch_size, return_type)
    opts = poptorch.Options().deviceIterations(device_iteration)

    dataset = IncrementDatasetWithLabels(shape, dataset_size)

    loader = poptorch.DataLoader(opts,
                                 dataset,
                                 batch_size=batch_size,
                                 collate_fn=collate_fn,
                                 num_workers=num_workers,
                                 drop_last=drop_last,
                                 mode=mode)

    batches = list(loader)
    assert len(batches) == expected_num_batches

    def check_shapes(batch, leading_dim):
        # Data keeps the per-sample shape, labels are one value per sample.
        data = get_item(batch, "data", return_type)
        assert data.shape == torch.Size([leading_dim] + shape)
        label = get_item(batch, "label", return_type)
        assert label.shape == torch.Size([leading_dim, 1])

    full_batches = batches[:-1] if last_incomplete else batches
    for batch in full_batches:
        check_shapes(batch, combined_batch_size)

    if last_incomplete:
        check_shapes(batches[-1], incomplete_batches * batch_size)


@pytest.mark.parametrize("device_iteration", [1, 5])
@pytest.mark.parametrize("num_workers", [0, 3])
def test_custom_batch_sampler_non_deterministic_len(device_iteration,
                                                    num_workers):
    """Check a batch sampler without ``__len__`` can drive the loader.

    ``DynamicRandomBatchSampler`` yields batches of varying size; every
    batch returned by the loader must nevertheless have the full padded
    shape.
    """
    shape = [2, 1]
    dataset_size = 111
    batch_size = 13

    dataset = IncrementDatasetWithLabels(shape, dataset_size)
    sequential = torch.utils.data.SequentialSampler(dataset)
    dynamic_batch_sampler = DynamicRandomBatchSampler(sequential,
                                                      batch_size=batch_size)

    collate_fn = DynamicPadCollateFunction(batch_size, CustomBatch)
    opts = poptorch.Options().deviceIterations(device_iteration)
    loader = poptorch.DataLoader(opts,
                                 dataset,
                                 batch_sampler=dynamic_batch_sampler,
                                 collate_fn=collate_fn,
                                 num_workers=num_workers)

    batches = list(loader)
    assert len(batches) > 0

    leading_dim = batch_size * device_iteration
    expected_data_full_size = torch.Size([leading_dim] + shape)
    expected_labels_full_size = torch.Size([leading_dim, 1])

    for custom_batch in batches:
        assert custom_batch.data.shape == expected_data_full_size
        assert custom_batch.label.shape == expected_labels_full_size


================================================
FILE: tests/debug_tensors_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import os
import re
import tempfile
import pytest
import torch
import helpers
import poptorch


class Model(torch.nn.Module):
    """Two linear layers separated by a ReLU, with an MSE loss.

    In training mode ``forward`` returns the loss against ``labels``;
    otherwise it returns the output of the second linear layer.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(10, 10)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(10, 10)
        self.loss = torch.nn.MSELoss(reduction="mean")

    def forward(self, x, labels=None):
        hidden = self.relu(self.fc1(x))
        prediction = self.fc2(hidden)
        if not self.training:
            return prediction
        return self.loss(prediction, labels)


def test_tensor_names():
    """Check the tensor names reported by a compiled training model.

    ``getTensorNames`` must assert before the model has been run and,
    afterwards, return names covering both layers and a set of expected
    prefixes.
    """
    poptorch_model = poptorch.trainingModel(Model())
    x = torch.rand(10, 10)
    target = torch.rand(10, 10)

    with pytest.raises(AssertionError):
        poptorch_model.getTensorNames()

    poptorch_model(x, target)
    tensors = poptorch_model.getTensorNames()

    # The layer names must appear as whole words somewhere in a name.
    for layer_pattern in (r"\bfc1\b", r"\bfc2\b"):
        assert any(re.search(layer_pattern, name) for name in tensors)

    # At least one tensor name must start with each of these prefixes.
    for prefix in ('input', 'loss', 'Gradient___', 'UpdatedVar__',
                   'scaledLearningRate', 'weightDecayScaleFactor'):
        assert any(name.startswith(prefix) for name in tensors)


@pytest.mark.ipuHardwareRequired
def test_tensor_names_from_precompiled_model():
    """Check tensor names are available on a model reloaded from file.

    The model is compiled, saved and destroyed; ``getTensorNames`` must then
    assert on the destroyed model and work again on the reloaded one.
    """
    with tempfile.TemporaryDirectory() as tmp:
        filename = os.path.join(tmp, "model.poptorch")
        poptorch_model = poptorch.trainingModel(Model())
        x = torch.rand(10, 10)
        target = torch.rand(10, 10)

        # Running the model will trigger the executable compilation
        poptorch_model(x, target)
        # Save the executable and destroy the model
        poptorch_model.save(filename)
        poptorch_model.destroy()

        with pytest.raises(AssertionError):
            poptorch_model.getTensorNames()

        # Reload the model from file.
        poptorch_model = poptorch.load(filename)

        tensors = poptorch_model.getTensorNames()

        # The layer names must appear as whole words somewhere in a name.
        for layer_pattern in (r"\bfc1\b", r"\bfc2\b"):
            assert any(re.search(layer_pattern, name) for name in tensors)

        # At least one tensor name must start with each of these prefixes.
        for prefix in ('input', 'loss', 'weightDecayScaleFactor',
                       'scaledLearningRate'):
            assert any(name.startswith(prefix) for name in tensors)


def test_tensor_values():
    """Check anchored tensors can be read back and change between steps."""
    model = Model()

    opts = poptorch.Options()
    opts.anchorTensor('grad_bias', 'Gradient___fc2.bias')
    opts.anchorTensor('update_weight', 'UpdatedVar___fc2.weight')
    poptorch_model = poptorch.trainingModel(model, opts)

    def step_and_read_anchors():
        # Run one training step on fresh random data, then fetch and
        # shape-check both anchored tensors.
        x = torch.rand(10, 10)
        target = torch.rand(10, 10)
        poptorch_model(x, target)

        grad = poptorch_model.getAnchoredTensor('grad_bias')
        assert grad.shape == (10, )
        update = poptorch_model.getAnchoredTensor('update_weight')
        assert update.shape == (10, 10)
        return grad, update

    grad1, update1 = step_and_read_anchors()
    grad2, update2 = step_and_read_anchors()

    assert not torch.equal(grad1, grad2)
    assert not torch.equal(update1, update2)


# Parameter sets for the "mode, period, expected_str" parametrization:
# the output mode to anchor a tensor with, the period passed alongside it,
# and the text expected after the tensor name in the debug log.
output_modes = [[poptorch.OutputMode.All, 3, "ALL/1"],
                [poptorch.OutputMode.EveryN, 4, "EVERYN/4"],
                [poptorch.OutputMode.Final, 1, "FINAL/1"],
                [poptorch.OutputMode.Sum, 1, "Sum/1"]]


@pytest.mark.parametrize("mode, period, expected_str", output_modes)
@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_tensor_modes(capfd, mode, period, expected_str):
    """Check the anchor's output mode and period show up in the debug log."""
    model = Model()
    tensor_name = 'Gradient___fc2.bias'

    opts = poptorch.Options()
    opts.anchorTensor('grad_bias', tensor_name, mode, period)
    poptorch_model = poptorch.trainingModel(model, opts)

    x = torch.rand(10, 10)
    target = torch.rand(10, 10)
    poptorch_model(x, target)

    # The log is expected to contain "<tensor name> <mode>/<period>".
    expected_entry = ' '.join((tensor_name, expected_str))
    helpers.LogChecker(capfd).assert_contains(expected_entry)


================================================
FILE: tests/distance_ops_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import torch
import pytest
import helpers
import poptorch


# A tuple (not a set) keeps the order of the generated test cases defined.
@pytest.mark.parametrize("norm", (1., 2., 3., 4.))
def test_pairwise_distance(norm):
    """Compare ``torch.nn.PairwiseDistance`` on IPU against the CPU result.

    :param norm: Value passed to ``torch.nn.PairwiseDistance``.
    """
    torch.manual_seed(42)

    size = [10, 5]
    input1 = torch.randn(size)
    input2 = torch.randn(size)
    shape = input1.shape

    model = helpers.ModelWithWeights(torch.nn.PairwiseDistance(norm), shape)
    poptorch_model = poptorch.trainingModel(model)

    # Run on CPU
    native_out, _ = model((input1, input2))

    # Run on IPU
    poptorch_out, _ = poptorch_model((input1, input2))

    # Inference test - check outputs
    helpers.assert_allclose(expected=native_out, actual=poptorch_out)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()


# A tuple (not a set) keeps the order of the generated test cases defined.
@pytest.mark.parametrize("dim", (0, 1))
def test_cosine_similarity(dim):
    """Compare ``torch.nn.CosineSimilarity`` on IPU against the CPU result.

    :param dim: Value passed to ``torch.nn.CosineSimilarity``.
    """
    torch.manual_seed(42)

    size = [10, 5]
    input1 = torch.randn(size)
    input2 = torch.randn(size)
    shape = input1.shape

    model = helpers.ModelWithWeights(torch.nn.CosineSimilarity(dim), shape)
    poptorch_model = poptorch.trainingModel(model)

    # Run on CPU
    native_out, _ = model((input1, input2))

    # Run on IPU
    poptorch_out, _ = poptorch_model((input1, input2))

    # Inference test - check outputs
    helpers.assert_allclose(expected=native_out, actual=poptorch_out)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()


@pytest.mark.parametrize("input_shapes",
                         (((3, 2), (2, 2)), ((3, 2, 3), (3, 10, 3)),
                          ((3, 5, 2, 7), (5, 11, 7)), ((3, 5, 1, 2, 7),
                                                       (3, 1, 10, 11, 7))))
@pytest.mark.parametrize("p", (2, 3))
def test_cdist(input_shapes, p):
    """Compare ``torch.cdist`` on IPU against the CPU result.

    :param input_shapes: Pair of shapes for the two random inputs.
    :param p: Third argument passed to ``torch.cdist``.
    """
    torch.manual_seed(42)

    class Cdist(torch.nn.Module):
        # Thin module wrapper so torch.cdist can be handed to the harness.
        def __init__(self, p, *args, **kwargs) -> None:
            super().__init__(*args, **kwargs)
            self.p = p

        def forward(self, x, y):
            return torch.cdist(x, y, self.p)

    # Draw the first input before the second one, in that order.
    a, b = (torch.rand(*dims) for dims in input_shapes)
    inputs = (a, b)

    model = helpers.ModelWithWeights(Cdist(p), a.shape)

    # Run on CPU
    native_out, _ = model(inputs)

    poptorch_model = poptorch.trainingModel(model)

    # Run on IPU
    poptorch_out, _ = poptorch_model(inputs)

    # Inference test - check outputs
    helpers.assert_allclose(expected=native_out, actual=poptorch_out)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()


================================================
FILE: tests/exception_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import pytest
import torch
import poptorch


def harness(setting, Model, args):
    """Run ``Model`` on ``args`` with a given floating point exception mode.

    :param setting: ``"true"`` or ``"false"`` to explicitly enable or
        disable floating point exceptions; any other value leaves the
        option untouched.
    :param Model: Model class to instantiate for inference.
    :param args: Positional arguments the model is called with.

    When ``setting`` is ``"true"`` the call must raise ``poptorch.Error``.
    """
    opts = poptorch.Options()
    if setting in ("true", "false"):
        opts.Precision.enableFloatingPointExceptions(setting == "true")

    poptorch_model = poptorch.inferenceModel(Model(), opts)

    if setting != "true":
        poptorch_model(*args)
        return

    with pytest.raises(poptorch.Error):
        poptorch_model(*args)


@pytest.mark.ipuHardwareRequired
# A tuple, not a set: iteration order of a set of strings can differ from
# one interpreter run to the next, which makes test collection unstable.
@pytest.mark.parametrize("setting", ("default", "true", "false"))
def test_div0(setting):
    """Divide a tensor of ones by a tensor of zeros on the IPU.

    :param setting: Floating point exception mode forwarded to ``harness``.
    """
    class Model(torch.nn.Module):
        def forward(self, x, y):
            return x / y

    x = torch.ones(10, 10)
    y = torch.zeros(10, 10)
    harness(setting, Model, [x, y])


@pytest.mark.ipuHardwareRequired
# A tuple, not a set: iteration order of a set of strings can differ from
# one interpreter run to the next, which makes test collection unstable.
@pytest.mark.parametrize("setting", ("default", "true", "false"))
def test_mul0inf(setting):
    """Multiply a tensor of zeros by the result of ``1 / 0`` on the IPU.

    :param setting: Floating point exception mode forwarded to ``harness``.
    """
    class Model(torch.nn.Module):
        def forward(self, x, y):
            return x * y

    x = torch.zeros(10, 10)
    y = torch.div(torch.ones(10, 10), torch.zeros(10, 10))
    harness(setting, Model, [x, y])


@pytest.mark.ipuHardwareRequired
# A tuple, not a set: iteration order of a set of strings can differ from
# one interpreter run to the next, which makes test collection unstable.
@pytest.mark.parametrize("setting", ("default", "true", "false"))
def test_nonreal(setting):
    """Take the square root of negative values on the IPU.

    :param setting: Floating point exception mode forwarded to ``harness``.
    """
    class Model(torch.nn.Module):
        def forward(self, x):
            return torch.sqrt(x)

    x = torch.Tensor([-1, -2])
    harness(setting, Model, [x])


# A tuple, not a set: iteration order of a set of strings can differ from
# one interpreter run to the next, which makes test collection unstable.
@pytest.mark.parametrize("setting", ("default", "true", "false"))
@pytest.mark.ipuHardwareRequired
def test_nan(setting):
    """Compare a tensor of ones against the result of ``0 / 0`` on the IPU.

    :param setting: Floating point exception mode forwarded to ``harness``.
    """
    class Model(torch.nn.Module):
        def forward(self, x, y):
            return x > y

    x = torch.ones(10, 10)
    y = torch.div(torch.zeros(10, 10), torch.zeros(10, 10))
    harness(setting, Model, [x, y])


# A tuple, not a set: iteration order of a set of strings can differ from
# one interpreter run to the next, which makes test collection unstable.
@pytest.mark.parametrize("setting", ("default", "true", "false"))
@pytest.mark.ipuHardwareRequired
def test_ovf(setting):
    """Compute the exponential of very large values on the IPU.

    :param setting: Floating point exception mode forwarded to ``harness``.
    """
    class Model(torch.nn.Module):
        def forward(self, x):
            return torch.exp(x)

    x = torch.Tensor([3800, 4203])
    harness(setting, Model, [x])


================================================
FILE: tests/fine_tuning_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
import copy
import os  # pylint: disable=unused-import
import unittest.mock
import torch
import torchvision.models as models
import helpers
import poptorch


def fine_tuning_harness(imagenet_model):
    """Fine-tune only the final classifier of a model and verify it.

    All parameters of the base model are frozen, its ``fc`` layer is
    replaced by a fresh trainable linear layer, and the model is trained
    for a few steps. Afterwards only the ``fc`` parameters may have changed.

    :param imagenet_model: Callable building the base model; it is called
        with ``pretrained=False`` and the result must expose an ``fc``
        linear layer.
    """
    torch.manual_seed(42)

    num_classes = 2
    num_examples = 2
    num_epochs = 20

    data = torch.randn((num_examples, 3, 224, 224))
    target = torch.randint(0, num_classes, (num_examples, ))

    base_model = imagenet_model(pretrained=False)

    loss_fn = torch.nn.CrossEntropyLoss()

    class ModelWithLoss(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.base_model = base_model

        def forward(self, data, target):
            # Use the registered submodule rather than the variable of the
            # enclosing function, so that a copy of this module runs its own
            # network instead of the original one.
            out = self.base_model(data)
            loss = loss_fn(out, target)
            return out, loss

    model = ModelWithLoss()

    for param in model.base_model.parameters():
        param.requires_grad = False

    # Change the linear classifier at the top.
    model.base_model.fc = torch.nn.Linear(model.base_model.fc.in_features,
                                          num_classes)
    for param in model.base_model.fc.parameters():
        assert param.requires_grad
    # Snapshot taken from a deep copy so training cannot alter it.
    initial_params = copy.deepcopy(model).state_dict()

    # Fine tune.
    optim = torch.optim.SGD(model.base_model.fc.parameters(), lr=0.001)

    poptorch_model = poptorch.trainingModel(model, optimizer=optim)

    for _ in range(num_epochs):
        _ = poptorch_model(data, target)

    # Assert only the last layer was changed.
    for name, param in model.named_parameters():
        if name.startswith('base_model.fc'):
            assert not torch.allclose(param.data, initial_params[name])
        else:
            helpers.assert_allclose(actual=param.data,
                                    expected=initial_params[name])


@unittest.mock.patch.dict("os.environ", helpers.disableSmallModel())
def test_resnet18():
    """Run the fine-tuning harness on torchvision's ResNet-18.

    ``os.environ`` is patched for the duration of the test with the mapping
    returned by ``helpers.disableSmallModel()``.
    """
    fine_tuning_harness(models.resnet18)


================================================
FILE: tests/functional_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import torch
import pytest
import helpers
import poptorch


def test_one_hot():
    """Compare ``one_hot`` on IPU against the CPU result."""
    class Model(torch.nn.Module):
        def forward(self, x):
            return torch.nn.functional.one_hot(x, num_classes=10)

    indices = torch.randint(high=10, size=[10, 5, 4])
    model = Model()

    # Run on CPU.
    native_out = model(indices)

    # Run on IPU.
    poptorch_out = poptorch.inferenceModel(model)(indices)

    # The IPU output is converted to long before comparing.
    helpers.assert_allequal(actual=poptorch_out.long(), expected=native_out)


def test_one_hot_invalid():
    """Check ``one_hot`` with ``num_classes=-1`` is rejected on the IPU."""
    class Model(torch.nn.Module):
        def forward(self, x):
            return torch.nn.functional.one_hot(x, num_classes=-1)

    indices = torch.randint(high=10, size=[10])
    model = Model()

    # Run on IPU: either wrapping or running the model must raise.
    with pytest.raises(
            poptorch.Error,
            match="OneHot num classes must be specified and must be constant."
    ):
        poptorch_model = poptorch.inferenceModel(model)
        poptorch_model(indices)


def test_one_hot_casted():
    """Compare ``one_hot`` followed by a cast to half against the CPU."""
    class Model(torch.nn.Module):
        def forward(self, x):
            encoded = torch.nn.functional.one_hot(x, num_classes=10)
            return encoded.half()

    indices = torch.randint(high=10, size=[10, 5, 4])
    model = Model()

    # Run on CPU.
    native_out = model(indices)

    # Run on IPU.
    poptorch_out = poptorch.inferenceModel(model)(indices)

    assert poptorch_out.dtype == torch.half
    helpers.assert_allequal(actual=poptorch_out, expected=native_out)


@pytest.mark.parametrize("in_features,out_features", [(8, 7), (7, 6), (6, 5)])
def test_linear(in_features, out_features):
    """Compare ``torch.nn.functional.linear`` on IPU against the CPU."""
    class Model(torch.nn.Module):
        weight: torch.Tensor
        bias: torch.Tensor

        def __init__(self, in_features: int, out_features: int):
            super().__init__()
            # Both parameters are initialised to ones.
            initial_weight = torch.ones((out_features, in_features),
                                        dtype=torch.float)
            self.weight = torch.nn.parameter.Parameter(initial_weight)
            self.bias = torch.nn.parameter.Parameter(torch.ones(out_features))

        def forward(self, x):
            return torch.nn.functional.linear(x, self.weight, self.bias)

    num_values = out_features * in_features
    x = torch.arange(num_values, dtype=torch.float)
    x = x.reshape(out_features, in_features)
    model = Model(in_features=in_features, out_features=out_features)

    # Run on CPU.
    native_out = model(x)

    # Run on IPU.
    poptorch_out = poptorch.inferenceModel(model)(x)

    assert poptorch_out.dtype == torch.float
    helpers.assert_allclose(actual=poptorch_out, expected=native_out)


================================================
FILE: tests/generate_test_file.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import argparse
import contextlib
import os
import io
import re
import sys
import pytest

# Command line interface: two positional arguments (the folder containing the
# tests and the CTestTestfile.cmake to write) plus a few optional settings.
parser = argparse.ArgumentParser(description="Generate CTestTestfile.cmake")
parser.add_argument("test_dir", help="Path to the folder containing the tests")
parser.add_argument("output_file", help="Path to CTestTestfile.cmake")
parser.add_argument("--add-to-sys-path", help="Path to add to sys.path")
parser.add_argument("--external-datasets-dir",
                    type=str,
                    help=("The directory where the external datasets will be "
                          "downloaded."))
parser.add_argument("--extra-pytest-args",
                    type=str,
                    help=("Extra arguments to pass to pytest when generating "
                          "the list of tests."))

args = parser.parse_args()

# --add-to-sys-path may hold several paths separated by ";". Each one is
# inserted at the front of sys.path, so the last path listed ends up first.
if args.add_to_sys_path:
    for path in args.add_to_sys_path.split(";"):
        print(f"Adding {path}")
        sys.path.insert(0, path)

# This script doesn't actually need poptorch, but pytest later on will import
# it while compiling the list of tests and if it fails then we usually don't
# get the reason (Because the collection happens in a subprocess).
import poptorch  # pylint: disable=unused-import,wrong-import-position

# Collect the list of tests:
# pytest is run in-process in "collect only" mode and everything it prints is
# captured in list_tests, to be parsed further down.
list_tests = io.StringIO()
pytest_args = [
    "-x",
    args.test_dir,
    "--collect-only",
    "-q",
]

# Extra pytest arguments: also forwarded to every generated test command.
extra_args = []

if args.extra_pytest_args:
    # Double quotes are stripped and the remainder is split on commas.
    arg = args.extra_pytest_args.replace("\"", "")
    if arg:
        extra_args = arg.split(",")
        pytest_args += extra_args

with contextlib.redirect_stdout(list_tests):
    retval = pytest.main(pytest_args)

# Abort (showing the captured pytest output) if the collection failed.
assert retval == pytest.ExitCode.OK, f"{str(retval)}: {list_tests.getvalue()}"

# Run all the tests contained in these files in a single process
# because they're small / short to run (Under 1 minute)
# NB tests requiring custom_ops libraries must go in here
#pylint: disable=line-too-long
# yapf: disable
short_tests = [
    "activations_test.py",
    "batching_test.py",
    "blas_test.py",
    "buffers_test.py",
    "custom_loss_test.py",
    "custom_ops_attributes_test.py",
    "custom_ops_test.py",
    "inputs_test.py",
    "loop_test.py",
    "if_test.py",
    "lstm_test.py",
    "non_contiguous_tensors_test.py",
    "ops_test.py",
    "options_test.py",
    "outputs_test.py",
    "pipelining_test.py",
    "poplar_executor_test.py",
    "precompilation_test.py",
    "random_sampling_test.py",
    "replicated_graph_test.py",
    "requires_grad_test.py",
    "sharding_test.py",
    # NOTE(review): the per-test loop further down compares the
    # os.path.basename() of each collected file against this list, so the
    # "gnn/..." entries below look like they never match there and the tests
    # of these files would also be registered one by one - confirm this is
    # intended.
    "gnn/test_cluster_loader.py",
    "gnn/test_collate.py",
    "gnn/test_dataloader.py",
    "gnn/test_fixed_size_options.py",
    "gnn/test_masker.py",
    "gnn/test_model_args.py",
    "gnn/test_stream_packing_sampler.py",
]

# The only tests that should be run in doc-only builds: every test collected
# from one of these files is given the "docs_only" label.
docs_only_test_files = [
    "test_doc_urls.py"
]

# Individual tests ("<file name>::<test name>") given the "long" label.
# Every entry must end with a comma: adjacent string literals are silently
# concatenated into a single entry otherwise.
long_tests = [
    "bert_small_and_medium_test.py::test_bert_medium_result",
    "half_test.py::test_resnet",
    "math_ops_test.py::test_sort[descending:True-shape1]",
    "math_ops_test.py::test_sort[descending:False-shape1]",
    "io_performance_test.py::test_compare_io_performance",
    "torch_nn_test.py::test_pytorch_nn[trace_model:False-use_half:False-test_name:test_nn_Conv2d_circular_stride2_pad2]",
    "torch_nn_test.py::test_pytorch_nn[trace_model:True-use_half:False-test_name:test_nn_Conv2d_circular_stride2_pad2]",
    "torchvision_inference_test.py::test_googlenet",
    "torchvision_inference_test.py::test_inception_v3",
    "torchvision_inference_test.py::test_mnasnet1_0",
    "torchvision_inference_test.py::test_mobilenet_v2",
    "torchvision_inference_test.py::test_resnet18",
    "torchvision_inference_test.py::test_resnext50_32x4d",
    "torchvision_inference_test.py::test_squeezenet1_1",
]

# Tests depending on external data being downloaded to run.
# NOTE(review): the generation loop looks entries up by
# "<file name>::<test name>", so the bare file entry "gnn/test_schnet.py"
# does not appear to match any collected test - confirm whether the tests of
# that file are meant to get the "external_data" label.
external_data_tests = [
    "bert_small_and_medium_test.py::test_bert_medium_result",
    "bert_small_and_medium_test.py::test_bert_small",
    "bert_small_and_medium_test.py::test_bert_small_half",
    "gnn/test_schnet.py",
]
# yapf: enable

# Tests that cannot run in parallel with other tests
# Note: these are files, not tests
serial_test_files = [
    "attach_detach_test.py",  # Needs specific IPUs
    "attach_detach_wait_for_ipu_test.py",  # Needs specific IPUs
    "fine_tuning_test.py",  # Takes too much memory for the AWS builders.
    "io_performance_test.py",  # Measures performance
]
# Individual tests ("<file name>::<test name>") that must also run serially.
serial_tests = [
    "half_test.py::test_resnet",
]
# C++ test executables, each registered through add_cpp_test().
cpp_tests = ["cpp/GNNOptimizationsTest"]
#pylint: enable=line-too-long


def add_pytest(output, test, root_folder, folder, test_id, test_properties,
               extra_args):
    """Write the CTest commands registering one pytest invocation.

    Two commands are written to ``output``: an ``add_test`` running
    ``python3 -m pytest -sv`` on ``folder/test`` through
    ``root_folder/timeout_handler.py``, followed by a
    ``set_tests_properties`` listing ``test_properties``.

    Args:
        output: Writable text stream receiving the commands.
        test: Name of the CTest test, also the path given to pytest
            (relative to ``folder``).
        root_folder: Folder containing ``timeout_handler.py``.
        folder: Folder ``test`` is relative to.
        test_id: Number used to build the name of the junit XML report.
        test_properties: Mapping of CTest property names to values.
        extra_args: Additional pytest arguments; each one is quoted.
    """
    quoted_extra = " ".join(f"\"{extra_arg}\"" for extra_arg in extra_args)
    command = (
        f"add_test({test} \"{root_folder}/timeout_handler.py\" \"python3\""
        f" \"-m\" \"pytest\" \"-sv\" \"{folder}/{test}\" "
        f"\"--external-datasets-dir={args.external_datasets_dir}\" "
        f"\"--junitxml=junit/junit-test{test_id}.xml\" {quoted_extra})\n")
    output.write(command)

    properties = " ".join(f"{name} {value}"
                          for name, value in test_properties.items())
    output.write(f"set_tests_properties({test} PROPERTIES\n{properties})\n")


def add_cpp_test(output, test, root_folder, folder, test_properties):
    """Write the CTest commands registering one C++ test executable.

    Two commands are written to ``output``: an ``add_test`` running
    ``folder/test`` through ``root_folder/timeout_handler.py``, followed by a
    ``set_tests_properties`` listing ``test_properties``.

    Args:
        output: Writable text stream receiving the commands.
        test: Name of the CTest test and of the executable inside ``folder``.
        root_folder: Folder containing ``timeout_handler.py``.
        folder: Folder containing the executable.
        test_properties: Mapping of CTest property names to values.
    """
    handler = f"\"{root_folder}/timeout_handler.py\""
    executable = f"\"{folder}/{test}\""
    output.write(f"add_test({test} {handler} {executable} )\n")

    properties = " ".join(f"{name} {value}"
                          for name, value in test_properties.items())
    output.write(f"set_tests_properties({test} PROPERTIES\n{properties})\n")


# Every generated test uses the directory this script was started from as
# its working directory.
work_dir = os.getcwd()

# Write the CTest file: first one entry per "short" test file, then one entry
# per test collected by pytest, then one entry per C++ test.
with open(args.output_file, "w") as output:
    # Running counter used to give each pytest entry its own junit report.
    test_id = 0
    # Add the short_tests files
    for test in short_tests:
        add_pytest(output, test, args.test_dir, args.test_dir, test_id, {
            "LABELS": "short",
            "WORKING_DIRECTORY": work_dir
        }, extra_args)
        test_id += 1

    # Process the list of tests returned by pytest
    for test in list_tests.getvalue().split("\n"):
        # Extract the file name from the test name
        # (lines without "::" are not test ids and are ignored).
        m = re.match("^(.*)::(.*)", test)
        if m:
            test_properties = {"WORKING_DIRECTORY": work_dir}
            # Mark tests as timed out 1 second after TEST_TIMEOUT appears in
            # their output (see tests/timeout_handler.py)
            test_properties["TIMEOUT_AFTER_MATCH"] = "\"1;TEST_TIMEOUT\""
            # Use os.path.basename() to ensure we only have
            # the filename
            test_file = os.path.basename(m.group(1))

            dir_path = args.test_dir

            # Files located in a sub-folder: extend dir_path with the
            # intermediate folders.
            if os.path.dirname(m.group(1)) != "tests":
                # Convert to a proper path.
                path = os.path.normpath(m.group(1))

                # Separate out the dirs and remove the "tests" from the start
                # and the test name from the end.
                separate_dirs = path.split(os.sep)[1:-1]

                # Append the dirs to the start of the root dir one.
                dir_path = os.path.join(dir_path, *separate_dirs)

            # Files already registered as a whole in the first loop.
            # NOTE(review): test_file is a bare file name, so the "gnn/..."
            # entries of short_tests do not seem to match here - confirm.
            if test_file in short_tests:
                continue
            test_name = f"{test_file}::{m.group(2)}"
            labels = []
            if test_name in long_tests:
                labels.append("long")
            if test_name in external_data_tests:
                labels.append("external_data")
            if test_file in docs_only_test_files:
                labels.append("docs_only")

            if test_file in serial_test_files or test_name in serial_tests:
                test_properties['RUN_SERIAL'] = 'TRUE'

            # Several labels are joined with ";".
            if labels:
                test_properties['LABELS'] = ";".join(labels)

            add_pytest(output, test_name, args.test_dir, dir_path, test_id,
                       test_properties, extra_args)
            test_id += 1

    # Process the list of cpp tests
    for test in cpp_tests:
        test_properties = {"WORKING_DIRECTORY": work_dir}
        # Mark tests as timed out 1 second after TEST_TIMEOUT appears in
        # their output (see tests/timeout_handler.py)
        test_properties["TIMEOUT_AFTER_MATCH"] = "\"1;TEST_TIMEOUT\""
        # Use os.path.basename() to ensure we only have
        # the filename
        test_file = os.path.basename(test)

        # The executable is looked up under "<work_dir>/tests/<sub-folders>".
        dir_path = os.path.join(work_dir, "tests", test)
        dir_path = os.path.dirname(dir_path)

        add_cpp_test(output, test_file, args.test_dir, dir_path,
                     test_properties)


================================================
FILE: tests/gnn/.gitignore
================================================
.datasets


================================================
FILE: tests/gnn/benchgnn/README.md
================================================
# benchgnn

Benchmark tool for testing performance of GNN models

## Usage example

``benchgnn --dataset FakeDataset --model GAT --bs 1 100 --cpu --output outfile``

Type ``benchgnn --help`` to print detailed information about supported options.


================================================
FILE: tests/gnn/benchgnn/benchgnn.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import argparse
import json
import os.path as osp
import sys
from collections import namedtuple
from itertools import product, starmap
from warnings import warn

import torch
from datasets import DataSets
from models import GAT, GCN, GIN, PNA, RGCN, SAGE
from torch_geometric.loader import DataLoader
from torch_geometric.nn import PNAConv
from torch_geometric.transforms import Pad
from utils import all_formats, merge_results, print_results

import poptorch
from poptorch_geometric import TrainingStepper, set_aggregation_dim_size
from poptorch_geometric.dataloader import DataLoader as IPUDataLoader, FixedSizeDataLoader

# Model classes that can be benchmarked on each dataset, keyed by dataset
# name.
supported_sets = {
    'Cora': [GCN, GAT, GIN, PNA, SAGE],
    'CiteSeer': [GCN, GAT, GIN, PNA, SAGE],
    'PubMed': [GCN, GAT, GIN, PNA, SAGE],
    'mutag': [RGCN],
    'FakeDataset': [GCN, GAT, GIN, PNA, SAGE],
}

# Names of all the model classes listed above. The list is built from a set,
# so its order is not fixed.
all_models = list(set(m.__name__ for v in supported_sets.values() for m in v))
all_datasets = list(supported_sets.keys())
# Accepted values for the --loader and --transform options.
all_loaders = ['torch', 'poptorch', 'poptorch_fixed_size']
all_transforms = [None, 'Pad']

# One benchmark configuration: model class, dataset, batch size, data loader
# name and transform name.
Config = namedtuple('Config', ['Model', 'ds', 'bs', 'loader', 'transform'])


def run_benchmark(args, configs):
    """Benchmark every configuration and collect one result row for each.

    Args:
        args: Dictionary of options. The keys read here are
            ``synthetic_data``, ``max_num_nodes``, ``max_num_edges``,
            ``check_values``, ``iters`` and the device flags ``cpu``,
            ``gpu`` and ``ipu``.
        configs: Iterable of ``Config`` tuples to benchmark.

    Returns:
        A list with one dictionary per configuration, holding a description
        of the run (model, dataset, sizes, batch size, data loader) updated
        with the timings returned by ``TrainingStepper.benchmark``.
    """
    ipu_opts = poptorch.Options()
    if args['synthetic_data']:
        ipu_opts.enableSyntheticData(True)

    results = []
    for cfg in configs:
        if cfg.transform == 'Pad':
            max_num_nodes = args['max_num_nodes']
            max_num_edges = args['max_num_edges']
            assert max_num_nodes is not None and max_num_edges is not None

            # NOTE(review): this replaces the transform of the dataset
            # object itself; a dataset shared by several configurations
            # keeps the Pad transform afterwards - confirm this is intended.
            cfg.ds.transform = Pad(max_num_nodes=max_num_nodes,
                                   max_num_edges=max_num_edges)
        if cfg.loader == 'torch':
            loader = DataLoader(cfg.ds, batch_size=cfg.bs, shuffle=False)
        elif cfg.loader == 'poptorch':
            loader = IPUDataLoader(cfg.ds, batch_size=cfg.bs)
        else:
            loader = FixedSizeDataLoader(dataset=cfg.ds,
                                         num_nodes=cfg.ds[0].num_nodes,
                                         batch_size=cfg.bs)

        # Only the first batch of the loader is used for the benchmark.
        d = next(iter(loader))

        params = {'loss_fn': torch.nn.MSELoss()}

        if cfg.Model.__name__ != 'GIN':
            params['out_channels'] = cfg.ds.num_classes

        if cfg.Model.__name__ == 'PNA':
            params['degree'] = PNAConv.get_degree_histogram(loader)

        if cfg.Model.__name__ == 'RGCN':
            # RGCN is fed the edge types instead of node features.
            batch = (d.edge_index, d.edge_type)
            params['in_channels'] = d.num_nodes
            params['num_relations'] = cfg.ds.num_relations
        else:
            batch = (d.x, d.edge_index)
            params['disable_dropout'] = args['check_values']
            params['in_channels'] = cfg.ds.num_features

        model = cfg.Model(**params)

        set_aggregation_dim_size(model, int(d.edge_index.max()) + 1)

        stepper = TrainingStepper(model,
                                  options=ipu_opts,
                                  enable_fp_exception=False)

        if args['check_values']:
            warn(
                'Models run without dropout layers. Turn off '
                'check-values to run the full model.', UserWarning)
            stepper.run(4, batch)

        devices = [dev for dev in ('cpu', 'gpu', 'ipu') if args[dev] is True]

        times = stepper.benchmark(args['iters'], batch, devices=devices)

        # The fallbacks are only evaluated when the attribute is missing:
        # passing them as the default of getattr() would evaluate them every
        # time, which fails for batches without node features (``d.x``) even
        # though ``num_nodes`` is available.
        if hasattr(d, 'num_nodes'):
            num_nodes = d.num_nodes
        else:
            num_nodes = d.x.size(0)
        if hasattr(d, 'num_edges'):
            num_edges = d.num_edges
        else:
            num_edges = d.edge_index.size(1)

        result = {
            'model': cfg.Model.__name__,
            'dataset': cfg.ds.name,
            '#features': cfg.ds.num_features,
            '#classes': cfg.ds.num_classes,
            '#nodes': num_nodes,
            '#edges': num_edges,
            '#iters': args['iters'],
            'bs': cfg.bs,
            'dataloader': cfg.loader,
        }

        result.update(times)
        results.append(result)
    return results


def add_main_arguments(parser):
    """Register all the benchmark options on ``parser``.

    The options are split in two argument groups: ``Main`` and the arguments
    of the ``Pad`` transform.

    Args:
        parser: The ``argparse.ArgumentParser`` to extend.

    Returns:
        The same ``parser`` object.
    """
    main_group = parser.add_argument_group('Main')

    main_group.add_argument('--cfg',
                            type=str,
                            default=None,
                            metavar='file',
                            help="Configuration file")

    main_group.add_argument('--print-cfg',
                            type=str,
                            default=None,
                            metavar='file',
                            help="Show configuration file content")

    main_group.add_argument('--model',
                            nargs='+',
                            default=all_models,
                            help='Models to test')

    main_group.add_argument('--dataset',
                            nargs='+',
                            default=all_datasets,
                            help='Datasets to use for testing')

    # NOTE(review): with action='store_true' and default=True this flag is
    # always True; there is no way to turn it off from the command line.
    main_group.add_argument('--ipu',
                            action='store_true',
                            default=True,
                            help="Run on IPU")

    main_group.add_argument('--cpu',
                            action='store_true',
                            default=False,
                            help="Run on CPU")

    main_group.add_argument('--gpu',
                            action='store_true',
                            default=False,
                            help="Run on GPU")

    main_group.add_argument('--iters',
                            type=int,
                            default=200,
                            help="Number of iterations")

    main_group.add_argument('--bs',
                            nargs='+',
                            default=[1],
                            type=int,
                            help="Number of graphs in batch.")

    main_group.add_argument('--check-values',
                            action='store_true',
                            default=False,
                            help='Run checks to make sure the results are '
                            'correct. Models run without dropout layers.')

    main_group.add_argument(
        '--synthetic-data',
        action='store_true',
        default=False,
        help='Use synthetic data on IPU (no data transfers to '
        'device)')

    main_group.add_argument(
        '--loader',
        nargs='+',
        default=['torch'],
        help=
        'Dataloader, possible values: [torch, poptorch, poptorch_fixed_size]')

    main_group.add_argument(
        '--transform',
        nargs='+',
        default=[None],
        help='Transform, possible values: [None, Pad]. Pass the required '
        'transformation parameters, for example: --max-num-nodes=30')

    main_group.add_argument('--fmt',
                            type=str,
                            default='rounded_outline',
                            help=f'Output format, one of: {all_formats}')

    main_group.add_argument(
        '--output',
        type=str,
        default=None,
        help='Store JSON output file with configuration and '
        'results. You can load such file later using '
        '--cfg option.')

    transform_group = parser.add_argument_group('Arguments for Pad transform')
    transform_group.add_argument(
        '--max-num-nodes',
        type=int,
        default=None,
        help='Pad transform argument. The number of nodes after padding')
    transform_group.add_argument(
        '--max-num-edges',
        type=int,
        default=None,
        help='Pad transform argument. The number of edges after padding')
    return parser


def get_args():
    """Parse the command line, optionally merged with a configuration file.

    When ``--print-cfg`` or ``--cfg`` names a JSON file, the options stored
    in that file take precedence over the command line, except for the
    options listed in ``overwrite_args`` which always come from the command
    line. Options missing from the file fall back to their command line
    value.

    Returns:
        A tuple ``(args, loaded_args)``: the dictionary of effective options
        and the dictionary of options exactly as loaded from the file (empty
        when no file was given).

    Raises:
        AssertionError: On unknown command line options, or on an unknown
            dataset, model, data loader or transform name.
    """
    help_formatter = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(
        description="Benchmark tool for testing performance of GNN models",
        add_help=True,
        formatter_class=help_formatter)
    parser = add_main_arguments(parser)
    args, unknown = parser.parse_known_args()
    assert len(unknown) == 0, f'Unknown options {unknown}'

    args = vars(args)
    loaded_args = {}

    cfg_file = args['print_cfg'] or args['cfg']
    if cfg_file is not None:
        with open(cfg_file, "r") as infile:
            file_args = json.load(infile)
        # Keep an untouched copy of what the file contained.
        loaded_args.update(file_args)

        # Options the file does not define keep their command line value, so
        # that a partial configuration file does not cause a KeyError below.
        for name, value in args.items():
            file_args.setdefault(name, value)

        # Override some of the loaded args with cmd-line args
        # Can't override those args that define a test set
        overwrite_args = [
            'synthetic_data', 'check_values', 'output', 'cfg', 'print_cfg',
            'ipu', 'cpu', 'gpu'
        ]
        for arg in overwrite_args:
            file_args[arg] = args[arg]
        args = file_args

    assert all(d in all_datasets for d in args['dataset']), 'Unknown dataset'
    assert all(m in all_models for m in args['model']), 'Unknown model'
    assert all(ld in all_loaders
               for ld in args['loader']), 'Unknown dataloader'
    assert all(t in all_transforms
               for t in args['transform']), 'Unknown transform'

    return args, loaded_args


def print_cfg_and_results(args, loaded_args, loaded_results):
    """Print the options and the results loaded from a configuration file.

    Args:
        args: Effective options; ``print_cfg`` (the file name) and ``fmt``
            (the table format) are read.
        loaded_args: Options loaded from the file, printed as is.
        loaded_results: Result rows loaded from the file.
    """
    source = args["print_cfg"]
    print(f'\nArgs loaded from {source}:')
    print(loaded_args)
    print(f'\nResults loaded from {source}:')
    print_results(loaded_results, args['fmt'])


def get_tst_configs(args):
    """Build the benchmark configurations selected by ``args``.

    Every combination of model, dataset, batch size, data loader and
    transform is considered; combinations whose model is not listed for the
    dataset in ``supported_sets`` are dropped.

    Args:
        args: Options dictionary; the ``dataset``, ``model``, ``bs``,
            ``loader`` and ``transform`` entries are read.

    Returns:
        A lazy iterator of ``Config`` tuples.
    """
    data_root = osp.join(osp.dirname(osp.realpath(__file__)), 'test_data')
    factory = DataSets(data_root)
    # Each dataset is created once, by calling the factory method bearing
    # its name.
    datasets = [getattr(factory, name)() for name in args['dataset']]
    # Model classes are looked up by name among this module's globals.
    models = [globals()[name] for name in args['model']]

    combinations = product(models, datasets, args['bs'], args['loader'],
                           args['transform'])
    every_config = starmap(Config, combinations)

    return filter(lambda cfg: cfg.Model in supported_sets[cfg.ds.name],
                  every_config)


def save_cfg_and_results(args, results):
    """Store the options together with the results in a JSON file.

    ``results`` is added to ``args`` under the ``results`` key (``args`` is
    modified in place) and the whole dictionary is written, indented by four
    spaces, to the file named by ``args['output']``.
    """
    destination = args['output']
    with open(destination, "w") as outfile:
        args.update(results=results)
        json.dump(args, outfile, indent=4)


# Script entry point: either print a stored configuration and its results,
# or run the benchmark and print (and optionally store) the new results.
if __name__ == '__main__':
    args, loaded_args = get_args()
    # Results stored in the configuration file (if any) are kept aside and
    # blanked in loaded_args.
    loaded_results = loaded_args.get('results', None)
    loaded_args['results'] = None
    if args['print_cfg']:
        print_cfg_and_results(args, loaded_args, loaded_results)
        sys.exit()

    configs = get_tst_configs(args)

    results = run_benchmark(args, configs)

    # The results are saved before the previously loaded timings are merged
    # in, so the output file only contains the timings of this run.
    if args['output'] is not None:
        save_cfg_and_results(args, results)

    if loaded_results:
        results = merge_results(results, loaded_results)

    print_results(results, args['fmt'])


================================================
FILE: tests/gnn/benchgnn/datasets.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.

import os.path as osp

from torch_geometric import seed_everything
from torch_geometric.datasets import Entities
from torch_geometric.datasets import FakeDataset as FDS
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import Compose, GCNNorm, NormalizeFeatures


class DataSets:
    """Factory for the datasets used by the benchmark.

    Each public method creates the dataset bearing its name. Datasets taking
    a root folder use a dedicated sub-folder of ``root``.
    """

    def __init__(self, root):
        self.root = root

    def _planetoid(self, name):
        # The sub-folder is named after the Planetoid dataset.
        return Planetoid(osp.join(self.root, name), name)

    def Cora(self):
        return self._planetoid('Cora')

    def CiteSeer(self):
        return self._planetoid('CiteSeer')

    def PubMed(self):
        return self._planetoid('PubMed')

    def mutag(self):
        folder = osp.join(self.root, 'EntitiesMUTAG')
        return Entities(folder, 'mutag')

    def FakeDataset(self):
        # Seed first, so the dataset is created from a fixed seed.
        seed_everything(0)

        dataset = FDS(
            num_graphs=1000,
            avg_num_nodes=16,
            avg_degree=5,
            transform=Compose([GCNNorm(), NormalizeFeatures()]),
            num_channels=64,
        )
        # Give the dataset a ``name`` attribute, set to 'FakeDataset'.
        dataset.name = 'FakeDataset'
        return dataset


================================================
FILE: tests/gnn/benchgnn/models.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import torch
import torch.nn.functional as F
from torch.nn import Linear as Lin
from torch.nn import ReLU
from torch.nn import Sequential as Seq
from torch_geometric.nn import (FastRGCNConv, GATConv, GCNConv, GINConv,
                                PNAConv, SAGEConv)


class GCN(torch.nn.Module):
    """Two ``GCNConv`` layers (32 hidden channels) followed by log-softmax.

    ReLU and, unless ``disable_dropout`` is set, dropout are applied between
    the two layers. In training mode ``forward`` also returns the loss
    computed by ``loss_fn`` against an all-ones target.
    """

    def __init__(self,
                 in_channels=0,
                 out_channels=0,
                 loss_fn=None,
                 disable_dropout=False):
        super().__init__()
        hidden_channels = 32
        self.conv1 = GCNConv(in_channels,
                             hidden_channels,
                             add_self_loops=False)
        self.conv2 = GCNConv(hidden_channels,
                             out_channels,
                             add_self_loops=False)
        self.loss_fn = loss_fn
        self.disable_dropout = disable_dropout

    def forward(self, *args):
        features, edge_index = args

        hidden = F.relu(self.conv1(features, edge_index))
        if not self.disable_dropout:
            hidden = F.dropout(hidden, training=self.training)
        log_probs = F.log_softmax(self.conv2(hidden, edge_index), dim=1)

        if not self.training:
            return log_probs

        # Training: the loss is computed against a target made of ones.
        loss = self.loss_fn(log_probs, torch.ones_like(log_probs))
        return log_probs, loss


class GIN(torch.nn.Module):
    """Two ``GINConv`` layers, each wrapping a small MLP, then log-softmax.

    ReLU and, unless ``disable_dropout`` is set, dropout are applied between
    the two layers. In training mode ``forward`` also returns the loss
    computed by ``loss_fn`` against an all-ones target.
    """

    def __init__(self, in_channels=0, loss_fn=None, disable_dropout=False):
        super().__init__()
        hidden_channels = 32
        first_mlp = Seq(Lin(in_channels, hidden_channels), ReLU(),
                        Lin(hidden_channels, hidden_channels))
        self.conv1 = GINConv(first_mlp, train_eps=True)
        second_mlp = Seq(Lin(hidden_channels, hidden_channels), ReLU(),
                         Lin(hidden_channels, hidden_channels))
        self.conv2 = GINConv(second_mlp, train_eps=True)
        self.loss_fn = loss_fn
        self.disable_dropout = disable_dropout

    def forward(self, *args):
        features, edge_index = args

        hidden = F.relu(self.conv1(features, edge_index))
        if not self.disable_dropout:
            hidden = F.dropout(hidden, training=self.training)
        log_probs = F.log_softmax(self.conv2(hidden, edge_index), dim=1)

        if not self.training:
            return log_probs

        # Training: the loss is computed against a target made of ones.
        loss = self.loss_fn(log_probs, torch.ones_like(log_probs))
        return log_probs, loss


class GAT(torch.nn.Module):
    """Two ``GATConv`` layers (8 heads of 8 channels first) and log-softmax.

    Unless ``disable_dropout`` is set, dropout (p=0.6) is applied to the
    input and to the output of the first layer, and the layers themselves
    are created with a dropout of 0.6. In training mode ``forward`` also
    returns the loss computed by ``loss_fn`` against an all-ones target.
    """

    def __init__(self,
                 in_channels=0,
                 out_channels=0,
                 loss_fn=None,
                 disable_dropout=False):
        super().__init__()
        if disable_dropout:
            conv_dropout = 0
        else:
            conv_dropout = 0.6
        heads = 8
        channels_per_head = 8
        self.conv1 = GATConv(in_channels,
                             channels_per_head,
                             heads=heads,
                             dropout=conv_dropout,
                             add_self_loops=False)
        self.conv2 = GATConv(channels_per_head * heads,
                             out_channels,
                             dropout=conv_dropout,
                             add_self_loops=False)
        self.loss_fn = loss_fn
        self.disable_dropout = disable_dropout

    def forward(self, *args):
        features, edge_index = args
        use_dropout = not self.disable_dropout

        if use_dropout:
            features = F.dropout(features, p=0.6, training=self.training)
        hidden = F.elu(self.conv1(features, edge_index))
        if use_dropout:
            hidden = F.dropout(hidden, p=0.6, training=self.training)
        log_probs = F.log_softmax(self.conv2(hidden, edge_index), dim=1)

        if not self.training:
            return log_probs

        # Training: the loss is computed against a target made of ones.
        loss = self.loss_fn(log_probs, torch.ones_like(log_probs))
        return log_probs, loss


class RGCN(torch.nn.Module):
    """Two ``FastRGCNConv`` layers (8 hidden channels) and log-softmax.

    ``forward`` takes the edge index and the edge types; no node features
    are given to the first layer (``None`` is passed instead). In training
    mode ``forward`` also returns the loss computed by ``loss_fn`` against
    an all-ones target.
    """

    def __init__(self,
                 in_channels=0,
                 out_channels=0,
                 num_relations=0,
                 loss_fn=None):

        super().__init__()
        hidden_channels = 8
        self.conv1 = FastRGCNConv(in_channels,
                                  hidden_channels,
                                  num_relations,
                                  num_bases=15,
                                  add_self_loops=False)
        self.conv2 = FastRGCNConv(hidden_channels,
                                  out_channels,
                                  num_relations,
                                  num_bases=15,
                                  add_self_loops=False)
        self.loss_fn = loss_fn

    def forward(self, *args):
        edge_index, edge_type = args

        hidden = F.relu(self.conv1(None, edge_index, edge_type))
        log_probs = F.log_softmax(self.conv2(hidden, edge_index, edge_type),
                                  dim=1)

        if not self.training:
            return log_probs

        # Training: the loss is computed against a target made of ones.
        loss = self.loss_fn(log_probs, torch.ones_like(log_probs))
        return log_probs, loss


class PNA(torch.nn.Module):
    """A single ``PNAConv`` layer optionally followed by dropout.

    The layer uses the mean/min/max/std aggregators and the
    identity/amplification/attenuation scalers. In training mode ``forward``
    also returns the loss computed by ``loss_fn`` against an all-ones
    target.
    """

    def __init__(self,
                 in_channels=0,
                 out_channels=0,
                 loss_fn=None,
                 disable_dropout=False,
                 degree=None):

        super().__init__()
        self.conv = PNAConv(in_channels,
                            out_channels,
                            ['mean', 'min', 'max', 'std'],
                            ['identity', 'amplification', 'attenuation'],
                            deg=degree,
                            add_self_loops=False)
        self.loss_fn = loss_fn
        self.disable_dropout = disable_dropout

    def forward(self, *args):
        features, edge_index = args

        out = self.conv(features, edge_index)
        if not self.disable_dropout:
            out = F.dropout(out, training=self.training)

        if not self.training:
            return out

        # Training: the loss is computed against a target made of ones.
        loss = self.loss_fn(out, torch.ones_like(out))
        return out, loss


class SAGE(torch.nn.Module):
    """A single ``SAGEConv`` layer optionally followed by dropout.

    In training mode ``forward`` also returns the loss computed by
    ``loss_fn`` against an all-ones target.
    """

    def __init__(self,
                 in_channels=0,
                 out_channels=0,
                 loss_fn=None,
                 disable_dropout=False):
        super().__init__()
        self.conv = SAGEConv(in_channels, out_channels, add_self_loops=False)
        self.loss_fn = loss_fn
        self.disable_dropout = disable_dropout

    def forward(self, *args):
        features, edge_index = args

        out = self.conv(features, edge_index)
        if not self.disable_dropout:
            out = F.dropout(out, training=self.training)

        if not self.training:
            return out

        # Training: the loss is computed against a target made of ones.
        loss = self.loss_fn(out, torch.ones_like(out))
        return out, loss


================================================
FILE: tests/gnn/benchgnn/requirements.txt
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
pytest-benchmark
pytest-cov
nbformat
nbconvert
pandas
rdflib
tabulate


================================================
FILE: tests/gnn/benchgnn/utils.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.

from tabulate import _table_formats, tabulate

# Names of the table formats known to tabulate, in alphabetical order.
all_formats = sorted(_table_formats.keys())


def merge_results(results, prev_results):
    """Add the timings of a previous run to the rows of the current run.

    For each of ``ipu_time``, ``gpu_time`` and ``cpu_time`` found in the
    first previous row, the value of the previous row is copied to the
    current row under ``prev_ipu``, ``prev_gpu`` or ``prev_cpu``. Rows are
    paired by position; extra rows on either side are ignored.

    The keys are always added in the order ipu, gpu, cpu, so the column
    layout does not depend on set iteration order and is the same from one
    run to the next.

    Args:
        results: Rows (dictionaries) of the current run, updated in place.
        prev_results: Rows of the previous run; may be empty or ``None``.

    Returns:
        ``results``.
    """
    if not prev_results:
        return results

    available = prev_results[0].keys()
    time_keys = [
        key for key in ('ipu_time', 'gpu_time', 'cpu_time')
        if key in available
    ]
    for curr, prev in zip(results, prev_results):
        for key in time_keys:
            # 'ipu_time' -> 'prev_ipu', and so on.
            curr['prev_' + key.split('_')[0]] = prev[key]
    return results


def include_speedups_ratio(results):
    """Add the ratio of the other timings to the IPU time to every row.

    When the first row has an ``ipu_time``, each of ``cpu_time``,
    ``prev_cpu``, ``prev_gpu`` and ``prev_ipu`` present in the first row is
    divided by ``ipu_time`` and stored under ``'ipu/<name>'``.

    Args:
        results: Rows (dictionaries), updated in place. May be empty, in
            which case nothing is done.

    Returns:
        ``results``.
    """
    # Nothing to do (and no first row to inspect) without any result.
    if not results:
        return results

    # Snapshot of the columns of the first row, taken before any ratio
    # column is added.
    keys = list(results[0].keys())

    # Calculate speedup over other times
    if 'ipu_time' in keys:
        candidates = ('cpu_time', 'prev_cpu', 'prev_gpu', 'prev_ipu')
        for name in [c for c in candidates if c in keys]:
            for res in results:
                res['ipu/' + name] = res[name] / res["ipu_time"]

    return results


def print_results(results, format):
    """Print the result rows as a table.

    The speedup ratios are added to the rows first (see
    ``include_speedups_ratio``). The model name is only shown on the first
    row of each run of consecutive rows sharing the same model, and an empty
    row separates two such runs. Floats are shown with two decimals.

    Args:
        results: Rows (dictionaries) to print. The ratio columns are added
            to them in place; the rows are otherwise left untouched. May be
            empty, in which case only a short message is printed.
        format: Name of the tabulate table format to use.
    """
    # Without any row there is no header to build the table from.
    if not results:
        print('\nNo results to display.')
        return

    results = include_speedups_ratio(results)

    content = [list(results[0].keys())]
    prev_model = None
    for res in results:
        # Work on a copy so that blanking the model name below does not
        # alter the caller's data.
        shown = dict(res)
        curr_model = shown['model']
        if prev_model != curr_model:
            if prev_model is not None:
                content.append([])
            prev_model = curr_model
        else:
            shown['model'] = ''

        row = [
            f'{x:.2f}' if isinstance(x, float) else x for x in shown.values()
        ]

        content.append(row)

    body = tabulate(content, headers='firstrow', tablefmt=format)
    print('\n', body, sep='')


================================================
FILE: tests/gnn/benchgnn_ops/README.md
================================================
# benchgnn_ops

Benchmark tool for testing performance of GNN operators

## Usage example

Running single benchmark test case scenario from command line:
``python3 benchgnn_ops.py --num_sample_rounds 10 scatter --src_shape [1,12] --input_shape [1,12] --index_shape [1,12] --dim 0``

Running multiple benchmark test case scenarios from yaml configuration files from given directory:
``python3 benchgnn_ops.py --common_config=example_configs/common.yaml --config_dir=example_configs``

Running multiple benchmark test case scenarios from given yaml configuration files:
``python3 benchgnn_ops.py --common_config=example_configs/common.yaml --config_files=[example_configs/scatter_testcase1.yaml,example_configs/scatter_testcase2.yaml]``

Running multiple benchmark test case scenarios - combining all available options:
``python3 benchgnn_ops.py --common_config=example_configs/common.yaml --config_dir=example_configs --config_files=[example_configs/scatter_testcase1.yaml,example_configs/scatter_testcase2.yaml] scatter --src_shape [1,12] --input_shape [1,12] --index_shape [1,12] --dim 0``

Type ``python3 benchgnn_ops.py --help`` to print detailed information about supported options.


================================================
FILE: tests/gnn/benchgnn_ops/benchgnn_ops.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import os
from typing import List, Tuple

import jsonargparse
import pandas as pd
import torch
from builder import BenchModelBuilder
from metrics import PerfMetrics, to_data_frame
from ops import bench_ops
from tqdm import tqdm


def prepare_parser() -> jsonargparse.ArgumentParser:
    """Build the command line / yaml parser of the benchmark tool.

    The parser exposes:
      * a ``compile_options`` group generated from the ``BenchModelBuilder``
        constructor,
      * the global benchmark options (seed, warm-up / sample rounds, clock),
      * the options pointing at yaml configuration files,
      * one optional sub-command per entry of ``bench_ops``, whose arguments
        are generated from the constructor of the operator class.

    Returns:
        The configured parser.
    """
    jsonargparse.set_docstring_parse_options(attribute_docstrings=True)
    # ``torch.Size`` is registered with itself as the two conversion
    # callables, so that ``torch.Size`` annotated arguments are accepted.
    jsonargparse.typing.register_type(torch.Size, torch.Size, torch.Size)

    parser = jsonargparse.ArgumentParser(prog='GNN Ops Benchmark')
    parser.add_class_arguments(BenchModelBuilder, 'compile_options')

    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='the random seed to use.')
    parser.add_argument(
        '--num_warmup_rounds',
        type=int,
        default=4,
        help='num_warmup_rounds: initial set of runs to discard.')
    parser.add_argument(
        '--num_sample_rounds',
        type=int,
        default=1,
        help='num_sample_rounds: the number of runs used to average the '
        'runtime.')
    parser.add_argument(
        '--calc_samples_mean',
        type=bool,
        default=True,
        help='calculate mean over collected `num_sample_rounds` measurements.')
    parser.add_argument(
        '--clock',
        type=int,
        default=None,
        help='manually override clock value (MHz) read by gcipuinfo.')
    parser.add_argument(
        '--common_config',
        type=jsonargparse.typing.Path_fr,
        default=None,
        help='yaml file containing configuration options shared between all '
        'benchmark testcases.')
    parser.add_argument(
        '--config_dir',
        type=jsonargparse.typing.Path_drw,
        default=None,
        help='directory with a set of yaml benchmark test case scenario '
        'files.')
    parser.add_argument(
        '--config_files',
        type=List[jsonargparse.typing.Path_fr],
        default=None,
        help='set of yaml file paths containing benchmark test case '
        'scenarios.')

    # The operation is optional: test cases may also come from yaml files.
    subcommands = parser.add_subcommands(required=False, dest='operation')

    for command, op_type in bench_ops.items():
        subparser = jsonargparse.ArgumentParser()
        subparser.add_class_arguments(op_type)
        subcommands.add_subcommand(command, subparser)

    return parser


def collect_measurements(config_src: str,
                         testcase_config: jsonargparse.namespace.Namespace
                         ) -> pd.DataFrame:
    """Compile and run a single benchmark test case.

    Args:
        config_src: label of the configuration origin, forwarded to
            ``PerfMetrics``.
        testcase_config: parsed configuration of the test case.

    Returns:
        The measurements converted by ``to_data_frame``.
    """
    torch.manual_seed(testcase_config.seed)

    op_name = testcase_config.operation
    op_params = getattr(testcase_config, op_name)
    compile_options = testcase_config.compile_options

    operator = bench_ops[op_name](**op_params.as_dict())
    model_builder = BenchModelBuilder(**compile_options.as_dict())
    compiled_model = model_builder.create_model(operator)
    metrics = PerfMetrics(config_src, operator, compile_options.num_repeats,
                          op_name, str(op_params), testcase_config.clock)

    # Warm-up runs: their cycle counts are not recorded.
    for _ in range(testcase_config.num_warmup_rounds):
        compiled_model()

    measurements = []
    for _ in range(testcase_config.num_sample_rounds):
        compiled_model()
        cycles = compiled_model.cycleCount()
        measurements.append(metrics.get_measurement(cycles))

    return to_data_frame(measurements, testcase_config.calc_samples_mean)


def run_benchmark(testcases: List[Tuple[str, jsonargparse.namespace.Namespace]]
                  ) -> pd.DataFrame:
    """Run every test case and gather the results in one data frame.

    Args:
        testcases: ``(config source, parsed configuration)`` pairs.

    Returns:
        The concatenation of the per test case results, re-indexed.
    """
    progress = tqdm(range(len(testcases)),
                    desc="Benchmarking progress",
                    unit="testcase",
                    position=3)

    frames = []
    for config_src, config in testcases:
        frames.append(collect_measurements(config_src, config))
        progress.update()
        progress.refresh()

    progress.clear()
    progress.close()

    return pd.concat(frames, ignore_index=True)


def set_defaults_from_yaml_config(
        parser: jsonargparse.ArgumentParser,
        common_config_path: jsonargparse.typing.Path_fr) -> None:
    """Install the values of the common yaml file as parser defaults.

    Args:
        parser: the parser whose defaults are updated.
        common_config_path: path of the shared yaml configuration file.
    """
    # Parsed without defaults so that only the values given in the file
    # end up in the flattened namespace.
    raw_config = parser.parse_path(common_config_path, defaults=False)
    flat_items = raw_config.as_flat()._get_kwargs()  # pylint: disable=protected-access
    parser.set_defaults(**dict(flat_items))


def set_defaults_from_user_params(parser: jsonargparse.ArgumentParser,
                                  user_params: jsonargparse.namespace.Namespace
                                  ) -> None:
    """Install the user supplied global options as parser defaults.

    The selected operation and its parameters are left out: only the
    remaining entries of ``user_params`` become defaults.

    Args:
        parser: the parser whose defaults are updated.
        user_params: the options given by the user.
    """
    overrides = user_params.clone()
    if 'operation' in overrides:
        # Remove the namespace stored under the operation name first, then
        # the 'operation' entry itself.
        del overrides[overrides['operation']]
        del overrides['operation']

    flat_items = overrides.as_flat()._get_kwargs()  # pylint: disable=protected-access
    parser.set_defaults(**dict(flat_items))


def set_defaults(parser: jsonargparse.ArgumentParser,
                 user_params: jsonargparse.namespace.Namespace) -> None:
    """Update the parser defaults from the common yaml file and user options.

    The common configuration file (when given) is applied first, the user
    options second, so the latter are the ones applied last.

    Args:
        parser: the parser whose defaults are updated.
        user_params: the options given by the user.
    """
    if 'common_config' in user_params:
        set_defaults_from_yaml_config(parser,
                                      user_params.common_config.abs_path)

    set_defaults_from_user_params(parser, user_params)


def get_test_case_config_paths(user_params: jsonargparse.namespace.Namespace
                               ) -> List[str]:
    """Collect the paths of the yaml test case files requested by the user.

    Files found in ``config_dir`` come first (in sorted file name order),
    followed by the entries of ``config_files`` in the order they were given.
    The common configuration file and anything that is not a regular file
    are skipped.

    Args:
        user_params: the options given by the user.

    Returns:
        The absolute paths of the test case configuration files.
    """
    test_case_config_paths = []

    common_config_path = None
    if 'common_config' in user_params:
        common_config_path = user_params.common_config.abs_path

    def is_valid_path(path: str) -> bool:
        return os.path.isfile(path) and path != common_config_path

    if 'config_dir' in user_params:
        base_dir = user_params.config_dir.abs_path

        # os.listdir returns entries in arbitrary order: sort them so that
        # the benchmark order (and the result table) is reproducible.
        for filename in sorted(os.listdir(base_dir)):
            file_path = os.path.join(base_dir, filename)
            if is_valid_path(file_path):
                test_case_config_paths.append(file_path)

    if 'config_files' in user_params:
        for file_path in user_params.config_files:
            file_abs_path = file_path.abs_path
            if is_valid_path(file_abs_path):
                test_case_config_paths.append(file_abs_path)

    return test_case_config_paths


def parse_test_case_config_files(
        test_case_config_paths: List[str],
        arg_parser: jsonargparse.ArgumentParser = None
) -> List[Tuple[str, jsonargparse.namespace.Namespace]]:
    """Parse the given yaml test case files.

    Args:
        test_case_config_paths: paths of the yaml files to parse.
        arg_parser: the parser used to read the files. When omitted, the
            module level ``parser`` created by the script entry point is
            used, which keeps existing single-argument calls working.

    Returns:
        ``(file base name, parsed configuration)`` pairs, one per file.

    Raises:
        Exception: whatever the parser raised for the first file that
            failed, after the failing path has been printed.
    """
    if arg_parser is None:
        # Backward compatible fallback: only defined when the module is run
        # as a script.
        arg_parser = parser

    test_case_configs = []
    for file_path in test_case_config_paths:
        try:
            config = arg_parser.parse_path(file_path)
        except Exception:
            print(f'Parsing {file_path} failed.')
            # Bare raise keeps the original traceback untouched.
            raise
        test_case_configs.append((os.path.basename(file_path), config))
    return test_case_configs


def get_test_case_configs(
        parser: jsonargparse.ArgumentParser,
        user_params: jsonargparse.namespace.Namespace
) -> List[Tuple[str, jsonargparse.namespace.Namespace]]:
    """Gather every test case requested on the command line or in yaml files.

    Args:
        parser: the parser used to produce the test case configurations.
        user_params: the options given by the user.

    Returns:
        ``(config source, parsed configuration)`` pairs. The command line
        test case (source ``'cmd'``), when an operation was given, comes
        first and is followed by the yaml file test cases.
    """
    test_case_configs = []

    if 'operation' in user_params:
        test_case_configs.append((
            'cmd',
            parser.parse_args(defaults=True),
        ))

    config_paths = get_test_case_config_paths(user_params)
    # Pass the parser explicitly instead of relying on a module global.
    test_case_configs.extend(
        parse_test_case_config_files(config_paths, parser))

    return test_case_configs


if __name__ == "__main__":
    # NOTE: `parser` is deliberately a module level name here; it is also
    # read as a global by parse_test_case_config_files.
    parser = prepare_parser()

    # First pass: parse without defaults, so that `user_params` only holds
    # what was given explicitly.
    user_params = parser.parse_args(defaults=False)
    # Turn the common yaml file and the explicit options into parser
    # defaults before the test case configurations are parsed.
    set_defaults(parser, user_params)
    test_case_configs = get_test_case_configs(parser, user_params)

    if test_case_configs:
        results = run_benchmark(test_case_configs)
        print(results.to_string())
    else:
        print('No test cases to benchmark. Please check `python3 '
              'benchgnn_ops.py --help`.')


================================================
FILE: tests/gnn/benchgnn_ops/builder.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
from typing import Optional

import torch

import poptorch


class BenchModel(torch.nn.Module):
    """Module invoking an operator ``num_repeats`` times in a poptorch loop."""

    def __init__(self, operator: torch.nn.Module, num_repeats: int) -> None:
        super().__init__()
        self.num_repeats = num_repeats
        self.operator = operator

    def forward(self) -> torch.Tensor:
        """Run the loop and return the last element of its outputs."""
        loop_outputs = poptorch.for_loop(self.num_repeats, self.operator,
                                         self.operator.loop_inputs())
        return loop_outputs[-1]


def _create_poptorch_options(
        synthetic_data: bool = False,
        available_memory_proportion: Optional[float] = None,
        profile_dir: Optional[str] = None,
        cache_dir: str = 'benchgnn_model_cache') -> poptorch.Options:
    """Create the poptorch options used to compile a benchmark model.

    Cycle count logging, executable caching and the on-demand connection
    type are always enabled.

    Args:
        synthetic_data: value forwarded to ``enableSyntheticData``.
        available_memory_proportion: when not ``None``, the available memory
            proportion set for ``"IPU0"``.
        profile_dir: when set (non-empty), profiling is enabled with this
            directory.
        cache_dir: directory passed to ``enableExecutableCaching``.

    Returns:
        The configured options object.
    """
    options = poptorch.Options()
    options.enableSyntheticData(synthetic_data)
    options.logCycleCount(True)
    options.enableExecutableCaching(cache_dir)
    options.connectionType(poptorch.ConnectionType.OnDemand)

    if available_memory_proportion is not None:
        amp_dict = {"IPU0": available_memory_proportion}
        options.setAvailableMemoryProportion(amp_dict)

    if profile_dir:
        options.enableProfiling(profile_dir)
    return options


# Holds the compile options and builds compiled benchmark models from
# operators. The constructor docstring below is kept as is on purpose.
class BenchModelBuilder:
    def __init__(self,
                 synthetic_data: bool = False,
                 available_memory_proportion: Optional[float] = None,
                 profile_dir: Optional[str] = None,
                 cache_dir: str = 'benchgnn_model_cache',
                 num_repeats: int = 128) -> None:
        """
        model compile options

        Args:
            synthetic_data (bool, optional): Use synthetic data on the device
                to disable I/O. (default: :obj:`False`)
            available_memory_proportion (float, optional): the AMP budget
                used for planning ops. (default: :obj:`None`)
            profile_dir (str, optional): saves the profiling report to the
                provided location. (default: :obj:`None`)
            cache_dir (str, optional): saves the executable cache to the
                provided location. (default: :obj:`benchgnn_model_cache`)
            num_repeats (int, optional): the number of times to invoke the
                operator on device. (default: :obj:`128`)
        """
        self.num_repeats = num_repeats
        self.options = _create_poptorch_options(
            synthetic_data=synthetic_data,
            available_memory_proportion=available_memory_proportion,
            profile_dir=profile_dir,
            cache_dir=cache_dir)

    def create_model(self, operator: torch.nn.Module):
        # Wrap the operator in the repeat loop, then compile it for inference.
        bench_model = BenchModel(operator=operator,
                                 num_repeats=self.num_repeats)
        compiled = poptorch.inferenceModel(bench_model, options=self.options)
        compiled.compile()
        return compiled


================================================
FILE: tests/gnn/benchgnn_ops/example_configs/common.yaml
================================================
num_sample_rounds: 25
compile_options:
  num_repeats: 100


================================================
FILE: tests/gnn/benchgnn_ops/example_configs/scatter_testcase1.yaml
================================================
scatter:
  src_shape: [1,120]
  input_shape: [1,120]
  index_shape: [1,120]
  dim: 0


================================================
FILE: tests/gnn/benchgnn_ops/example_configs/scatter_testcase2.yaml
================================================
scatter:
  src_shape: [1,12]
  input_shape: [1,12]
  index_shape: [1,12]
  dim: 0


================================================
FILE: tests/gnn/benchgnn_ops/metrics.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import warnings
from typing import Any, Dict, List, Optional

try:
    import gcipuinfo  # type: ignore
except ImportError:
    gcipuinfo = None
import pandas as pd
import torch


def _get_clock_value() -> int:
    """Return the clock frequency used to convert cycles into time.

    When ``gcipuinfo`` could not be imported a warning is emitted and 1850
    is returned. Otherwise the ``'clock'`` entry of the first device reported
    by ``gcipuinfo`` is used, with its last three characters stripped before
    the conversion to ``int``.

    Raises:
        RuntimeError: if querying or converting the clock value failed.
    """
    if gcipuinfo is not None:
        try:
            first_device = gcipuinfo.gcipuinfo().getDevices()[0]
            return int(first_device['clock'][:-3])
        except Exception as e:
            raise RuntimeError(
                'Getting clock frequency using gcipuinfo failed') from e

    default_clock_value = 1850
    warnings.warn('Unable to import gcipuinfo. Using default value '
                  f'{default_clock_value} MHz')
    return default_clock_value


def _mean(col: pd.core.series.Series) -> Any:
    """Aggregate one column of the measurements table.

    Numeric columns are averaged; the average of the ``'cycles'`` and
    ``'clock (MHz)'`` columns is converted to ``int64``. For any other column
    the unique values of the column are returned.
    """
    if not pd.api.types.is_numeric_dtype(col):
        return col.unique()

    average = col.mean()
    if col.name in ('cycles', 'clock (MHz)'):
        average = average.astype('int64')
    return average


def to_data_frame(measurements: List[Dict[str, Any]],
                  calc_mean=False) -> pd.DataFrame:
    """Convert a list of measurement dictionaries into a pandas object.

    Args:
        measurements: one dictionary per collected sample.
        calc_mean: when true, the table is aggregated column by column with
            ``_mean`` instead of being returned as is.
    """
    frame = pd.DataFrame(measurements)
    return frame.agg(_mean) if calc_mean else frame


class PerfMetrics:
    r"""Track performance metrics from:
        * recorded number of cycles
        * sizes of input / output
    Defines an effective bandwidth from the size of the output result.
    """

    def __init__(self,
                 config_src: str,
                 operator: torch.nn.Module,
                 num_repeats: int,
                 op_name: str,
                 op_params: str,
                 clock: Optional[int] = None) -> None:
        """Capture everything needed to turn cycle counts into metrics.

        Args:
            config_src: label of the configuration origin, reported as is.
            operator: the benchmarked operator; its ``output`` tensor defines
                the amount of data used for the effective bandwidth.
            num_repeats: the number of operator invocations covered by one
                cycle count.
            op_name: the operator name, reported as is.
            op_params: the operator parameters, reported as is.
            clock: the clock frequency in MHz. When ``None`` the value is
                obtained from ``_get_clock_value``.
        """
        output = operator.output
        numels = output.numel()
        # element_size() is the per-element byte size for any dtype, whereas
        # torch.finfo only accepts floating point dtypes.
        numbytes = output.element_size()
        self.out_gib = numels * numbytes / 1024**3
        self.num_repeats = num_repeats
        self.clock = _get_clock_value() if clock is None else clock
        self.op_name = op_name
        self.op_params = op_params
        self.config_src = config_src

    def get_measurement(self, cycles: int) -> Dict[str, Any]:
        """Convert one cycle count into a row of the results table.

        Args:
            cycles: the cycle count recorded for ``num_repeats`` invocations.

        Returns:
            A dictionary with the per-invocation cycles, the clock, the
            per-invocation time, the effective bandwidth and the test case
            description.
        """
        avg_cycles = cycles / self.num_repeats
        # The clock is in MHz, i.e. cycles per microsecond.
        time_us = avg_cycles / self.clock
        time_s = time_us * 10**-6
        effective_bandwidth = self.out_gib / time_s

        return {
            'operator': self.op_name,
            'cycles': avg_cycles,
            'clock (MHz)': self.clock,
            'time (us)': time_us,
            'effective bandwidth (GiB/s)': effective_bandwidth,
            'parameters': self.op_params,
            'config source': self.config_src
        }


================================================
FILE: tests/gnn/benchgnn_ops/ops.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
from typing import List, Tuple

import torch


# Benchmark wrapper around torch.scatter; tensors are generated at
# construction time and stored as buffers.
class ScatterOp(torch.nn.Module):
    def __init__(self, dim: int, input_shape: torch.Size,
                 index_shape: torch.Size, src_shape: torch.Size) -> None:
        """Scatter Op.

        Args:
            dim (int): the axis along which to index.
            input_shape (torch.Size): the scatter input tensor shape.
            index_shape (torch.Size): the indices shape of elements to scatter.
            src_shape (torch.Size): the source element(s) shape to scatter.
        """
        super().__init__()

        self.dim = dim
        # The generation order (input, index, src) is kept: it determines
        # the values obtained for a given random seed.
        input_tensor = torch.randn(*input_shape)
        index_tensor = torch.randint(input_shape[dim], index_shape)
        src_tensor = torch.randn(*src_shape)
        for name, tensor in (('input', input_tensor),
                             ('index', index_tensor), ('src', src_tensor)):
            self.register_buffer(name, tensor)
        # One eager invocation provides the initial output buffer.
        first_result = self(input_tensor, index_tensor, src_tensor, None)
        self.register_buffer('output', first_result[-1])

    def loop_inputs(self) -> List[torch.Tensor]:
        # Same order as the parameters of forward.
        return [self.input, self.index, self.src, self.output]

    def forward(
            self,
            input: torch.tensor,
            index: torch.tensor,
            src: torch.tensor,
            output: torch.tensor  # pylint: disable=unused-argument
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        # The first three arguments are passed through unchanged; the last
        # slot carries the freshly computed result.
        scattered = torch.scatter(input, self.dim, index, src)
        return input, index, src, scattered


# Benchmark wrapper around Tensor.scatter_reduce; tensors are generated at
# construction time and stored as buffers.
class ScatterReduceOp(torch.nn.Module):
    def __init__(self,
                 dim: int,
                 input_shape: torch.Size,
                 index_shape: torch.Size,
                 src_shape: torch.Size,
                 reduce: str,
                 include_self: bool = True) -> None:
        """ScatterReduce Op.

        Args:
            dim (int): the axis along which to index.
            input_shape (torch.Size): the scatter input tensor shape.
            index_shape (torch.Size): the indices shape of elements to scatter.
            src_shape (torch.Size): the source element(s) shape to scatter.
            reduce (str): the reduction operation to apply for non-unique
                indices. ("sum", "prod", "mean", "amax", "amin")
            include_self (bool, optional): whether elements from the self
                tensor are included in the reduction. (default: :obj:`True`)
        """
        super().__init__()

        self.dim = dim
        self.reduce = reduce
        self.include_self = include_self
        # The generation order (input, index, src) is kept: it determines
        # the values obtained for a given random seed.
        input_tensor = torch.randn(*input_shape)
        index_tensor = torch.randint(input_shape[dim], index_shape)
        src_tensor = torch.randn(*src_shape)
        for name, tensor in (('input', input_tensor),
                             ('index', index_tensor), ('src', src_tensor)):
            self.register_buffer(name, tensor)
        # One eager invocation provides the initial output buffer.
        first_result = self(input_tensor, index_tensor, src_tensor, None)
        self.register_buffer('output', first_result[-1])

    def loop_inputs(self) -> List[torch.Tensor]:
        # Same order as the parameters of forward.
        return [self.input, self.index, self.src, self.output]

    def forward(
            self,
            input: torch.tensor,
            index: torch.tensor,
            src: torch.tensor,
            output: torch.tensor  # pylint: disable=unused-argument
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        # The first three arguments are passed through unchanged; the last
        # slot carries the freshly computed result.
        reduced = input.scatter_reduce(self.dim,
                                       index,
                                       src,
                                       self.reduce,
                                       include_self=self.include_self)
        return input, index, src, reduced


# Benchmark wrapper around Tensor.index_reduce; tensors are generated at
# construction time and stored as buffers.
class IndexReduceOp(torch.nn.Module):
    def __init__(self,
                 dim: int,
                 input_shape: torch.Size,
                 index_shape: torch.Size,
                 src_shape: torch.Size,
                 reduce: str,
                 include_self: bool = True) -> None:
        """IndexReduce Op.

        Args:
            dim: the axis along which to index.
            input_shape: the index reduce input tensor shape.
            index_shape: the indices shape of elements to select from.
            src_shape: the source element(s) shape.
            reduce: the reduction operation to apply ("prod",
                    "mean", "amax", "amin")
            include_self: whether elements from the self tensor are included in
                          the reduction
        """
        super().__init__()

        self.dim = dim
        self.reduce = reduce
        self.include_self = include_self
        input = torch.randn(*input_shape)
        index = torch.randint(input_shape[dim], index_shape)
        src = torch.randn(*src_shape)
        self.register_buffer('input', input)
        self.register_buffer('index', index)
        self.register_buffer('src', src)
        # One eager invocation provides the initial output buffer.
        self.register_buffer('output', self(input, index, src, None)[-1])

    def loop_inputs(self) -> List[torch.Tensor]:
        # Same order as the parameters of forward.
        return [self.input, self.index, self.src, self.output]

    def forward(
            self,
            input: torch.tensor,
            index: torch.tensor,
            src: torch.tensor,
            output: torch.tensor  # pylint: disable=unused-argument
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        # Use the out-of-place variant: the in-place ``index_reduce_`` would
        # overwrite the registered ``input`` buffer (so every invocation would
        # start from the previous result) and make ``output`` an alias of
        # ``input``, unlike the other operators of this module.
        reduced = input.index_reduce(self.dim,
                                     index,
                                     src,
                                     self.reduce,
                                     include_self=self.include_self)
        return input, index, src, reduced


# Benchmark wrapper around Tensor.index_select; tensors are generated at
# construction time and stored as buffers.
class IndexSelectOp(torch.nn.Module):
    def __init__(self, dim: int, input_shape: torch.Size,
                 index_size: int) -> None:
        """IndexSelect Op.

        Args:
            dim: the axis along which to index.
            input_shape: the input tensor shape.
            index_size: the indices size.
        """

        super().__init__()
        self.dim = dim
        # The generation order (input, index) is kept: it determines the
        # values obtained for a given random seed.
        input_tensor = torch.randn(*input_shape)
        index_tensor = torch.randint(input_shape[dim], (index_size, ))
        for name, tensor in (('input', input_tensor), ('index',
                                                       index_tensor)):
            self.register_buffer(name, tensor)
        # One eager invocation provides the initial output buffer.
        first_result = self(input_tensor, index_tensor, None)
        self.register_buffer('output', first_result[-1])

    def loop_inputs(self) -> List[torch.Tensor]:
        # Same order as the parameters of forward.
        return [self.input, self.index, self.output]

    def forward(self, input: torch.tensor, index: torch.tensor, _: torch.tensor
                ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # The first two arguments are passed through unchanged; the last
        # slot carries the freshly computed result.
        selected = input.index_select(dim=self.dim, index=index)
        return input, index, selected


# Benchmark wrapper around torch.gather; tensors are generated at
# construction time and stored as buffers.
class GatherOp(torch.nn.Module):
    def __init__(self, dim: int, input_shape: torch.Size,
                 index_shape: torch.Size) -> None:
        """Gather Op.

        Args:
            dim: the axis along which to index.
            input_shape: the gather input tensor shape.
            index_shape: the indices shape of elements to gather.
        """

        super().__init__()
        self.dim = dim
        input = torch.randn(*input_shape)
        index = torch.randint(input_shape[dim], index_shape)
        self.register_buffer('input', input)
        self.register_buffer('index', index)
        # One eager invocation provides the initial output buffer.
        self.register_buffer('output', self(input, index, None)[-1])

    def loop_inputs(self) -> List[torch.Tensor]:
        # Same order as the parameters of forward.
        return [self.input, self.index, self.output]

    def forward(self, input: torch.tensor, index: torch.tensor,
                _: torch.tensor
                ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # The first two arguments are passed through unchanged; the last
        # slot carries the freshly computed result.
        return input, index, torch.gather(input, self.dim, index)


# Registry of the available benchmark operators: maps a name to the
# operator class defined above.
bench_ops = {
    'gather': GatherOp,
    'index_reduce': IndexReduceOp,
    'index_select': IndexSelectOp,
    'scatter': ScatterOp,
    'scatter_reduce': ScatterReduceOp
}


================================================
FILE: tests/gnn/benchgnn_ops/requirements.txt
================================================
jsonargparse==4.19.0
docstring-parser==0.15
tqdm==4.64.1


================================================
FILE: tests/gnn/conftest.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import os.path as osp
import pytest
import torch_geometric as pyg


@pytest.fixture(scope="module")
def pyg_qm9(pytestconfig):
    """QM9 dataset read from the ``qm9`` folder of the external datasets
    directory; raises ``RuntimeError`` when that folder is missing."""
    datasets_dir = pytestconfig.getoption("external_datasets_dir")
    qm9root = osp.join(datasets_dir, "qm9")
    if osp.exists(qm9root):
        return pyg.datasets.QM9(root=qm9root)
    raise RuntimeError(f'Path {qm9root} not exists.')


@pytest.fixture(scope="module")
def planetoid_cora(pytestconfig):
    """Cora Planetoid dataset (with normalized features) read from the
    ``planetoid`` folder of the external datasets directory; raises
    ``RuntimeError`` when that folder is missing."""
    datasets_dir = pytestconfig.getoption("external_datasets_dir")
    planetoid_root = osp.join(datasets_dir, "planetoid")
    if osp.exists(planetoid_root):
        normalize = pyg.transforms.NormalizeFeatures()
        return pyg.datasets.Planetoid(planetoid_root,
                                      "Cora",
                                      transform=normalize)
    raise RuntimeError(f'Path {planetoid_root} not exists.')


@pytest.fixture(scope="module")
def molecule(pyg_qm9):
    """Single sample of the QM9 dataset (see the index comment below)."""
    # The index of the largest molecule in the QM9 dataset, which looks like:
    # Data(edge_attr=[56, 4], edge_index=[2, 56], idx=[1], name="gdb_57518",
    #      pos=[29, 3], x=[29, 11], y=[1, 19], z=[29])
    largest_molecule_index = 55967
    largest_molecule = pyg_qm9[largest_molecule_index]
    return largest_molecule


@pytest.fixture(scope="module")
def fake_small_dataset() -> pyg.datasets.FakeDataset:
    """Seeded fake dataset of 10 graphs."""
    pyg.seed_everything(42)
    settings = {'num_graphs': 10, 'avg_num_nodes': 30, 'avg_degree': 5}
    return pyg.datasets.FakeDataset(**settings)


@pytest.fixture(scope="module")
def fake_large_dataset() -> pyg.datasets.FakeDataset:
    """Seeded fake dataset of 100 graphs."""
    pyg.seed_everything(42)
    return pyg.datasets.FakeDataset(num_graphs=100, avg_num_nodes=10)


@pytest.fixture(scope="module")
def fake_node_task_dataset() -> pyg.datasets.FakeDataset:
    """Seeded fake dataset of 500 graphs created with ``task='node'``."""
    pyg.seed_everything(42)
    settings = {'num_graphs': 500, 'avg_num_nodes': 10, 'task': 'node'}
    return pyg.datasets.FakeDataset(**settings)


@pytest.fixture(scope="module")
def fake_hetero_dataset() -> pyg.datasets.FakeHeteroDataset:
    """Seeded fake heterogeneous dataset of 100 graphs."""
    pyg.seed_everything(1410)
    settings = {
        'num_graphs': 100,
        'num_node_types': 2,
        'num_edge_types': 5,
        'avg_num_nodes': 50,
    }
    return pyg.datasets.FakeHeteroDataset(**settings)


@pytest.fixture(scope="module")
def fake_node_task_hetero_dataset() -> pyg.datasets.FakeHeteroDataset:
    """Seeded fake heterogeneous dataset of 100 graphs created with
    ``task='node'``."""
    pyg.seed_everything(1410)
    settings = {
        'num_graphs': 100,
        'num_node_types': 2,
        'num_edge_types': 5,
        'avg_num_nodes': 50,
        'task': 'node',
    }
    return pyg.datasets.FakeHeteroDataset(**settings)


@pytest.fixture(scope="module")
def fake_hetero_data() -> pyg.data.HeteroData:
    """First (and only) graph of a seeded, single-graph fake heterogeneous
    dataset.

    The fixture returns one element of the dataset, not the dataset itself,
    hence the ``HeteroData`` return annotation.
    """
    pyg.seed_everything(1410)
    dataset = pyg.datasets.FakeHeteroDataset(num_graphs=1,
                                             num_node_types=2,
                                             num_edge_types=5,
                                             avg_num_nodes=50)
    return dataset[0]


@pytest.fixture(scope="module")
def fake_molecular_dataset() -> pyg.datasets.FakeDataset:
    """Seeded fake dataset of 1000 graphs created with ``task="graph"``."""
    # setup a dataset which looks like a molecular dataset.
    pyg.seed_everything(42)
    return pyg.datasets.FakeDataset(
        num_graphs=1000,
        avg_num_nodes=20,
        avg_degree=3,
        num_channels=20,
        task="graph",
    )


================================================
FILE: tests/gnn/nn/aggr/aggr_utils.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from typing import List
import torch
import torch_geometric

from poptorch_geometric import TrainingStepper


def aggr_harness(aggr,
                 dim_size,
                 dataloader=None,
                 post_proc=None,
                 sorted_index=False,
                 loss_fn=torch.nn.MSELoss(),
                 num_steps=4,
                 atol=5e-3,
                 rtol=5e-3,
                 equal_nan=False,
                 enable_fp_exception=True):
    """Wrap an aggregation module and run it through a ``TrainingStepper``.

    Args:
        aggr: the aggregation module under test.
        dim_size: passed as the last model input of every step; used as the
            ``dim_size`` of the aggregation call.
        dataloader: source of batches providing ``x``, ``edge_index`` and
            ``nodes_mask``. When ``None`` no step is run.
        post_proc: optional module applied to the aggregation result.
        sorted_index: selects which row of ``edge_index`` is used for
            aggregation and which one for broadcasting ``x``.
        loss_fn: loss with a ``reduction`` attribute equal to ``'sum'`` or
            ``'mean'``.
        num_steps: maximum number of batches taken from ``dataloader``.
        atol: forwarded to ``TrainingStepper``.
        rtol: forwarded to ``TrainingStepper``.
        equal_nan: forwarded to ``TrainingStepper``.
        enable_fp_exception: forwarded to ``TrainingStepper``.
    """

    class AggrWrapper(torch.nn.Module):
        """Module computing the aggregation and, in training mode, a loss
        against a target of ones."""

        def __init__(self, aggr, loss_fn, post_proc=None):
            assert hasattr(loss_fn, 'reduction')
            # No support for other reduction types yet
            assert loss_fn.reduction in ('sum', 'mean')

            super().__init__()
            self.aggr = aggr
            self.loss_fn = loss_fn
            self.post_proc = post_proc
            self.mean_reduction_in_loss = (loss_fn.reduction == 'mean')

        def forward(self, *args):
            # Positional inputs: (x, edge_index, nodes_mask, size).
            x = args[0]
            edge_index = args[1]
            nodes_mask = args[2]
            size = args[3]

            # `sorted_index` is captured from the enclosing aggr_harness call.
            broadcast_index = edge_index[1] if sorted_index else edge_index[0]
            aggr_index = edge_index[0] if sorted_index else edge_index[1]

            x_broadcasted = torch.index_select(x, 0, broadcast_index)
            kwargs = {}
            # These aggregation types additionally receive `max_num_elements`.
            if isinstance(self.aggr,
                          (torch_geometric.nn.aggr.SortAggregation,
                           torch_geometric.nn.aggr.GRUAggregation,
                           torch_geometric.nn.aggr.GraphMultisetTransformer,
                           torch_geometric.nn.aggr.SetTransformerAggregation,
                           torch_geometric.nn.aggr.LSTMAggregation)):
                kwargs["max_num_elements"] = size

            result = self.aggr(x_broadcasted,
                               aggr_index,
                               dim_size=size,
                               **kwargs)

            if self.post_proc is not None:
                if isinstance(result, List):
                    # A list result is concatenated; the mask is repeated
                    # once per list element to match.
                    nodes_mask = nodes_mask.repeat(len(result))
                    result = torch.cat(result)
                result = self.post_proc(result)
                # Apply nodes mask, so that the loss may be computed properly
                result[~nodes_mask] = 0

            if self.training:
                # NOTE(review): unlike the branch above, the mask is not
                # repeated here when the result is a list - confirm this is
                # intended.
                if isinstance(result, List):
                    result = torch.cat(result)
                target = torch.ones_like(result)
                target[~nodes_mask] = 0

                loss = self.loss_fn(result, target)
                # In case, the loss function applies mean reduction, the result
                # has to be rescaled by the effective size of the batch
                # (excluding padding).
                if self.mean_reduction_in_loss:
                    real_size = torch.count_nonzero(nodes_mask)
                    loss = loss * size / real_size

                return result, loss

            return result

    model = AggrWrapper(aggr, loss_fn=loss_fn, post_proc=post_proc)

    stepper = TrainingStepper(model,
                              atol=atol,
                              rtol=rtol,
                              equal_nan=equal_nan,
                              enable_fp_exception=enable_fp_exception)

    if dataloader is not None:
        for step, batch in enumerate(dataloader):
            # Stop once `num_steps` batches have been processed.
            if step == num_steps:
                break

            # One stepper run per batch.
            stepper.run(
                1, (batch.x, batch.edge_index, batch.nodes_mask, dim_size))


================================================
FILE: tests/gnn/nn/aggr/conftest.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
from torch_geometric import seed_everything
from torch_geometric.datasets import FakeDataset
from torch_geometric.transforms import NormalizeFeatures

from poptorch_geometric.dataloader import FixedSizeDataLoader
from poptorch_geometric.fixed_size_options import FixedSizeOptions
from poptorch_geometric.pyg_dataloader import FixedSizeStrategy


@pytest.fixture
def dataloader():
    """Fixed size data loader over a small seeded fake dataset."""
    seed_everything(42)

    fake_dataset = FakeDataset(num_graphs=4,
                               avg_num_nodes=8,
                               avg_degree=3,
                               transform=NormalizeFeatures(),
                               num_channels=8)
    fixed_size = FixedSizeOptions(num_nodes=12, num_edges=32)

    return FixedSizeDataLoader(
        fake_dataset,
        fixed_size_options=fixed_size,
        fixed_size_strategy=FixedSizeStrategy.StreamPack,
        add_pad_masks=True)


================================================
FILE: tests/gnn/nn/aggr/test_attention.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import torch
from torch_geometric.nn import MLP
from torch_geometric.nn.aggr import AttentionalAggregation

from aggr_utils import aggr_harness


def test_attentional_aggregation(dataloader):
    """Run ``AttentionalAggregation`` through the aggregation harness."""
    sample = next(iter(dataloader))
    num_features = sample.num_node_features

    # Modules are created in the same order as before (gate, nn, aggregation,
    # post-processing) so that their random initialisation is unchanged.
    gate_nn = MLP([num_features, 1], act='relu')
    nn = MLP([num_features, num_features], act='relu')
    aggr = AttentionalAggregation(gate_nn, nn)
    post_proc = torch.nn.Linear(num_features, num_features * 2)

    aggr_harness(aggr,
                 sample.num_nodes,
                 dataloader,
                 post_proc,
                 atol=1e-3,
                 rtol=5e-3)


================================================
FILE: tests/gnn/nn/aggr/test_basic.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch
from torch_geometric.nn import (
    MaxAggregation,
    MeanAggregation,
    MinAggregation,
    MulAggregation,
    PowerMeanAggregation,
    SoftmaxAggregation,
    StdAggregation,
    SumAggregation,
    VarAggregation,
)

from aggr_utils import aggr_harness


@pytest.mark.parametrize('Aggregation', [
    MeanAggregation,
    SumAggregation,
    MaxAggregation,
    MinAggregation,
    MulAggregation,
    VarAggregation,
    StdAggregation,
])
def test_basic_aggregation(dataloader, Aggregation):
    """Run a parameter-free aggregation through the aggregation harness."""
    sample = next(iter(dataloader))
    num_features = sample.num_node_features

    # The aggregation is created before the linear layer, as before.
    aggr = Aggregation()
    post_proc = torch.nn.Linear(num_features, num_features * 2)

    aggr_harness(aggr, sample.num_nodes, dataloader, post_proc)


@pytest.mark.parametrize('Aggregation', [
    SoftmaxAggregation,
    PowerMeanAggregation,
])
@pytest.mark.parametrize('learn', [True, False])
def test_gen_aggregation(dataloader, Aggregation, learn):
    """Run Softmax/PowerMean aggregation through aggr_harness.

    Each aggregation is exercised with ``learn`` both on and off, followed
    by a Linear post-processing layer that doubles the channel count.
    """
    sample = next(iter(dataloader))
    num_features = sample.num_node_features

    aggr = Aggregation(learn=learn)
    post_proc = torch.nn.Linear(num_features, num_features * 2)

    # PowerMeanAggregation is the only case that is run with
    # equal_nan=True and enable_fp_exception=False.
    is_power_mean = isinstance(aggr, PowerMeanAggregation)

    aggr_harness(aggr,
                 sample.num_nodes,
                 dataloader,
                 post_proc,
                 equal_nan=is_power_mean,
                 enable_fp_exception=not is_power_mean)


@pytest.mark.parametrize('Aggregation', [
    SoftmaxAggregation,
    PowerMeanAggregation,
])
def test_learnable_channels_aggregation(dataloader, Aggregation):
    """Run Softmax/PowerMean aggregation with ``learn=True`` and an
    explicit ``channels`` argument through aggr_harness."""
    sample = next(iter(dataloader))

    aggr = Aggregation(learn=True, channels=sample.num_node_features)

    # PowerMeanAggregation is the only case that is run with
    # equal_nan=True and enable_fp_exception=False.
    is_power_mean = isinstance(aggr, PowerMeanAggregation)

    aggr_harness(aggr,
                 sample.num_nodes,
                 dataloader,
                 equal_nan=is_power_mean,
                 enable_fp_exception=not is_power_mean)


================================================
FILE: tests/gnn/nn/aggr/test_deep_sets.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from torch_geometric.nn import DeepSetsAggregation, Linear

from aggr_utils import aggr_harness


def test_deep_sets_aggregation(dataloader):
    """Run DeepSetsAggregation with Linear local/global networks through
    aggr_harness."""
    sample = next(iter(dataloader))
    width = sample.num_node_features

    # The local network is created before the global one, as before.
    local_nn = Linear(width, width * 2)
    global_nn = Linear(width * 2, width * 4)
    aggr = DeepSetsAggregation(local_nn=local_nn, global_nn=global_nn)
    aggr.reset_parameters()

    aggr_harness(aggr, sample.num_nodes, dataloader)


================================================
FILE: tests/gnn/nn/aggr/test_equilibrium.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
from torch_geometric.nn import EquilibriumAggregation

from aggr_utils import aggr_harness


@pytest.mark.skip(reason="TODO(AFS-354)")
@pytest.mark.parametrize('grad_iter', [0, 1, 5])
def test_equilibrium(dataloader, grad_iter):
    """Run EquilibriumAggregation through aggr_harness for several
    ``grad_iter`` values (currently skipped, see the skip reason)."""
    sample = next(iter(dataloader))
    in_width = sample.num_node_features
    out_width = in_width // 2

    aggr = EquilibriumAggregation(in_width,
                                  out_width,
                                  num_layers=[10, 10],
                                  grad_iter=grad_iter)

    aggr_harness(aggr, sample.num_nodes, dataloader)


================================================
FILE: tests/gnn/nn/aggr/test_fused.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch
from torch_geometric.nn.aggr.fused import FusedAggregation

from aggr_utils import aggr_harness


@pytest.mark.parametrize('aggrs', [
    ['sum', 'mean', 'min', 'max', 'mul', 'var', 'std'],
    ['sum', 'min', 'max', 'mul', 'var', 'std'],
    ['min', 'max', 'mul', 'var', 'std'],
    ['mean', 'min', 'max', 'mul', 'var', 'std'],
    ['sum', 'min', 'max', 'mul', 'std'],
    ['mean', 'min', 'max', 'mul', 'std'],
    ['min', 'max', 'mul', 'std'],
])
def test_fused_aggregation(dataloader, aggrs):
    """Run FusedAggregation over several aggregator combinations through
    aggr_harness, with a Linear post-processing layer."""
    sample = next(iter(dataloader))
    num_features = sample.num_node_features

    aggr = FusedAggregation(aggrs)
    post_proc = torch.nn.Linear(num_features, num_features * 2)

    aggr_harness(aggr, sample.num_nodes, dataloader, post_proc)


================================================
FILE: tests/gnn/nn/aggr/test_gmt.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
from torch_geometric.nn.aggr import GraphMultisetTransformer

from aggr_utils import aggr_harness


@pytest.mark.skip(reason="TODO(AFS-351)")
def test_graph_multiset_transformer(dataloader):
    """Run GraphMultisetTransformer (k=2, heads=2) through aggr_harness.

    The test is currently skipped (see the skip reason). The harness is
    called with sorted_index=True, enable_fp_exception=False and
    equal_nan=True.
    """
    first_sample = next(iter(dataloader))
    channels = first_sample.num_node_features
    aggr = GraphMultisetTransformer(channels, k=2, heads=2)
    aggr.reset_parameters()

    aggr_harness(aggr,
                 first_sample.num_nodes,
                 dataloader,
                 sorted_index=True,
                 enable_fp_exception=False,
                 equal_nan=True)


================================================
FILE: tests/gnn/nn/aggr/test_gru.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from torch_geometric.nn import GRUAggregation

from aggr_utils import aggr_harness


def test_gru_aggregation(dataloader):
    """Run GRUAggregation (output width twice the input width) through
    aggr_harness with sorted_index=True."""
    sample = next(iter(dataloader))
    in_width = sample.num_node_features

    aggr = GRUAggregation(in_width, in_width * 2)
    aggr.reset_parameters()

    aggr_harness(aggr, sample.num_nodes, dataloader, sorted_index=True)


================================================
FILE: tests/gnn/nn/aggr/test_lstm.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from torch_geometric.nn import LSTMAggregation

from aggr_utils import aggr_harness


def test_lstm_aggregation(dataloader):
    """Run LSTMAggregation (output width twice the input width) through
    aggr_harness with sorted_index=True."""
    sample = next(iter(dataloader))
    in_width = sample.num_node_features

    aggr = LSTMAggregation(in_width, in_width * 2)

    aggr_harness(aggr, sample.num_nodes, dataloader, sorted_index=True)


================================================
FILE: tests/gnn/nn/aggr/test_mlp_aggr.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from torch_geometric.nn import MLPAggregation

from aggr_utils import aggr_harness


def test_mlp_aggregation(dataloader):
    """Run a single-layer MLPAggregation through aggr_harness.

    ``max_num_elements`` is set to the node count of the first sample.
    """
    sample = next(iter(dataloader))
    in_width = sample.num_node_features
    node_count = sample.num_nodes

    aggr = MLPAggregation(
        in_channels=in_width,
        out_channels=in_width * 2,
        max_num_elements=node_count,
        num_layers=1,
    )

    aggr_harness(aggr, sample.num_nodes, dataloader, sorted_index=True)


================================================
FILE: tests/gnn/nn/aggr/test_multi.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch
from torch_geometric.nn import MultiAggregation

from aggr_utils import aggr_harness


@pytest.mark.parametrize('mode', [
    'cat', 'proj', 'attn', 'sum', 'mean', 'max', 'min', 'logsumexp', 'std',
    'var'
])
def test_multi_aggr(dataloader, mode):
    """Run MultiAggregation over ['mean', 'sum', 'max'] for every combine
    mode through aggr_harness, with a Linear post-processing layer."""
    sample = next(iter(dataloader))
    in_channels = sample.num_node_features
    out_channels = in_channels * 2

    # Only the 'proj' and 'attn' modes receive extra keyword arguments.
    if mode == 'proj':
        mode_kwargs = {
            'in_channels': in_channels,
            'out_channels': in_channels,
        }
    elif mode == 'attn':
        mode_kwargs = {
            'in_channels': in_channels,
            'out_channels': in_channels,
            'num_heads': in_channels // 4,
        }
    else:
        mode_kwargs = None

    aggrs = ['mean', 'sum', 'max']
    aggr = MultiAggregation(aggrs, mode=mode, mode_kwargs=mode_kwargs)
    aggr.reset_parameters()

    if mode == 'cat':
        # The 'cat' combine mode expands the output dimension by the
        # number of aggregators.
        in_channels *= len(aggrs)
        out_channels *= len(aggrs)

    post_proc = torch.nn.Linear(in_channels, out_channels)

    aggr_harness(aggr,
                 sample.num_nodes,
                 dataloader,
                 post_proc,
                 atol=1e-3)


================================================
FILE: tests/gnn/nn/aggr/test_quantile.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch
from torch_geometric.nn import MedianAggregation, QuantileAggregation

from aggr_utils import aggr_harness


@pytest.mark.parametrize('q', [0., .1, .2, .3, .4, .5, .6, .7, .8, .9, 1.])
@pytest.mark.parametrize('interpolation', QuantileAggregation.interpolations)
def test_quantile_aggregation(dataloader, q, interpolation):
    """Run QuantileAggregation for each quantile/interpolation pair
    through aggr_harness with sorted_index=True."""
    # Seed torch first, before any sample is read or module is created.
    torch.manual_seed(42)
    sample = next(iter(dataloader))
    num_features = sample.num_node_features

    aggr = QuantileAggregation(q=q, interpolation=interpolation)
    post_proc = torch.nn.Linear(num_features, num_features * 2)

    aggr_harness(aggr,
                 sample.num_nodes,
                 dataloader,
                 post_proc,
                 sorted_index=True)


def test_median_aggregation(dataloader):
    """Run MedianAggregation through aggr_harness with a Linear
    post-processing layer."""
    sample = next(iter(dataloader))
    num_features = sample.num_node_features

    aggr = MedianAggregation()
    post_proc = torch.nn.Linear(num_features, num_features * 2)

    # NOTE(review): unlike the quantile test in this file, sorted_index is
    # not passed here - confirm whether that is intentional.
    aggr_harness(aggr, sample.num_nodes, dataloader, post_proc)


================================================
FILE: tests/gnn/nn/aggr/test_scaler.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch
from torch_geometric.nn import DegreeScalerAggregation

from aggr_utils import aggr_harness


@pytest.mark.parametrize('scaler',
                         [['identity'], ['amplification'], ['attenuation'],
                          ['linear'], ['inverse_linear']])
@pytest.mark.parametrize('train_norm', [True, False])
def test_degree_scaler_aggregation(dataloader, scaler, train_norm):
    """Run DegreeScalerAggregation over ['mean', 'sum', 'max'] for each
    scaler, with ``train_norm`` on and off, through aggr_harness."""
    sample = next(iter(dataloader))
    num_features = sample.num_node_features

    deg = torch.tensor([2, 5, 3, 1, 2, 3, 4, 1, 2, 0])

    basic_aggrs = ['mean', 'sum', 'max']
    num_aggrs = len(basic_aggrs)
    aggr = DegreeScalerAggregation(basic_aggrs,
                                   scaler,
                                   deg,
                                   train_norm=train_norm)
    # The post-processing layer is sized for one block of channels per
    # aggregator on both its input and its output.
    post_proc = torch.nn.Linear(num_features * num_aggrs,
                                num_features * 2 * num_aggrs)

    aggr_harness(aggr, sample.num_nodes, dataloader, post_proc)


================================================
FILE: tests/gnn/nn/aggr/test_set2set.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from torch_geometric.nn.aggr import Set2Set

from aggr_utils import aggr_harness


def test_set2set(dataloader):
    """Run Set2Set with a single processing step through aggr_harness."""
    sample = next(iter(dataloader))

    aggr = Set2Set(in_channels=sample.num_node_features, processing_steps=1)

    aggr_harness(aggr, sample.num_nodes, dataloader)


================================================
FILE: tests/gnn/nn/aggr/test_set_transformer.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
from torch_geometric.nn.aggr import SetTransformerAggregation

from aggr_utils import aggr_harness


@pytest.mark.skip(reason="TODO(AFS-351)")
def test_set_transformer_aggregation(dataloader):
    """Run SetTransformerAggregation (2 seed points, 2 heads) through
    aggr_harness (currently skipped, see the skip reason)."""
    sample = next(iter(dataloader))

    aggr = SetTransformerAggregation(sample.num_node_features,
                                     num_seed_points=2,
                                     heads=2)
    aggr.reset_parameters()

    aggr_harness(aggr,
                 sample.num_nodes,
                 dataloader,
                 sorted_index=True,
                 enable_fp_exception=False,
                 equal_nan=True)


================================================
FILE: tests/gnn/nn/aggr/test_sort.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import torch
from torch_geometric.nn.aggr import SortAggregation

from aggr_utils import aggr_harness


def test_sort_aggregation(dataloader):
    """Run SortAggregation with k=5 through aggr_harness.

    The Linear post-processing layer is sized ``k`` times the channel
    counts on both its input and its output.
    """
    sample = next(iter(dataloader))
    num_features = sample.num_node_features

    top_k = 5
    aggr = SortAggregation(k=top_k)
    post_proc = torch.nn.Linear(top_k * num_features,
                                top_k * (num_features * 2))

    aggr_harness(aggr,
                 sample.num_nodes,
                 dataloader,
                 post_proc,
                 sorted_index=True)


================================================
FILE: tests/gnn/nn/conftest.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest

from torch_geometric import seed_everything
from torch_geometric.datasets import FakeDataset
from torch_geometric.transforms import Compose, GCNNorm, NormalizeFeatures

from poptorch_geometric.dataloader import FixedSizeDataLoader, DataLoader
from poptorch_geometric.fixed_size_options import FixedSizeOptions
from poptorch_geometric.pyg_dataloader import FixedSizeStrategy


def get_dataset(num_channels=16):
    """Return the first graph of a FakeDataset built after
    ``seed_everything(0)``.

    The dataset uses the GCNNorm and NormalizeFeatures transforms. The
    dataset's ``num_classes`` is copied onto the returned graph.

    Args:
        num_channels: Forwarded to FakeDataset as ``num_channels``.
    """
    seed_everything(0)

    fake = FakeDataset(avg_num_nodes=32,
                       avg_degree=5,
                       transform=Compose([GCNNorm(),
                                          NormalizeFeatures()]),
                       num_channels=num_channels)
    graph = fake[0]
    graph.num_classes = fake.num_classes

    return graph


@pytest.fixture
def dataset():
    """Fixture returning the graph produced by get_dataset() with its
    default arguments."""
    graph = get_dataset()
    return graph


@pytest.fixture
def fake_dataset():
    """Fixture returning a 4-graph FakeDataset with 10 channels, built
    after ``seed_everything(0)`` and using NormalizeFeatures."""
    seed_everything(0)

    return FakeDataset(num_graphs=4,
                       avg_num_nodes=8,
                       avg_degree=3,
                       transform=NormalizeFeatures(),
                       num_channels=10)


@pytest.fixture
def fixed_size_dataloader(fake_dataset):
    """Fixture wrapping ``fake_dataset`` in a FixedSizeDataLoader that
    uses the StreamPack strategy, num_nodes=12 and pad masks enabled."""
    options = FixedSizeOptions(num_nodes=12)
    return FixedSizeDataLoader(
        fake_dataset,
        fixed_size_options=options,
        fixed_size_strategy=FixedSizeStrategy.StreamPack,
        add_pad_masks=True)


@pytest.fixture
def dataloader(fake_dataset):
    """Fixture wrapping ``fake_dataset`` in a DataLoader with shuffling
    disabled."""
    return DataLoader(fake_dataset, shuffle=False)


================================================
FILE: tests/gnn/nn/conv/conv_utils.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.

import torch
from torch_geometric import seed_everything
from torch_geometric.data import HeteroData

from poptorch_geometric import TrainingStepper


def conv_harness(conv,
                 dataset=None,
                 post_proc=None,
                 loss_fn=torch.nn.MSELoss(),
                 num_steps=4,
                 atol=1e-5,
                 rtol=1e-4,
                 batch=None,
                 training=True):
    """Wrap ``conv`` in a small module and drive it with TrainingStepper.

    The wrapper calls ``conv`` with the batch, applies ``post_proc`` when
    one is given, keeps only the first element if the result is a tuple
    and, while the module is in training mode, also returns
    ``loss_fn(out, ones_like(out))``.

    Args:
        conv: Layer under test.
        dataset: Optional object exposing ``x`` and ``edge_index``; used to
            build the batch when ``batch`` is None.
        post_proc: Optional callable applied to the output of ``conv``.
        loss_fn: Loss applied against an all-ones target.
        num_steps: Forwarded to ``TrainingStepper.run``.
        atol: Forwarded to TrainingStepper.
        rtol: Forwarded to TrainingStepper.
        batch: Explicit inputs; takes precedence over ``dataset``.
        training: If True call ``stepper.run(num_steps, batch)``, otherwise
            ``stepper.run_inference(batch)``.
    """

    class ConvWrapper(torch.nn.Module):
        def __init__(self, conv, loss_fn, post_proc=None):
            super().__init__()
            self.conv = conv
            self.loss_fn = loss_fn
            self.post_proc = post_proc

        def forward(self, *args):
            out = self.conv(*args)
            if self.post_proc is not None:
                out = self.post_proc(out)

            # Tuple results are reduced to their first element; this
            # happens after post-processing.
            out = out[0] if isinstance(out, tuple) else out

            if not self.training:
                return out

            loss = self.loss_fn(out, torch.ones_like(out))
            return out, loss

    model = ConvWrapper(conv, loss_fn=loss_fn, post_proc=post_proc)

    # Fall back to the (x, edge_index) pair of the dataset when no explicit
    # batch was supplied.
    if batch is None and dataset is not None:
        batch = (dataset.x, dataset.edge_index)

    stepper = TrainingStepper(model, atol=atol, rtol=rtol)
    if not training:
        stepper.run_inference(batch)
        return
    stepper.run(num_steps, batch)


def generate_edge_index(num_src_nodes, num_dst_nodes, num_edges):
    """Return a random ``(2, num_edges)`` long tensor of edges.

    Row 0 holds source indices drawn from ``[0, num_src_nodes)`` and row 1
    holds destination indices drawn from ``[0, num_dst_nodes)``.
    """
    shape = (num_edges, )
    # Sources are sampled before destinations.
    sources = torch.randint(num_src_nodes, shape, dtype=torch.long)
    targets = torch.randint(num_dst_nodes, shape, dtype=torch.long)
    return torch.stack((sources, targets), dim=0)


def random_heterodata(in_channels=None):
    """Build a small random HeteroData graph after ``seed_everything(0)``.

    The graph has 6 'author', 5 'paper' and 4 'term' nodes with random
    features, plus random author-author, author-paper and paper-term
    edges.

    Args:
        in_channels: Optional mapping from node type to feature width.
            Defaults to ``{'author': 16, 'paper': 12, 'term': 3}``.

    Returns:
        A ``(data, in_channels)`` tuple.
    """
    seed_everything(0)

    if in_channels is None:
        in_channels = {'author': 16, 'paper': 12, 'term': 3}

    node_counts = {'author': 6, 'paper': 5, 'term': 4}
    edge_specs = (
        ('author', 'author', 15),
        ('author', 'paper', 10),
        ('paper', 'term', 8),
    )

    data = HeteroData()
    # Features first, then edges, in the same order as the random draws of
    # the original straight-line code.
    for node_type in ('author', 'paper', 'term'):
        data[node_type].x = torch.randn(node_counts[node_type],
                                        in_channels[node_type])

    for src, dst, num_edges in edge_specs:
        data[(src, dst)].edge_index = generate_edge_index(
            node_counts[src], node_counts[dst], num_edges)
    return data, in_channels


def hetero_conv_harness(conv,
                        data,
                        output_key,
                        forward_args=None,
                        loss_fn=torch.nn.MSELoss(),
                        num_steps=4,
                        atol=1e-3,
                        rtol=1e-2,
                        enable_fp_exception=True):
    """Wrap a heterogeneous ``conv`` and drive it with TrainingStepper.

    The wrapper calls ``conv`` with the selected inputs, picks
    ``output_key`` from the result and, while the module is in training
    mode, also returns ``loss_fn(out, ones_like(out))``.

    Args:
        conv: Layer under test.
        data: Object whose attributes named in ``forward_args`` are used
            as the inputs.
        output_key: Key used to index the output of ``conv``.
        forward_args: Attribute names to read from ``data``. Defaults to
            ``['x_dict', 'edge_index_dict']``.
        loss_fn: Loss applied against an all-ones target.
        num_steps: Forwarded to ``TrainingStepper.run``.
        atol: Forwarded to TrainingStepper.
        rtol: Forwarded to TrainingStepper.
        enable_fp_exception: Forwarded to TrainingStepper.
    """
    arg_names = forward_args
    if arg_names is None:
        arg_names = ['x_dict', 'edge_index_dict']

    class ConvWrapper(torch.nn.Module):
        def __init__(self, conv, loss_fn):
            super().__init__()
            self.conv = conv
            self.loss_fn = loss_fn

        def forward(self, *args):
            selected = self.conv(*args)[output_key]
            if not self.training:
                return selected
            loss = self.loss_fn(selected, torch.ones_like(selected))
            return selected, loss

    model = ConvWrapper(conv, loss_fn)

    stepper = TrainingStepper(model,
                              atol=atol,
                              rtol=rtol,
                              enable_fp_exception=enable_fp_exception)
    inputs = []
    for arg_name in arg_names:
        inputs.append(getattr(data, arg_name))
    stepper.run(num_steps, inputs)


================================================
FILE: tests/gnn/nn/conv/test_agnn_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import AGNNConv
from conv_utils import conv_harness

# Keyword arguments used to build the AGNNConv under test.
conv_kwargs = {"add_self_loops": False}


def test_agnn_conv(dataset):
    """Run an AGNNConv built from ``conv_kwargs`` through conv_harness."""
    layer = AGNNConv(**conv_kwargs)
    conv_harness(layer, dataset)


================================================
FILE: tests/gnn/nn/conv/test_antisymmetric_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import AntiSymmetricConv
from torch_geometric.nn.conv import GCNConv

from conv_utils import conv_harness


def test_antisymmetric_conv(dataset):
    """Run AntiSymmetricConv, with a bias-free GCNConv as ``phi``, through
    conv_harness."""
    width = dataset.num_node_features
    inner = GCNConv(width, width, bias=False, add_self_loops=False)
    layer = AntiSymmetricConv(width, phi=inner)

    conv_harness(layer, dataset)


================================================
FILE: tests/gnn/nn/conv/test_appnp.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import torch
from torch_geometric.nn import APPNP
from conv_utils import conv_harness

# Output width of the Linear post-processing layer.
out_channels = 16
# Keyword arguments shared by the APPNP layer under test.
conv_kwargs = {"add_self_loops": False}


def test_appnp(dataset):
    """Run APPNP (K=10, alpha=0.1, no dropout) through conv_harness with a
    Linear post-processing layer."""
    # The Linear layer is created before the conv, as before.
    post_proc = torch.nn.Linear(dataset.num_node_features, out_channels)
    layer = APPNP(K=10, alpha=0.1, dropout=0.0, **conv_kwargs)

    conv_harness(layer, dataset, post_proc=post_proc)


================================================
FILE: tests/gnn/nn/conv/test_arma_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import ARMAConv
from conv_utils import conv_harness

# Output width of the ARMAConv under test.
out_channels = 32


def test_arma_conv(dataset):
    """Run ARMAConv with 8 stacks of 4 layers through conv_harness."""
    layer = ARMAConv(dataset.num_node_features,
                     out_channels,
                     num_stacks=8,
                     num_layers=4)

    conv_harness(layer, dataset)


================================================
FILE: tests/gnn/nn/conv/test_cg_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import pytest
from torch_geometric.nn import CGConv
from conv_utils import conv_harness


@pytest.mark.parametrize('batch_norm', [False])
def test_cg_conv(dataset, batch_norm):
    """Run CGConv through conv_harness (only ``batch_norm=False`` is
    parametrized)."""
    layer = CGConv(dataset.num_node_features, batch_norm=batch_norm)

    conv_harness(layer, dataset)


================================================
FILE: tests/gnn/nn/conv/test_cheb_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import torch
import pytest
from torch_geometric.nn import ChebConv

from conv_utils import conv_harness


@pytest.mark.skip(
    reason="ChebConv won't work, because algorithm requires removing "
    "self loops and we are adding self loops to ensure that "
    "tensors have fixed size.")
def test_cheb_conv(dataset):
    """Run ChebConv (K=3) through conv_harness with four input variants.

    The variants are: the default dataset batch, an added edge weight, an
    added scalar ``lambda_max``, and a two-way batch vector with a
    ``lambda_max`` tensor. The test is currently skipped (see the skip
    reason).
    """
    layer = ChebConv(dataset.num_node_features,
                     32,
                     K=3,
                     add_self_loops=False)
    conv_harness(layer, dataset)

    weighted = (dataset.x, dataset.edge_index, dataset.edge_weight)
    conv_harness(layer, batch=weighted)

    # Same inputs, followed by no batch vector and a scalar lambda_max.
    conv_harness(layer, batch=weighted + (None, 3.0))

    # Nodes with an index above the midpoint are assigned to batch 1, all
    # others to batch 0.
    num_nodes = dataset.num_nodes
    midpoint = num_nodes // 2
    batch_mask = torch.tensor([int(i > midpoint) for i in range(num_nodes)])
    lambda_max = torch.tensor([2.0, 3.0])
    conv_harness(layer,
                 batch=(dataset.x, dataset.edge_index, dataset.edge_weight,
                        batch_mask, lambda_max))


================================================
FILE: tests/gnn/nn/conv/test_cluster_gcn_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import ClusterGCNConv
from conv_utils import conv_harness


def test_cluster_gcn_conv(dataset):
    """Run ClusterGCNConv (diag_lambda=1, no self loops, 32 output
    channels) through conv_harness."""
    layer = ClusterGCNConv(dataset.num_node_features,
                           32,
                           diag_lambda=1.,
                           add_self_loops=False)
    conv_harness(layer, dataset)


================================================
FILE: tests/gnn/nn/conv/test_dna_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch
from torch_geometric.nn import DNAConv

from conv_utils import conv_harness

# Keyword-argument sets for DNAConv, one per parametrized run.
# NOTE(review): the first two entries are identical - confirm whether a
# different configuration was intended for one of them.
conv_kwargs_list = [
    {
        'heads': 4,
        'groups': 8,
    },
    {
        'heads': 4,
        'groups': 8,
    },
    {
        'heads': 4,
        'groups': 8,
        'cached': True
    },
]


@pytest.mark.parametrize('conv_kwargs', conv_kwargs_list)
def test_dna_conv(conv_kwargs):
    """Run DNAConv on a 4-node star graph with random
    (nodes, layers, channels) features through conv_harness."""
    channels = 32
    num_layers = 3
    # Node 0 is connected to nodes 1-3 in both directions.
    edge_index = torch.tensor([[0, 0, 0, 1, 2, 3], [1, 2, 3, 0, 0, 0]])
    num_nodes = edge_index.max().item() + 1
    # The features are drawn before the conv is created, as before.
    features = torch.randn((num_nodes, num_layers, channels))

    layer = DNAConv(channels,
                    dropout=0.0,
                    add_self_loops=False,
                    **conv_kwargs)
    conv_harness(layer, batch=(features, edge_index))


================================================
FILE: tests/gnn/nn/conv/test_edge_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from torch.nn import Linear as Lin
from torch.nn import ReLU
from torch.nn import Sequential as Seq
from torch_geometric.nn import DynamicEdgeConv, EdgeConv
from conv_utils import conv_harness

# Output width of the MLPs used by the edge-conv tests in this file.
out_channels = 32


def test_edge_conv(dataset):
    """Run EdgeConv with a two-layer MLP through conv_harness."""
    width = dataset.num_node_features
    # The first Linear layer takes twice the node feature width.
    mlp = Seq(
        Lin(width * 2, width),
        ReLU(),
        Lin(width, out_channels),
    )
    layer = EdgeConv(mlp)

    conv_harness(layer, dataset)


def test_dynamic_edge_conv(dataset):
    """Run DynamicEdgeConv (k=2) with a two-layer MLP through
    conv_harness, passing only the node features as the batch."""
    width = dataset.num_node_features
    mlp = Seq(
        Lin(width * 2, width),
        ReLU(),
        Lin(width, out_channels),
    )
    layer = DynamicEdgeConv(mlp, k=2)

    # Only the node features are supplied; no edge_index is passed.
    features_only = (dataset.x, )
    conv_harness(layer, dataset, batch=features_only)


================================================
FILE: tests/gnn/nn/conv/test_eg_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import pytest
from torch_geometric.nn import EGConv
from conv_utils import conv_harness

# Keyword-argument sets for EGConv, one per parametrized run.
conv_kwargs_list = [
    {"add_self_loops": False},
    {"add_self_loops": False, "aggregators": ["max", "min"]},
]


@pytest.mark.parametrize('conv_kwargs', conv_kwargs_list)
def test_eg_conv(dataset, conv_kwargs):
    """Run EGConv with 32 output channels through conv_harness for each
    keyword-argument set."""
    layer = EGConv(dataset.num_node_features, 32, **conv_kwargs)

    conv_harness(layer, dataset)


================================================
FILE: tests/gnn/nn/conv/test_fa_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import FAConv
from conv_utils import conv_harness

# Keyword arguments shared by the FAConv layer under test.
conv_kwargs = {"add_self_loops": False}


def test_fa_conv(dataset):
    """Run FAConv (eps=1.0) through conv_harness.

    The node features are passed twice, followed by the edge index.
    """
    layer = FAConv(dataset.num_node_features, eps=1.0, **conv_kwargs)
    inputs = (dataset.x, dataset.x, dataset.edge_index)

    conv_harness(layer, dataset, batch=inputs)


================================================
FILE: tests/gnn/nn/conv/test_feast_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import FeaStConv
from conv_utils import conv_harness

# Output width of the FeaStConv under test.
out_channels = 32
# Keyword arguments shared by the FeaStConv layer under test.
conv_kwargs = {"add_self_loops": False}


def test_feast_conv(dataset):
    """Run FeaStConv with two heads through conv_harness."""
    layer = FeaStConv(dataset.num_node_features,
                      out_channels,
                      heads=2,
                      **conv_kwargs)

    conv_harness(layer, dataset)


================================================
FILE: tests/gnn/nn/conv/test_film_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import pytest
from torch_geometric.nn import FiLMConv
from conv_utils import conv_harness

# Output width of the FiLMConv under test.
out_channels = 32


@pytest.mark.parametrize('num_relations', [1])
def test_film_conv(dataset, num_relations):
    """Run FiLMConv through conv_harness (only ``num_relations=1`` is
    parametrized)."""
    layer = FiLMConv(dataset.num_node_features,
                     out_channels,
                     num_relations=num_relations)

    conv_harness(layer, dataset)


================================================
FILE: tests/gnn/nn/conv/test_gat_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import pytest
from torch_geometric.nn import GATConv
from conv_utils import conv_harness

# Output width (per head) of the GATConv under test.
out_channels = 32
# Keyword-argument sets for GATConv, one per parametrized run.
conv_kwargs_list = [
    {
        'edge_dim': None
    },
    {
        'edge_dim': 1,
        'fill_value': 0.5
    },
    {
        'edge_dim': 1,
        'fill_value': 'mean'
    },
    {
        'edge_dim': 4,
        'fill_value': 0.5
    },
    {
        'edge_dim': 4,
        'fill_value': 'mean'
    },
]


@pytest.mark.parametrize('conv_kwargs', conv_kwargs_list)
def test_gat_conv(dataset, conv_kwargs):
    """Run GATConv with two heads and no self loops through conv_harness
    for each keyword-argument set."""
    in_channels = dataset.num_node_features
    # Build a fresh dict instead of writing into ``conv_kwargs``: the
    # parametrized dicts are module-level objects shared by every run, so
    # mutating them would leak state out of the test.
    kwargs = {**conv_kwargs, "add_self_loops": False}

    conv = GATConv(in_channels, out_channels, heads=2, **kwargs)
    conv_harness(conv, dataset)


================================================
FILE: tests/gnn/nn/conv/test_gated_graph_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import GatedGraphConv
from conv_utils import conv_harness

# NOTE(review): not referenced by the test below - confirm whether it is
# still needed.
out_channels = 32


def test_gated_graph_conv(dataset):
    """Run a three-layer GatedGraphConv, sized from the dataset's node
    feature width, through conv_harness."""
    layer = GatedGraphConv(dataset.num_node_features, num_layers=3)

    conv_harness(layer, dataset)


================================================
FILE: tests/gnn/nn/conv/test_gatv2_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import pytest
from torch_geometric.nn import GATv2Conv
from conv_utils import conv_harness

# Output width (per head) of the GATv2Conv under test.
out_channels = 32
# Keyword-argument sets for GATv2Conv, one per parametrized run.
conv_kwargs_list = [
    {
        'edge_dim': None
    },
    {
        'edge_dim': 1,
        'fill_value': 0.5
    },
    {
        'edge_dim': 1,
        'fill_value': 'mean'
    },
    {
        'edge_dim': 4,
        'fill_value': 0.5
    },
    {
        'edge_dim': 4,
        'fill_value': 'mean'
    },
]


@pytest.mark.parametrize('conv_kwargs', conv_kwargs_list)
def test_gatv2_conv(dataset, conv_kwargs):
    """Run GATv2Conv with two heads and no self loops through conv_harness
    (atol=1e-4, rtol=1e-3) for each keyword-argument set."""
    in_channels = dataset.num_node_features
    # Build a fresh dict instead of writing into ``conv_kwargs``: the
    # parametrized dicts are module-level objects shared by every run, so
    # mutating them would leak state out of the test.
    kwargs = {**conv_kwargs, "add_self_loops": False}
    conv = GATv2Conv(in_channels, out_channels, heads=2, **kwargs)

    conv_harness(conv, dataset, atol=1e-4, rtol=1e-3)


================================================
FILE: tests/gnn/nn/conv/test_gcn2_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import torch
from torch_geometric.nn import GCN2Conv
from conv_utils import conv_harness

# NOTE(review): not referenced by the test below - confirm whether it is
# still needed.
out_channels = 16


def test_gcn2_conv(dataset):
    """Run GCN2Conv (alpha=0.2, no self loops) for a single step through
    conv_harness.

    The batch is the node features, a random tensor of the same shape and
    the edge index.
    """
    in_channels = dataset.num_node_features
    conv = GCN2Conv(in_channels, alpha=0.2, add_self_loops=False)
    # The second feature tensor is drawn after the conv is created.
    x2 = torch.randn_like(dataset.x)
    batch = (dataset.x, x2, dataset.edge_index)
    conv_harness(conv, dataset, batch=batch, num_steps=1)


================================================
FILE: tests/gnn/nn/conv/test_gcn_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import pytest
from torch_geometric.nn import GCNConv
from conv_utils import conv_harness

# Output width of the GCNConv under test.
out_channels = 32
# Keyword arguments shared by the GCNConv layer under test.
conv_kwargs = {'add_self_loops': False}


@pytest.mark.parametrize('flow', ['source_to_target', 'target_to_source'])
def test_gcn_conv(dataset, flow):
    """Run GCNConv through conv_harness for both message-passing flow
    directions."""
    in_channels = dataset.num_node_features
    # ``flow`` must be passed by keyword: the third positional parameter
    # of GCNConv is ``improved``, so a positional string was silently
    # taken as a truthy ``improved`` and the flow stayed at its default.
    conv = GCNConv(in_channels, out_channels, flow=flow, **conv_kwargs)

    conv_harness(conv, dataset)


================================================
FILE: tests/gnn/nn/conv/test_gen_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch
from torch_geometric.nn import GENConv

from conv_utils import conv_harness


@pytest.mark.parametrize('aggr', ['softmax', 'powermean'])
def test_gen_conv(aggr, dataset):
    """Run GENConv through conv_harness in three configurations.

    The configurations are: a single feature tensor, a pair of feature
    tensors, and a conv built with a pair of input widths fed the same
    pair of tensors.
    """
    width = dataset.num_node_features

    layer = GENConv(width,
                    32,
                    aggr,
                    edge_dim=16,
                    add_self_loops=False,
                    norm='layer')
    conv_harness(layer, dataset)

    # The second feature tensor is drawn only after the first harness run.
    other_x = torch.randn(dataset.x.shape)
    paired = ((dataset.x, other_x), dataset.edge_index)
    conv_harness(layer, dataset, batch=paired)

    layer = GENConv((width, width),
                    32,
                    aggr,
                    add_self_loops=False,
                    norm='layer')
    conv_harness(layer, dataset, batch=paired)


================================================
FILE: tests/gnn/nn/conv/test_general_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import pytest
import torch
from torch_geometric.nn import GeneralConv
from conv_utils import conv_harness

out_channels = 32
num_edge_attr = 16

conv_kwargs_list = [{
    'skip_linear': True
}, {
    'directed_msg': False
}, {
    'heads': 3
}, {
    'attention': True
}, {
    'heads': 3,
    'attention': True
}, {
    'heads': 3,
    'attention': True,
    'attention_type': 'dot_product'
}, {
    'l2_normalize': True
}]


@pytest.mark.parametrize('conv_kwargs', conv_kwargs_list)
def test_general_conv(dataset, conv_kwargs):
    """Run the conv harness on GeneralConv with a random per-edge tensor."""
    conv = GeneralConv(dataset.num_node_features, out_channels, num_edge_attr,
                       **conv_kwargs)

    edge_features = torch.randn(dataset.num_edges, num_edge_attr)
    conv_harness(conv,
                 dataset,
                 batch=(dataset.x, dataset.edge_index, edge_features))


================================================
FILE: tests/gnn/nn/conv/test_gin_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import torch
from torch.nn import Linear as Lin
from torch.nn import ReLU
from torch.nn import Sequential as Seq
from torch_geometric.nn import GINConv, GINEConv
from conv_utils import conv_harness

# NOTE(review): the tests in this file hard-code 32 for the MLP widths and do
# not reference this constant.
out_channels = 32


def test_gin_conv(dataset):
    """Run the conv harness on a GINConv wrapping a two-layer MLP."""
    mlp = Seq(Lin(dataset.num_node_features, 32), ReLU(), Lin(32, 32))
    gin = GINConv(mlp, train_eps=True)
    conv_harness(gin, dataset)


def test_gine_conv(dataset):
    """Run the conv harness on a GINEConv with a random 16-column edge input."""
    mlp = Seq(Lin(dataset.num_node_features, 32), ReLU(), Lin(32, 32))
    gine = GINEConv(mlp, train_eps=True)

    edge_features = torch.randn(dataset.num_edges, 16)
    conv_harness(gine,
                 dataset,
                 batch=(dataset.x, dataset.edge_index, edge_features))


================================================
FILE: tests/gnn/nn/conv/test_gmm_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch
from torch_geometric.nn import GMMConv

from conv_utils import conv_harness


@pytest.mark.parametrize('separate_gaussians', [True, False])
def test_gmm_conv(separate_gaussians, dataset):
    """Run the conv harness on GMMConv with a random 3-column edge input."""
    layer_options = {
        'dim': 3,
        'kernel_size': 25,
        'separate_gaussians': separate_gaussians,
        'add_self_loops': False,
    }
    conv = GMMConv(dataset.num_node_features, 32, **layer_options)
    edge_values = torch.rand(dataset.num_edges, 3)
    conv_harness(conv, batch=(dataset.x, dataset.edge_index, edge_values))


@pytest.mark.parametrize('separate_gaussians', [True, False])
def test_gmm_conv_bipartite(separate_gaussians, dataset):
    """Run GMMConv built from a channel pair on tuple node inputs."""
    num_features = dataset.num_node_features
    conv = GMMConv((num_features, num_features),
                   32,
                   dim=3,
                   kernel_size=5,
                   separate_gaussians=separate_gaussians,
                   add_self_loops=False)
    edge_values = torch.rand(dataset.num_edges, 3)
    second_x = torch.randn(dataset.x.shape)

    # First with both feature tensors present, then with the second as None.
    for node_inputs in ((dataset.x, second_x), (dataset.x, None)):
        conv_harness(conv,
                     batch=(node_inputs, dataset.edge_index, edge_values))


================================================
FILE: tests/gnn/nn/conv/test_gps_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch
from torch_geometric.nn import GPSConv, SAGEConv

from conv_utils import conv_harness


@pytest.mark.skip(reason="TODO(AFS-279, AFS-162)")
@pytest.mark.parametrize('norm', [None, 'batch_norm', 'layer_norm'])
def test_gps_conv(norm, dataset):
    """Run the conv harness on a GPSConv wrapping a SAGEConv (skipped)."""
    inner_conv = SAGEConv(16, 16, add_self_loops=False)
    gps = GPSConv(dataset.num_node_features,
                  conv=inner_conv,
                  heads=4,
                  norm=norm)
    gps.reset_parameters()

    conv_harness(gps, dataset)


@pytest.mark.skip(reason="TODO(AFS-279, AFS-162)")
@pytest.mark.parametrize('norm', [None, 'batch_norm', 'layer_norm'])
def test_gps_conv_with_batch_index_tensor(norm, dataset):
    """Run GPSConv with an explicit int64 batch-index tensor (skipped)."""
    inner_conv = SAGEConv(16, 16, add_self_loops=False)
    gps = GPSConv(dataset.num_node_features,
                  conv=inner_conv,
                  heads=4,
                  norm=norm)
    gps.reset_parameters()

    # Nodes past the midpoint get index 1, all others index 0.
    node_count = dataset.num_nodes
    in_second_half = [idx > node_count // 2 for idx in range(node_count)]
    batch_index = torch.tensor(in_second_half, dtype=torch.int64)

    conv_harness(gps, batch=(dataset.x, dataset.edge_index, batch_index))


================================================
FILE: tests/gnn/nn/conv/test_graph_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import GraphConv
from conv_utils import conv_harness

# Output feature width passed to the GraphConv layer under test.
out_channels = 16


def test_graph_conv(dataset):
    """Run the conv harness on a GraphConv with default options."""
    layer = GraphConv(dataset.num_node_features, out_channels)
    conv_harness(layer, dataset)


================================================
FILE: tests/gnn/nn/conv/test_gravnet_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import torch
from torch_geometric.nn import GravNetConv
from torch_geometric.testing import withPackage

from conv_utils import conv_harness


@withPackage('torch_cluster')
def test_gravnet_conv(dataset):
    """Run the conv harness on GravNetConv with and without batch vectors.

    Four input layouts are checked: a single feature tensor, a single
    feature tensor with a batch vector, a feature pair, and a feature pair
    with a pair of batch vectors.

    The batch vector is built once as an int64 tensor. Previously it was
    passed as a plain Python list in one call and as float32 tensors
    (`torch.Tensor(list)`) in another, neither of which is the integer
    index tensor that the `batch` argument is documented to take.
    """
    in_channels = dataset.num_node_features
    out_channels = 32
    conv = GravNetConv(in_channels,
                       out_channels,
                       space_dimensions=4,
                       propagate_dimensions=8,
                       k=2,
                       add_self_loops=False)
    conv_harness(conv, batch=(dataset.x, ))

    # Nodes past the midpoint belong to example 1, all others to example 0.
    num_nodes = dataset.num_nodes
    batch_index = torch.tensor(
        [1 if i > num_nodes // 2 else 0 for i in range(num_nodes)],
        dtype=torch.long)
    conv_harness(conv, batch=(dataset.x, batch_index))

    x2 = torch.randn_like(dataset.x)
    conv_harness(conv, batch=((dataset.x, x2), ), atol=5e-05, rtol=0.001)
    # Separate tensor objects for the two sides, as in the original test.
    conv_harness(conv,
                 batch=((dataset.x, x2), (batch_index, batch_index.clone())),
                 atol=5e-03,
                 rtol=0.1)


================================================
FILE: tests/gnn/nn/conv/test_han_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import HANConv

from conv_utils import hetero_conv_harness, random_heterodata


def test_han_conv():
    """Run the hetero harness on HANConv with explicit input channels."""
    hetero_data, channels = random_heterodata()

    han = HANConv(channels,
                  16,
                  hetero_data.metadata(),
                  heads=2,
                  add_self_loops=False)
    hetero_conv_harness(han, hetero_data, 'author')


def test_han_conv_lazy():
    """Run the hetero harness on a HANConv built with in_channels=-1."""
    hetero_data, _ = random_heterodata()

    han = HANConv(-1,
                  16,
                  hetero_data.metadata(),
                  heads=2,
                  add_self_loops=False)
    # Eager forward pass before the harness; presumably this materialises the
    # lazily-sized (-1) parameters - TODO confirm.
    han(hetero_data.x_dict, hetero_data.edge_index_dict)
    hetero_conv_harness(han, hetero_data, 'author')


================================================
FILE: tests/gnn/nn/conv/test_heat_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch
from torch_geometric.nn import HEATConv

from conv_utils import conv_harness


@pytest.mark.parametrize('concat', [True, False])
def test_heat_conv(concat):
    """Run the conv harness on HEATConv over a fixed 4-node, 4-edge graph."""
    node_feats = torch.randn(4, 8)
    edges = torch.tensor([[0, 1, 2, 3], [0, 0, 1, 1]])
    edge_feats = torch.randn((4, 2))
    node_types = torch.tensor([0, 0, 1, 2])
    edge_types = torch.tensor([0, 2, 1, 2])

    layer_options = {
        'in_channels': 8,
        'out_channels': 16,
        'num_node_types': 3,
        'num_edge_types': 3,
        'edge_type_emb_dim': 5,
        'edge_dim': 2,
        'edge_attr_emb_dim': 6,
        'heads': 2,
        'concat': concat,
        'add_self_loops': False,
    }
    conv = HEATConv(**layer_options)

    harness_inputs = (node_feats, edges, node_types, edge_types, edge_feats)
    conv_harness(conv, batch=harness_inputs, atol=5e-4, rtol=0.3)


================================================
FILE: tests/gnn/nn/conv/test_hetero_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch

from torch_geometric.data import HeteroData
from torch_geometric.nn import (GATConv, GCNConv, HeteroConv, Linear,
                                MessagePassing, SAGEConv, to_hetero)
import torch_geometric.transforms as T

from conv_utils import hetero_conv_harness


def get_edge_index(num_src_nodes, num_dst_nodes, num_edges):
    """Return a random int64 edge index of shape (2, num_edges).

    Row 0 holds ids drawn from [0, num_src_nodes) and row 1 holds ids drawn
    from [0, num_dst_nodes).
    """
    shape = (num_edges, )
    src_ids = torch.randint(num_src_nodes, shape, dtype=torch.long)
    dst_ids = torch.randint(num_dst_nodes, shape, dtype=torch.long)
    return torch.stack((src_ids, dst_ids), dim=0)


def get_dummy_data():
    """Build a random paper/author HeteroData graph for the tests below.

    The graph has 50 'paper' nodes (32 features) and 30 'author' nodes
    (64 features), random edges for every paper/author combination, a
    3-column edge_attr on paper->author and an edge_weight on paper->paper.
    """
    num_papers, num_authors = 50, 30

    graph = HeteroData()
    graph['paper'].x = torch.randn(num_papers, 32)
    graph['author'].x = torch.randn(num_authors, 64)

    # Assignment order is kept as-is so the random draws stay in sequence.
    graph['paper', 'paper'].edge_index = get_edge_index(
        num_papers, num_papers, 200)
    graph['paper', 'author'].edge_index = get_edge_index(
        num_papers, num_authors, 100)
    graph['paper', 'author'].edge_attr = torch.randn(100, 3)
    graph['author', 'paper'].edge_index = get_edge_index(
        num_authors, num_papers, 100)
    graph['paper', 'paper'].edge_weight = torch.rand(200)
    graph['author', 'author'].edge_index = get_edge_index(
        num_authors, num_authors, 100)
    return graph


@pytest.mark.parametrize('aggr', ['sum', 'mean', 'min', 'max', 'cat', None])
def test_hetero_conv(aggr):
    """Run the hetero harness on one HeteroConv for each aggregation mode."""
    data = get_dummy_data()

    # Sub-convs are created in the same order as the dictionary entries.
    paper_to_paper = GCNConv(-1, 64, add_self_loops=False)
    author_to_paper = SAGEConv((-1, -1), 64, add_self_loops=False)
    paper_to_author = GATConv((-1, -1), 64, edge_dim=3, add_self_loops=False)
    per_edge_type = {
        ('paper', 'to', 'paper'): paper_to_paper,
        ('author', 'to', 'paper'): author_to_paper,
        ('paper', 'to', 'author'): paper_to_author,
    }
    conv = HeteroConv(per_edge_type, aggr=aggr)

    # Eager forward pass before handing the layer to the harness.
    conv(data.x_dict,
         data.edge_index_dict,
         data.edge_attr_dict,
         edge_weight_dict=data.edge_weight_dict)

    hetero_conv_harness(conv,
                        data,
                        'author',
                        forward_args=('x_dict', 'edge_index_dict',
                                      'edge_attr_dict', 'edge_weight_dict'))


@pytest.mark.parametrize('aggr', ['sum', 'mean', 'min', 'max', 'cat'])
@pytest.mark.parametrize('num_layers', [2, 5])
def test_hetero_conv_multiple_layers(aggr, num_layers):
    """Run the hetero harness on a stack of HeteroConv layers with ReLU."""
    data = get_dummy_data()

    def build_layer():
        # Sub-convs are created in the same order as the dictionary entries.
        return HeteroConv(
            {
                ('paper', 'to', 'paper'):
                GCNConv(-1, 64, add_self_loops=False),
                ('author', 'to', 'paper'):
                SAGEConv((-1, -1), 64, add_self_loops=False),
                ('paper', 'to', 'author'):
                GATConv((-1, -1), 64, edge_dim=3, add_self_loops=False),
            },
            aggr=aggr)

    class MultiLayerHeteroConv(torch.nn.Module):
        def __init__(self, num_layers):
            super().__init__()
            self.convs = torch.nn.ModuleList(
                [build_layer() for _ in range(num_layers)])

        def forward(self, x_dict, edge_index_dict, *args, **kwargs):
            # Each layer's output dict, after ReLU, feeds the next layer.
            for layer in self.convs:
                layer_out = layer(x_dict, edge_index_dict, *args, **kwargs)
                x_dict = {
                    node_type: feats.relu()
                    for node_type, feats in layer_out.items()
                }
            return x_dict

    conv = MultiLayerHeteroConv(num_layers)

    # Eager forward pass before handing the model to the harness.
    conv(data.x_dict,
         data.edge_index_dict,
         data.edge_attr_dict,
         edge_weight_dict=data.edge_weight_dict)

    hetero_conv_harness(conv,
                        data,
                        'author',
                        forward_args=('x_dict', 'edge_index_dict',
                                      'edge_attr_dict', 'edge_weight_dict'),
                        enable_fp_exception=False)


@pytest.mark.parametrize('aggr', ['sum', 'mean', 'min', 'max', 'cat'])
@pytest.mark.parametrize('num_layers', [2, 5])
def test_hetero_conv_multiple_layers_with_data_transforms(aggr, num_layers):
    """Run stacked HeteroConv layers on data passed through transforms.

    The dummy graph goes through ToUndirected, AddSelfLoops and
    NormalizeFeatures (in that order) before the model sees it.
    """
    data = get_dummy_data()
    for transform in (T.ToUndirected(), T.AddSelfLoops(),
                      T.NormalizeFeatures()):
        data = transform(data)

    def build_layer():
        # Sub-convs are created in the same order as the dictionary entries.
        return HeteroConv(
            {
                ('paper', 'to', 'paper'):
                GCNConv(-1, 64, add_self_loops=False),
                ('author', 'to', 'author'):
                GCNConv(-1, 64, add_self_loops=False),
                ('author', 'to', 'paper'):
                SAGEConv((-1, -1), 64, add_self_loops=False),
                ('paper', 'to', 'author'):
                GATConv((-1, -1), 64, edge_dim=3, add_self_loops=False),
                ('paper', 'rev_to', 'author'):
                SAGEConv((-1, -1), 64, add_self_loops=False),
                ('author', 'rev_to', 'paper'):
                GATConv((-1, -1), 64, edge_dim=3, add_self_loops=False),
            },
            aggr=aggr)

    class MultiLayerHeteroConv(torch.nn.Module):
        def __init__(self, num_layers):
            super().__init__()
            self.convs = torch.nn.ModuleList(
                [build_layer() for _ in range(num_layers)])

        def forward(self, x_dict, edge_index_dict, *args, **kwargs):
            # Each layer's output dict, after ReLU, feeds the next layer.
            for layer in self.convs:
                layer_out = layer(x_dict, edge_index_dict, *args, **kwargs)
                x_dict = {
                    node_type: feats.relu()
                    for node_type, feats in layer_out.items()
                }
            return x_dict

    conv = MultiLayerHeteroConv(num_layers)

    # Eager forward pass before handing the model to the harness.
    conv(data.x_dict,
         data.edge_index_dict,
         data.edge_attr_dict,
         edge_weight_dict=data.edge_weight_dict)

    hetero_conv_harness(conv,
                        data,
                        'author',
                        forward_args=('x_dict', 'edge_index_dict',
                                      'edge_attr_dict', 'edge_weight_dict'))


# pylint: disable=abstract-method
# pylint: disable=arguments-differ
class CustomConv(MessagePassing):
    """Message-passing layer over three node tensors with 'add' aggregation.

    Each message is the concatenation of the x, y and z entries passed
    through one Linear layer created with an input size of -1.
    """
    def __init__(self, out_channels):
        super().__init__(aggr='add')
        self.lin = Linear(-1, out_channels)

    def forward(self, x, edge_index, y, z):
        """Propagate x, y and z along `edge_index`."""
        node_inputs = {'x': x, 'y': y, 'z': z}
        return self.propagate(edge_index, **node_inputs)

    def message(self, x_j, y_j, z_j):
        """Project the concatenated (x_j, y_j, z_j) tensors with `lin`."""
        stacked = torch.cat((x_j, y_j, z_j), dim=-1)
        return self.lin(stacked)


def test_hetero_conv_with_custom_conv():
    """Run the hetero harness on a HeteroConv made of CustomConv layers."""
    node_counts = {'paper': 50, 'author': 30}
    feature_widths = {'paper': 32, 'author': 64}

    data = HeteroData()
    # Per node type: x first, then the two 3-column tensors y and z.
    for node_type in ('paper', 'author'):
        count = node_counts[node_type]
        data[node_type].x = torch.randn(count, feature_widths[node_type])
        data[node_type].y = torch.randn(count, 3)
        data[node_type].z = torch.randn(count, 3)

    edge_specs = (('paper', 'paper', 200), ('paper', 'author', 100),
                  ('author', 'paper', 100))
    for src, dst, num_edges in edge_specs:
        data[src, dst].edge_index = get_edge_index(node_counts[src],
                                                   node_counts[dst], num_edges)

    conv = HeteroConv(
        {edge_type: CustomConv(64)
         for edge_type in data.edge_types})

    # Eager forward pass before handing the layer to the harness.
    conv(data.x_dict, data.edge_index_dict, data.y_dict, data.z_dict)

    hetero_conv_harness(conv,
                        data,
                        'author',
                        forward_args=('x_dict', 'edge_index_dict', 'y_dict',
                                      'z_dict'))


@pytest.mark.parametrize('aggr', ['sum', 'mean', 'min', 'max'])
def test_to_hetero_transformation_basic(aggr):
    """Run the hetero harness on a two-SAGEConv model passed to to_hetero."""
    data = get_dummy_data()

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = SAGEConv((-1, -1), 64)
            self.conv2 = SAGEConv((-1, -1), 64)

        def forward(self, x, edge_index):
            hidden = self.conv1(x, edge_index).relu()
            return self.conv2(hidden, edge_index)

    hetero_model = to_hetero(Model(), data.metadata(), aggr=aggr)

    # Eager forward pass before handing the model to the harness.
    hetero_model(data.x_dict, data.edge_index_dict)

    hetero_conv_harness(hetero_model,
                        data,
                        'author',
                        forward_args=('x_dict', 'edge_index_dict'))


@pytest.mark.parametrize('aggr', ['sum', 'mean', 'min', 'max'])
def test_to_hetero_transformation_skip_connections(aggr):
    """Run the hetero harness on a to_hetero model with linear skip paths."""
    data = get_dummy_data()

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = SAGEConv((-1, -1), 64)
            self.lin1 = Linear(-1, 64)
            self.conv2 = SAGEConv((-1, -1), 64)
            self.lin2 = Linear(-1, 64)

        def forward(self, x, edge_index):
            # Each stage adds a linear projection of its input to the conv.
            hidden = (self.conv1(x, edge_index) + self.lin1(x)).relu()
            return self.conv2(hidden, edge_index) + self.lin2(hidden)

    hetero_model = to_hetero(Model(), data.metadata(), aggr=aggr)

    # Eager forward pass before handing the model to the harness.
    hetero_model(data.x_dict, data.edge_index_dict)

    hetero_conv_harness(hetero_model,
                        data,
                        'author',
                        forward_args=('x_dict', 'edge_index_dict'))


================================================
FILE: tests/gnn/nn/conv/test_hgt_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from collections import defaultdict

from torch_geometric.nn import HGTConv

from conv_utils import hetero_conv_harness, random_heterodata


def test_hgt_conv_same_dimensions():
    """Run the hetero harness on HGTConv when every channel count is 16."""
    channels = defaultdict(lambda: 16)

    hetero_data, _ = random_heterodata(channels)

    author_width = channels['author']
    paper_width = channels['paper']
    hgt = HGTConv(author_width,
                  paper_width,
                  metadata=hetero_data.metadata(),
                  heads=2)
    hetero_conv_harness(hgt, hetero_data, 'author')


def test_hgt_conv_different_dimensions():
    """Run HGTConv with a per-type channel mapping ('paper' 32, others 16)."""
    channels = defaultdict(lambda: 16)
    channels['paper'] = 32

    hetero_data, _ = random_heterodata(channels)

    hgt = HGTConv(in_channels=channels,
                  out_channels=32,
                  metadata=hetero_data.metadata(),
                  heads=2)

    hetero_conv_harness(hgt, hetero_data, 'author')


def test_hgt_conv_lazy():
    """Run the hetero harness on an HGTConv built with in_channels=-1."""
    channels = defaultdict(lambda: 16)
    channels['paper'] = 32

    hetero_data, _ = random_heterodata(channels)

    hgt = HGTConv(-1, 32, metadata=hetero_data.metadata(), heads=2)

    # Eager forward pass before handing the layer to the harness.
    hgt(hetero_data.x_dict, hetero_data.edge_index_dict)
    hetero_conv_harness(hgt, hetero_data, 'author')


================================================
FILE: tests/gnn/nn/conv/test_hypergraph_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import torch
from torch_geometric.nn import HypergraphConv

from conv_utils import conv_harness


def test_hypergraph_conv_with_more_nodes_than_edges():
    """Run HypergraphConv on a 4-node, 2-hyperedge graph, plain and attentive."""
    torch.manual_seed(42)
    in_channels = 16
    out_channels = 32
    hyperedge_index = torch.tensor([[0, 0, 1, 1, 2, 3], [0, 1, 0, 1, 0, 1]])
    hyperedge_weight = torch.tensor([1.0, 0.5])
    # Counts are derived from the largest index found in each row.
    node_ids, edge_ids = hyperedge_index[0], hyperedge_index[1]
    num_nodes = node_ids.max().item() + 1
    num_edges = edge_ids.max().item() + 1
    x = torch.randn((num_nodes, in_channels))
    hyperedge_attr = torch.randn((num_edges, in_channels))

    plain_conv = HypergraphConv(in_channels,
                                out_channels,
                                add_self_loops=False)
    conv_harness(plain_conv,
                 batch=(x, hyperedge_index, None, None, num_edges))

    attention_conv = HypergraphConv(in_channels,
                                    out_channels,
                                    use_attention=True,
                                    heads=2,
                                    add_self_loops=False)
    attention_inputs = (x, hyperedge_index, hyperedge_weight, hyperedge_attr,
                        num_edges)
    conv_harness(attention_conv, batch=attention_inputs)


def test_hypergraph_conv_with_more_edges_than_nodes():
    """Run HypergraphConv on a 4-node, 5-hyperedge graph, with/without weights."""
    torch.manual_seed(42)
    in_channels = 16
    out_channels = 32
    hyperedge_index = torch.tensor([[0, 0, 1, 1, 2, 3, 3, 3, 2, 1, 2],
                                    [0, 1, 2, 1, 2, 1, 0, 3, 3, 4, 4]])
    hyperedge_weight = torch.tensor([1.0, 0.5, 0.8, 0.2, 0.7])
    # Counts are derived from the largest index found in each row.
    node_ids, edge_ids = hyperedge_index[0], hyperedge_index[1]
    num_nodes = node_ids.max().item() + 1
    num_edges = edge_ids.max().item() + 1
    x = torch.randn((num_nodes, in_channels))

    conv = HypergraphConv(in_channels, out_channels)

    # Same layer: first without, then with per-hyperedge weights.
    for weight in (None, hyperedge_weight):
        conv_harness(conv,
                     batch=(x, hyperedge_index, weight, None, num_edges))


================================================
FILE: tests/gnn/nn/conv/test_le_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import LEConv
from conv_utils import conv_harness

# Output feature width passed to the LEConv layer under test.
out_channels = 16


def test_le_conv(dataset):
    """Run the conv harness on an LEConv with default options."""
    layer = LEConv(dataset.num_node_features, out_channels)
    conv_harness(layer, dataset)


================================================
FILE: tests/gnn/nn/conv/test_lg_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import torch
from torch_geometric.nn import LGConv
from conv_utils import conv_harness

# Output width of the Linear module handed to the harness as `post_proc`.
out_channels = 16


def test_lg_conv(dataset):
    """Run the conv harness on LGConv followed by a Linear post-processor."""
    layer = LGConv()
    projection = torch.nn.Linear(dataset.num_node_features, out_channels)

    conv_harness(layer, dataset, post_proc=projection)


def test_lg_edge_weights_conv(dataset):
    """Run LGConv with the dataset's edge_weight passed as a third input."""
    layer = LGConv()
    projection = torch.nn.Linear(dataset.num_node_features, out_channels)

    weighted_inputs = (dataset.x, dataset.edge_index, dataset.edge_weight)
    conv_harness(layer, dataset, batch=weighted_inputs, post_proc=projection)


================================================
FILE: tests/gnn/nn/conv/test_mf_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import torch
from torch_geometric.nn import MFConv

from conv_utils import conv_harness


def test_mf_conv(dataset):
    """Exercise MFConv with a single feature tensor and with feature pairs."""
    num_features = dataset.num_node_features
    out_width = 32

    # Layer built from one channel count, fed the harness' default inputs.
    single_conv = MFConv(num_features, out_width, add_self_loops=False)
    conv_harness(single_conv, dataset)

    # Layer built from a pair of channel counts.
    pair_conv = MFConv((num_features, num_features),
                       out_width,
                       add_self_loops=False)
    second_x = torch.randn(dataset.x.shape)

    # First with both feature tensors present, then with the second as None.
    for node_inputs in ((dataset.x, second_x), (dataset.x, None)):
        conv_harness(pair_conv, batch=(node_inputs, dataset.edge_index))


================================================
FILE: tests/gnn/nn/conv/test_nn_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import torch
from torch.nn import Linear as Lin
from torch.nn import ReLU
from torch.nn import Sequential as Seq
from torch_geometric.nn import NNConv
from conv_utils import conv_harness

# Output feature width passed to the NNConv layer under test.
out_channels = 16


def test_nn_conv(dataset):
    """Run the conv harness on NNConv with a random 3-column edge input.

    The edge network has to emit `in_channels * out_channels` values per
    edge, so its last layer is sized from those two numbers instead of a
    hard-coded `8 * 32`, which only matched when in_channels was 16.
    """
    in_channels = dataset.num_node_features
    nn = Seq(Lin(3, 32), ReLU(), Lin(32, in_channels * out_channels))
    conv = NNConv(in_channels, out_channels, nn=nn)

    value = torch.rand(dataset.num_edges, 3)
    batch = (dataset.x, dataset.edge_index, value)

    conv_harness(conv, dataset, batch=batch)


================================================
FILE: tests/gnn/nn/conv/test_pan_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
from torch_geometric.nn import PANConv

from conv_utils import conv_harness


@pytest.mark.skip(reason="TODO(AFS-262)")
def test_pan_conv(dataset):
    """Run the conv harness on a PANConv with filter_size=2 (skipped)."""
    layer = PANConv(dataset.num_node_features,
                    32,
                    filter_size=2,
                    add_self_loops=False)
    conv_harness(layer, dataset)


================================================
FILE: tests/gnn/nn/conv/test_pdn_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import torch
from torch_geometric.nn import PDNConv
from conv_utils import conv_harness

# Output feature width passed to the PDNConv layer under test.
out_channels = 16


def test_pdn_conv(dataset):
    """Run the conv harness on PDNConv with a random 8-column edge input."""
    layer_options = {
        'edge_dim': 8,
        'hidden_channels': 128,
        'add_self_loops': False,
    }
    conv = PDNConv(dataset.num_node_features, out_channels, **layer_options)

    edge_features = torch.randn(dataset.num_edges, 8)
    conv_harness(conv,
                 dataset,
                 batch=(dataset.x, dataset.edge_index, edge_features))


================================================
FILE: tests/gnn/nn/conv/test_pna_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import torch
from torch_geometric.nn import PNAConv
from conv_utils import conv_harness

# Output feature width passed to the PNAConv layer under test.
out_channels = 16

# Aggregator and scaler names passed positionally to PNAConv in the test.
aggregators = ['sum', 'mean', 'min', 'max', 'var', 'std']
scalers = [
    'identity', 'amplification', 'attenuation', 'linear', 'inverse_linear'
]


def test_pna_conv(dataset):
    """Run the conv harness on PNAConv with a random 3-column edge input."""
    # Degree histogram computed from the dataset wrapped in a one-item list.
    degree_histogram = PNAConv.get_degree_histogram([dataset])

    conv = PNAConv(dataset.num_node_features,
                   out_channels,
                   aggregators,
                   scalers,
                   deg=degree_histogram,
                   edge_dim=3,
                   towers=4)

    edge_features = torch.rand(dataset.num_edges, 3)
    conv_harness(conv,
                 dataset,
                 batch=(dataset.x, dataset.edge_index, edge_features))


================================================
FILE: tests/gnn/nn/conv/test_point_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import torch
from torch.nn import Linear as Lin
from torch.nn import ReLU
from torch.nn import Sequential as Seq
from torch_geometric.nn import PointNetConv
from conv_utils import conv_harness

# NOTE(review): neither test in this file references this constant; the MLP
# widths below are hard-coded.
out_channels = 16


def test_point_net_conv(dataset):
    """Run the conv harness on PointNetConv with one random position tensor."""
    # The local MLP takes 16 + 3 inputs; the global MLP maps 32 -> 32.
    local_mlp = Seq(Lin(16 + 3, 32), ReLU(), Lin(32, 32))
    global_mlp = Seq(Lin(32, 32))
    conv = PointNetConv(local_mlp, global_mlp, add_self_loops=False)

    positions = torch.rand(dataset.num_nodes, 3)
    conv_harness(conv,
                 dataset,
                 batch=(dataset.x, positions, dataset.edge_index))


def test_point2_net_conv(dataset):
    """Run the conv harness on PointNetConv with a pair of position tensors."""
    # The local MLP takes 16 + 3 inputs; the global MLP maps 32 -> 32.
    local_mlp = Seq(Lin(16 + 3, 32), ReLU(), Lin(32, 32))
    global_mlp = Seq(Lin(32, 32))
    conv = PointNetConv(local_mlp, global_mlp, add_self_loops=False)

    first_pos = torch.rand(dataset.num_nodes, 3)
    second_pos = torch.rand(dataset.num_nodes, 3)

    conv_harness(conv,
                 dataset,
                 batch=(dataset.x, (first_pos, second_pos),
                        dataset.edge_index))


================================================
FILE: tests/gnn/nn/conv/test_point_gnn_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import torch
from torch_geometric import seed_everything
from torch_geometric.nn import MLP, PointGNNConv

from conv_utils import conv_harness


def test_pointgnn_conv():
    """Run the conv harness on PointGNNConv over a fixed 6-node graph."""
    seed_everything(42)
    node_feats = torch.rand(6, 8)
    positions = torch.rand(6, 3)
    edges = torch.tensor([[0, 1, 1, 1, 2, 5], [1, 2, 3, 4, 3, 4]])

    # The three MLPs are created in the order h, f, g.
    offset_mlp = MLP([8, 16, 3], norm='LayerNorm')
    edge_mlp = MLP([3 + 8, 16, 8], norm='LayerNorm')
    update_mlp = MLP([8, 16, 8], norm='LayerNorm')
    conv = PointGNNConv(mlp_h=offset_mlp, mlp_f=edge_mlp, mlp_g=update_mlp)

    conv_harness(conv, batch=(node_feats, positions, edges))


================================================
FILE: tests/gnn/nn/conv/test_point_transformer_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import torch
from torch.nn import Linear as Lin
from torch.nn import ReLU
from torch.nn import Sequential as Seq
from torch_geometric.nn import PointTransformerConv
from conv_utils import conv_harness

# Output feature width passed to the PointTransformerConv layers under test.
out_channels = 32


def test_point_transformer_conv(dataset):
    """Run PointTransformerConv without explicit pos_nn / attn_nn modules."""
    conv = PointTransformerConv(dataset.num_node_features,
                                out_channels,
                                add_self_loops=False)

    positions = torch.rand(dataset.num_nodes, 3)

    harness_inputs = (dataset.x, positions, dataset.edge_index)
    conv_harness(conv, dataset, batch=harness_inputs, atol=1e-4, rtol=1e-3)


def test_point_transformer_nn_conv(dataset):
    """Run PointTransformerConv with explicit position and attention MLPs."""
    # Position MLP is created before the attention MLP, as in the original.
    position_mlp = Seq(Lin(3, 16), ReLU(), Lin(16, 32))
    attention_mlp = Seq(Lin(32, 32), ReLU(), Lin(32, 32))
    conv = PointTransformerConv(dataset.num_node_features,
                                out_channels,
                                position_mlp,
                                attention_mlp,
                                add_self_loops=False)

    positions = torch.rand(dataset.num_nodes, 3)

    harness_inputs = (dataset.x, positions, dataset.edge_index)
    conv_harness(conv, dataset, batch=harness_inputs, atol=1e-3, rtol=1e-2)


================================================
FILE: tests/gnn/nn/conv/test_ppf_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import torch
import torch.nn.functional as F
from torch.nn import Linear as Lin
from torch.nn import ReLU
from torch.nn import Sequential as Seq
from torch_geometric.nn import PPFConv
from conv_utils import conv_harness


def test_ppf_conv(dataset):
    """Run the conv harness on PPFConv with single position/normal tensors."""
    # The local MLP takes 16 + 4 inputs; the global MLP maps 32 -> 32.
    local_mlp = Seq(Lin(16 + 4, 32), ReLU(), Lin(32, 32))
    global_mlp = Seq(Lin(32, 32))
    conv = PPFConv(local_mlp, global_mlp, add_self_loops=False)

    positions = torch.rand(dataset.num_nodes, 3)
    # Random vectors normalised along the last dimension.
    normals = F.normalize(torch.rand(dataset.num_nodes, 3), dim=-1)

    conv_harness(conv,
                 dataset,
                 batch=(dataset.x, positions, normals, dataset.edge_index))


def test_ppf2_conv(dataset):
    """Run the conv harness on PPFConv with position and normal pairs."""
    # The local MLP takes 16 + 4 inputs; the global MLP maps 32 -> 32.
    local_mlp = Seq(Lin(16 + 4, 32), ReLU(), Lin(32, 32))
    global_mlp = Seq(Lin(32, 32))
    conv = PPFConv(local_mlp, global_mlp, add_self_loops=False)

    node_count = dataset.num_nodes
    first_pos = torch.rand(node_count, 3)
    second_pos = torch.rand(node_count, 3)
    # Random vectors normalised along the last dimension.
    first_normal = F.normalize(torch.rand(node_count, 3), dim=-1)
    second_normal = F.normalize(torch.rand(node_count, 3), dim=-1)

    harness_inputs = (dataset.x, (first_pos, second_pos),
                      (first_normal, second_normal), dataset.edge_index)
    conv_harness(conv, dataset, batch=harness_inputs)


================================================
FILE: tests/gnn/nn/conv/test_res_gated_graph_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import ResGatedGraphConv
from conv_utils import conv_harness

# Output feature width passed to the ResGatedGraphConv layer under test.
out_channels = 16


def test_res_gated_graph_conv(dataset):
    """Run the conv harness on a ResGatedGraphConv with default options."""
    layer = ResGatedGraphConv(dataset.num_node_features, out_channels)
    conv_harness(layer, dataset)


================================================
FILE: tests/gnn/nn/conv/test_rgat_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch
from torch_geometric import seed_everything
from torch_geometric.nn import RGATConv

from conv_utils import conv_harness


@pytest.mark.parametrize('mod', [
    'additive',
    'scaled',
    'f-additive',
    'f-scaled',
])
@pytest.mark.parametrize('attention_mechanism', [
    'within-relation',
    'across-relation',
])
@pytest.mark.parametrize('attention_mode', [
    'additive-self-attention',
    'multiplicative-self-attention',
])
def test_rgat_conv(mod, attention_mechanism, attention_mode):
    """Run the conv harness on RGATConv over a fixed 4-node graph.

    Only the 'across-relation' mechanism combined with mod 'additive' and
    'additive-self-attention' actually runs; every other parametrisation
    is skipped.
    """
    seed_everything(0)

    if attention_mechanism == 'within-relation':
        pytest.skip("Condition from torch.nonzero is used to compute softmax. "
                    "Fixed size tensor can change softmax result.")

    is_supported_combination = (mod == 'additive' and
                                attention_mode == 'additive-self-attention')
    if not is_supported_combination:
        pytest.skip("TODO(AFS-200)")

    node_feats = torch.randn(4, 8)
    edges = torch.tensor([[0, 1, 2, 3], [0, 0, 1, 1]])
    edge_types = torch.tensor([0, 2, 1, 2])
    edge_feats = torch.randn((4, 8))

    layer_options = {
        'num_relations': 4,
        'num_bases': 4,
        'mod': mod,
        'attention_mechanism': attention_mechanism,
        'attention_mode': attention_mode,
        'heads': 2,
        'dim': 1,
        'edge_dim': 8,
        'add_self_loops': False,
    }
    conv = RGATConv(8, 20, **layer_options)

    conv_harness(conv, batch=(node_feats, edges, edge_types, edge_feats))


================================================
FILE: tests/gnn/nn/conv/test_rgcn_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch
from torch_geometric.nn import FastRGCNConv, RGCNConv
from conv_utils import conv_harness

out_channels = 16


@pytest.mark.parametrize('rgcn', [FastRGCNConv, RGCNConv])
def test_rgcn_conv(rgcn):
    """Run the relational GCN layer through the conv harness.

    The RGCNConv case is skipped; only FastRGCNConv is exercised. The node
    feature slot of the batch is passed as None.
    """
    if rgcn == RGCNConv:
        pytest.skip("RGCNConv uses dynamic shapes")

    in_channels, out_channels, num_relations = 4, 32, 4
    edges = torch.tensor([[0, 1, 1, 2, 2, 3], [0, 0, 1, 0, 1, 1]])
    relation_of_edge = torch.tensor([0, 1, 1, 0, 0, 1])
    layer = rgcn(in_channels,
                 out_channels,
                 num_relations,
                 num_bases=15,
                 add_self_loops=False)

    conv_harness(layer, batch=(None, edges, relation_of_edge))


================================================
FILE: tests/gnn/nn/conv/test_sage_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import SAGEConv
from conv_utils import conv_harness

out_channels = 16

aggregators = ['sum', 'mean', 'min', 'max', 'var', 'std']


def test_sage_conv(dataset):
    """Run SAGEConv with the module-level aggregator list through the
    conv harness."""
    sage_options = dict(aggr=aggregators,
                        normalize=True,
                        root_weight=True,
                        project=True,
                        bias=True)
    layer = SAGEConv(dataset.num_node_features, out_channels, **sage_options)

    conv_harness(layer, dataset)


================================================
FILE: tests/gnn/nn/conv/test_sg_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import SGConv
from conv_utils import conv_harness

out_channels = 16


def test_sg_conv(dataset):
    """Run SGConv (K=10, no self loops) through the conv harness."""
    layer = SGConv(dataset.num_node_features,
                   out_channels,
                   K=10,
                   add_self_loops=False)

    conv_harness(layer, dataset)


def test_sg_weights_conv(dataset):
    """Run SGConv through the conv harness with explicit edge weights."""
    layer = SGConv(dataset.num_node_features,
                   out_channels,
                   K=10,
                   add_self_loops=False)

    weighted_batch = (dataset.x, dataset.edge_index, dataset.edge_weight)
    conv_harness(layer, dataset, batch=weighted_batch)


================================================
FILE: tests/gnn/nn/conv/test_signed_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import torch
from torch_geometric.nn import SignedConv
from conv_utils import conv_harness

out_channels = 16


def test_signed_conv(dataset):
    """Run a two-layer SignedConv stack through the conv harness.

    The dataset's edge index is passed as both the positive and the
    negative edge set.
    """
    in_channels = dataset.num_node_features

    class Convs(torch.nn.Module):
        """First-aggregation SignedConv followed by a second SignedConv."""

        def __init__(self):
            super().__init__()
            shared = dict(add_self_loops=False)
            self.conv1 = SignedConv(in_channels,
                                    out_channels,
                                    first_aggr=True,
                                    **shared)
            self.conv2 = SignedConv(out_channels,
                                    32,
                                    first_aggr=False,
                                    **shared)

        def forward(self, x, pos_edge_index, neg_edge_index):
            hidden = self.conv1(x, pos_edge_index, neg_edge_index)
            return self.conv2(hidden, pos_edge_index, neg_edge_index)

    stacked = Convs()

    signed_batch = (dataset.x, dataset.edge_index, dataset.edge_index)
    conv_harness(stacked, dataset, batch=signed_batch)


================================================
FILE: tests/gnn/nn/conv/test_simple_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest

import torch
from torch_geometric.nn import SimpleConv

from conv_utils import conv_harness


@pytest.mark.parametrize('combine_root', ['sum', 'cat', 'self_loop', None])
def test_simple_conv(dataset, combine_root):
    """Run SimpleConv through the conv harness with a Linear post-processor.

    For combine_root='cat' the Linear layer is built with twice the
    dataset's node feature count as its input size.
    """
    width_factor = 2 if combine_root == 'cat' else 1
    lin_in_features = dataset.num_node_features * width_factor

    projection = torch.nn.Linear(lin_in_features, 64)
    layer = SimpleConv(combine_root=combine_root)

    conv_harness(layer, dataset, post_proc=projection)


================================================
FILE: tests/gnn/nn/conv/test_spline_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch
from torch_geometric.nn import SplineConv
from torch_geometric.testing import withPackage

from conv_utils import conv_harness


@pytest.mark.parametrize("training", [True, False])
@withPackage('torch_spline_conv')
def test_spline_conv(training):
    """Run SplineConv through the conv harness in inference mode.

    Three input layouts are exercised: a single feature matrix, a pair of
    feature matrices, and a pair whose second entry is None together with
    an explicit size tuple. The training variant is skipped.
    """
    if training:
        # The skip reason used to be the malformed literal
        # 'reason="TODO(AFS-216, AFS-218)'; pass only the message itself.
        pytest.skip("TODO(AFS-216, AFS-218)")
    x1 = torch.randn(4, 4)
    x2 = torch.randn(2, 8)
    edge_index = torch.tensor([[0, 1, 2, 3], [0, 0, 1, 1]])
    # One 3-dimensional pseudo-coordinate per edge (matches dim=3 below).
    value = torch.rand(edge_index[0].size(0), 3)
    conv = SplineConv(4, 32, dim=3, kernel_size=5)

    conv_harness(conv, batch=(x1, edge_index, value), training=training)

    conv = SplineConv((4, 8), 32, dim=3, kernel_size=5)
    batch = ((x1, x2), edge_index, value)
    conv_harness(conv, batch=batch, training=training)

    batch = ((x1, None), edge_index, value, (4, 2))
    conv_harness(conv, batch=batch, training=training)


================================================
FILE: tests/gnn/nn/conv/test_ssg_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import torch
from torch_geometric.nn import SSGConv

from conv_utils import conv_harness


def test_ssg_conv(dataset):
    """Run SSGConv through the conv harness, without and then with random
    per-edge values."""
    layer = SSGConv(dataset.num_node_features,
                    32,
                    alpha=0.1,
                    K=10,
                    add_self_loops=False)
    conv_harness(layer, dataset)

    per_edge_values = torch.rand(dataset.num_edges)
    weighted_batch = (dataset.x, dataset.edge_index, per_edge_values)
    conv_harness(layer, batch=weighted_batch)


================================================
FILE: tests/gnn/nn/conv/test_supergat_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import pytest
from torch_geometric.nn import SuperGATConv
from conv_utils import conv_harness

out_channels = 16


@pytest.mark.skip(reason="TODO(AFS-36)")
@pytest.mark.parametrize('att_type', ['MX', 'SD'])
def test_supergat_conv(dataset, att_type):
    """Run SuperGATConv through the conv harness (currently skipped)."""
    layer_options = dict(heads=2,
                         attention_type=att_type,
                         neg_sample_ratio=1.0,
                         edge_sample_ratio=1.0,
                         add_self_loops=False)
    layer = SuperGATConv(dataset.num_node_features, out_channels,
                         **layer_options)

    conv_harness(layer, dataset)


================================================
FILE: tests/gnn/nn/conv/test_tag_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import TAGConv
from conv_utils import conv_harness

out_channels = 16


def test_tag_conv(dataset):
    """Run TAGConv with default options through the conv harness."""
    layer = TAGConv(dataset.num_node_features, out_channels)

    conv_harness(layer, dataset)


def test_tag_weights_conv(dataset):
    """Run TAGConv through the conv harness with explicit edge weights."""
    layer = TAGConv(dataset.num_node_features, out_channels)

    weighted_batch = (dataset.x, dataset.edge_index, dataset.edge_weight)
    conv_harness(layer, dataset, batch=weighted_batch)


================================================
FILE: tests/gnn/nn/conv/test_transformer_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import TransformerConv
from conv_utils import conv_harness

out_channels = 16


def test_transformer_conv(dataset):
    """Run a two-head TransformerConv with beta=True through the conv
    harness."""
    layer = TransformerConv(dataset.num_node_features,
                            out_channels,
                            heads=2,
                            beta=True)

    conv_harness(layer, dataset)


================================================
FILE: tests/gnn/nn/conv/test_wl_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch
from torch_geometric.nn import WLConv

from conv_utils import conv_harness


@pytest.mark.skip(reason="Algorithm requires reading tensors which "
                  "are placed on the IPU.")
def test_wl_conv():
    """Run WLConv through the conv harness in inference mode (skipped)."""
    node_colours = torch.tensor([1, 0, 0, 1])
    edges = torch.tensor([[0, 1, 1, 2, 2, 3], [1, 0, 2, 1, 3, 2]])
    layer = WLConv()
    # Call the layer once directly before handing it to the harness; the
    # result itself is not needed.
    layer(node_colours, edges)
    conv_harness(layer, batch=(node_colours, edges), training=False)


================================================
FILE: tests/gnn/nn/conv/test_wl_conv_continuous.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import torch
from torch_geometric.nn import WLConvContinuous

from conv_utils import conv_harness


def test_wl_conv_cont(dataset):
    """Run WLConvContinuous through the conv harness with three layouts.

    Covers the plain dataset, an (x, None) feature pair with edge weights,
    and an (x, x2) pair where x2 is random and shaped like x.
    """
    layer = WLConvContinuous()
    projection = torch.nn.Linear(dataset.num_node_features, 8)

    conv_harness(layer, dataset, post_proc=projection)

    edges, weights = dataset.edge_index, dataset.edge_weight
    conv_harness(layer,
                 batch=((dataset.x, None), edges, weights),
                 post_proc=projection)

    other_features = torch.randn(dataset.x.shape)
    conv_harness(layer,
                 batch=((dataset.x, other_features), edges, weights),
                 post_proc=projection)


================================================
FILE: tests/gnn/nn/conv/test_x_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import torch
from torch_geometric.nn import XConv
from torch_geometric.testing import withPackage

from conv_utils import conv_harness


@withPackage('torch_cluster')
def test_x_conv():
    """Run XConv through the conv harness, without and then with a batch
    vector."""
    features = torch.randn(8, 16)
    positions = torch.rand(8, 5)
    graph_of_node = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1])

    layer = XConv(16, 32, dim=5, kernel_size=2, dilation=2)

    torch.manual_seed(0)
    # We need to pass very loose atol and rtol here due to TODO(AFS-276)
    loose = dict(atol=0.1, rtol=0.1)
    conv_harness(layer, batch=(features, positions), **loose)
    conv_harness(layer, batch=(features, positions, graph_of_node), **loose)


================================================
FILE: tests/gnn/nn/dense/dense_utils.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import torch

from poptorch_geometric import TrainingStepper


def dense_harness(dense,
                  batch=None,
                  post_proc=None,
                  loss_fn=torch.nn.MSELoss(),
                  num_steps=4,
                  atol=1e-5,
                  rtol=1e-4):
    """Wrap a dense layer and run it through a TrainingStepper.

    Args:
        dense: Module under test; called with the unpacked batch.
        batch: Inputs handed to the stepper's run() call.
        post_proc: Optional callable applied to the layer output.
        loss_fn: Loss comparing the output with an all-ones target.
        num_steps: Number of steps passed to the stepper.
        atol: Absolute tolerance passed to the stepper.
        rtol: Relative tolerance passed to the stepper.
    """

    class DenseWrapper(torch.nn.Module):
        """Adds optional post-processing and a training loss to a layer."""

        def __init__(self, dense, loss_fn, post_proc=None):
            super().__init__()
            self.dense = dense
            self.loss_fn = loss_fn
            self.post_proc = post_proc

        def forward(self, *args):
            out = self.dense(*args)
            if self.post_proc is not None:
                out = self.post_proc(out)

            # Inference only returns the (post-processed) output.
            if not self.training:
                return out

            loss = self.loss_fn(out, torch.ones_like(out))
            return out, loss

    wrapped = DenseWrapper(dense, loss_fn=loss_fn, post_proc=post_proc)
    TrainingStepper(wrapped, atol=atol, rtol=rtol).run(num_steps, batch)


================================================
FILE: tests/gnn/nn/dense/test_convs.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
from torch_geometric.nn import DenseGCNConv, DenseGraphConv, DenseGINConv, DenseSAGEConv
import torch
from torch.nn import Linear as Lin
from torch.nn import ReLU
from torch.nn import Sequential as Seq
from dense_utils import dense_harness


@pytest.mark.parametrize(
    "conv_fn", [DenseGCNConv, DenseGraphConv, DenseGINConv, DenseSAGEConv])
def test_dense_convs(conv_fn):
    """Check a dense conv on a two-graph batch, then run the dense harness.

    The second graph has only two real nodes; the masked third node must
    produce an all-zero output row.
    """
    channels = 16
    if conv_fn is not DenseGINConv:
        conv = conv_fn(channels, channels)
    else:
        # DenseGINConv is constructed from an MLP rather than channel sizes.
        mlp = Seq(Lin(channels, channels), ReLU(), Lin(channels, channels))
        conv = conv_fn(mlp)

    real_nodes = torch.randn((5, channels))
    padding_node = real_nodes.new_zeros(1, channels)
    x = torch.cat([real_nodes, padding_node], dim=0).view(2, 3, channels)

    full_graph = [
        [0, 1, 1],
        [1, 0, 1],
        [1, 1, 0],
    ]
    two_node_graph = [
        [0, 1, 0],
        [1, 0, 0],
        [0, 0, 0],
    ]
    adj = torch.Tensor([full_graph, two_node_graph])
    mask = torch.tensor([[1, 1, 1], [1, 1, 0]], dtype=torch.bool)

    inputs = (x, adj, mask)
    native_out = conv(*inputs)
    assert native_out.size() == (2, 3, channels)
    assert native_out[1, 2].abs().sum().item() == 0

    dense_harness(conv, inputs)


@pytest.mark.parametrize(
    "conv_fn", [DenseGCNConv, DenseGraphConv, DenseGINConv, DenseSAGEConv])
def test_dense_convs_with_broadcasting(conv_fn):
    """Check a dense conv with an unbatched adjacency and mask, then run
    the dense harness."""
    batch_size, num_nodes, channels = 8, 3, 16
    expected_size = (batch_size, num_nodes, channels)

    if conv_fn is not DenseGINConv:
        conv = conv_fn(channels, channels)
    else:
        # DenseGINConv is constructed from an MLP rather than channel sizes.
        mlp = Seq(Lin(channels, channels), ReLU(), Lin(channels, channels))
        conv = conv_fn(mlp)

    x = torch.randn(batch_size, num_nodes, channels)
    adj = torch.Tensor([
        [0, 1, 1],
        [1, 0, 1],
        [1, 1, 0],
    ])

    assert conv(x, adj).size() == expected_size

    mask = torch.tensor([1, 1, 1], dtype=torch.bool)
    inputs = (x, adj, mask)
    assert conv(*inputs).size() == expected_size

    dense_harness(conv, inputs)


================================================
FILE: tests/gnn/nn/functional/test_bro.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch
from torch_geometric.nn.functional import bro
import poptorch


@pytest.mark.skip(reason="TODO(AFS-269)")
def test_bro():
    """Compare bro() run on the IPU with a host-side reference value
    (currently skipped)."""
    batch = torch.tensor([0, 0, 0, 0, 1, 1, 1, 2, 2])

    g1 = torch.tensor([
        [0.2, 0.2, 0.2, 0.2],
        [0.0, 0.2, 0.2, 0.2],
        [0.2, 0.0, 0.2, 0.2],
        [0.2, 0.2, 0.0, 0.2],
    ])

    g2 = torch.tensor([
        [0.2, 0.2, 0.2, 0.2],
        [0.0, 0.2, 0.2, 0.2],
        [0.2, 0.0, 0.2, 0.2],
    ])

    g3 = torch.tensor([
        [0.2, 0.2, 0.2, 0.2],
        [0.2, 0.0, 0.2, 0.2],
    ])

    class Model(torch.nn.Module):
        """Concatenates the three graphs and applies bro()."""

        def forward(self, g1, g2, g3, batch):
            stacked = torch.cat([g1, g2, g3], dim=0)
            return bro(stacked, batch)

    ipu_model = poptorch.inferenceModel(Model())
    ipu_out = ipu_model(g1, g2, g3, batch)

    # Host-side reference value, accumulated over a single scaled matrix.
    s = 0.
    scaled_graphs = [torch.cat([g1, g2, g3]) / 3]
    for g in scaled_graphs:
        gram = g @ g.t()
        s += torch.norm(gram - torch.eye(g.shape[0]), p=2)

    assert torch.isclose(s / 3., ipu_out)


================================================
FILE: tests/gnn/nn/functional/test_gini.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import torch
from torch_geometric.nn.functional import gini

import poptorch


def test_gini():
    """Check that gini() evaluated on the IPU returns 0.5 for a fixed
    two-row input."""

    class Model(torch.nn.Module):
        """Thin module wrapper around gini()."""

        def forward(self, w):
            return gini(w)

    weights = torch.tensor([[0., 0., 0., 0.], [0., 0., 0., 1000.0]])

    ipu_model = poptorch.inferenceModel(Model())
    ipu_out = ipu_model(weights)

    assert torch.isclose(ipu_out, torch.tensor(0.5))


================================================
FILE: tests/gnn/nn/kge/kge_utils.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from typing import List
import torch

from poptorch_geometric import TrainingStepper


def kge_harness(kge,
                dataloader,
                post_proc=None,
                loss_fn=torch.nn.MSELoss(),
                num_steps=4,
                atol=5e-3,
                rtol=5e-3,
                equal_nan=False,
                enable_fp_exception=True):
    """Wrap a KGE model and step it over batches drawn from a dataloader.

    Args:
        kge: Model under test; called with the unpacked batch.
        dataloader: Iterable of batches, or None to run no steps.
        post_proc: Optional callable applied to the model result.
        loss_fn: Loss comparing the result with an all-ones target.
        num_steps: Maximum number of batches to run.
        atol: Absolute tolerance passed to the stepper.
        rtol: Relative tolerance passed to the stepper.
        equal_nan: Passed through to the stepper.
        enable_fp_exception: Passed through to the stepper.
    """

    class KgeWrapper(torch.nn.Module):
        """Adds optional post-processing and a training loss to a model."""

        def __init__(self, kge, loss_fn, post_proc=None):
            super().__init__()
            self.model = kge
            self.loss_fn = loss_fn
            self.post_proc = post_proc

        @staticmethod
        def _joined(value):
            # List results are concatenated into a single tensor.
            return torch.cat(value) if isinstance(value, List) else value

        def forward(self, *args):
            out = self.model(*args)

            if self.post_proc is not None:
                out = self.post_proc(self._joined(out))

            if not self.training:
                return out

            out = self._joined(out)
            loss = self.loss_fn(out, torch.ones_like(out))
            return out, loss

    wrapped = KgeWrapper(kge, loss_fn=loss_fn, post_proc=post_proc)

    stepper = TrainingStepper(wrapped,
                              atol=atol,
                              rtol=rtol,
                              equal_nan=equal_nan,
                              enable_fp_exception=enable_fp_exception)

    if dataloader is None:
        return

    # One stepper run per batch, stopping once num_steps batches were run.
    for batch_idx, batch in enumerate(dataloader):
        if batch_idx == num_steps:
            break
        stepper.run(1, batch)


================================================
FILE: tests/gnn/nn/kge/test_complex.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import torch

from torch_geometric.nn import ComplEx
from kge_utils import kge_harness


def test_complex_scoring():
    """Run a tiny ComplEx model with hand-set embeddings through the KGE
    harness."""
    model = ComplEx(num_nodes=5, num_relations=2, hidden_channels=1)

    # Fixed embedding tables, assigned in order: real and imaginary node
    # embeddings, then real and imaginary relation embeddings.
    fixed_weights = (
        (model.node_emb, [[2.], [3.], [5.], [1.], [2.]]),
        (model.node_emb_im, [[4.], [1.], [3.], [1.], [2.]]),
        (model.rel_emb, [[2.], [3.]]),
        (model.rel_emb_im, [[3.], [1.]]),
    )
    for embedding, values in fixed_weights:
        embedding.weight.data = torch.tensor(values)

    heads = torch.tensor([1, 3])
    relations = torch.tensor([1, 0])
    tails = torch.tensor([2, 4])

    triple_loader = model.loader(heads, relations, tails, batch_size=5)
    kge_harness(model, triple_loader)


def test_complex():
    """Check the ComplEx repr and run the model through the KGE harness."""
    model = ComplEx(num_nodes=10, num_relations=5, hidden_channels=32)
    assert str(model) == 'ComplEx(10, num_relations=5, hidden_channels=32)'

    heads = torch.tensor([0, 2, 4, 6, 8])
    relations = torch.tensor([0, 1, 2, 3, 4])
    tails = torch.tensor([1, 3, 5, 7, 9])

    triple_loader = model.loader(heads, relations, tails, batch_size=5)
    kge_harness(model, triple_loader)


================================================
FILE: tests/gnn/nn/kge/test_distmult.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import torch

from torch_geometric.nn import DistMult
from kge_utils import kge_harness


def test_distmult():
    """Check the DistMult repr and run the model through the KGE harness."""
    model = DistMult(num_nodes=10, num_relations=5, hidden_channels=32)
    assert str(model) == 'DistMult(10, num_relations=5, hidden_channels=32)'

    heads = torch.tensor([0, 2, 4, 6, 8])
    relations = torch.tensor([0, 1, 2, 3, 4])
    tails = torch.tensor([1, 3, 5, 7, 9])

    triple_loader = model.loader(heads, relations, tails, batch_size=5)
    kge_harness(model, triple_loader)


================================================
FILE: tests/gnn/nn/kge/test_rotate.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import torch

from torch_geometric.nn import RotatE
from kge_utils import kge_harness


def test_rotate():
    """Check the RotatE repr and run the model through the KGE harness."""
    model = RotatE(num_nodes=10, num_relations=5, hidden_channels=32)
    assert str(model) == 'RotatE(10, num_relations=5, hidden_channels=32)'

    heads = torch.tensor([0, 2, 4, 6, 8])
    relations = torch.tensor([0, 1, 2, 3, 4])
    tails = torch.tensor([1, 3, 5, 7, 9])

    triple_loader = model.loader(heads, relations, tails, batch_size=5)
    kge_harness(model, triple_loader)


================================================
FILE: tests/gnn/nn/kge/test_transe.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import torch

from torch_geometric.nn import TransE
from kge_utils import kge_harness


def test_transe():
    """Check the TransE repr and run the model through the KGE harness."""
    model = TransE(num_nodes=10, num_relations=5, hidden_channels=32)
    assert str(model) == 'TransE(10, num_relations=5, hidden_channels=32)'

    heads = torch.tensor([0, 2, 4, 6, 8])
    relations = torch.tensor([0, 1, 2, 3, 4])
    tails = torch.tensor([1, 3, 5, 7, 9])

    triple_loader = model.loader(heads, relations, tails, batch_size=5)
    kge_harness(model, triple_loader)


================================================
FILE: tests/gnn/nn/nn_utils.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import copy
import inspect
import math
import torch

import helpers
from torch_geometric.data import Batch, Data
import poptorch

# Need to import poptorch_geometric to ensure that our arg parser implementation is
# registered with poptorch ahead of running these tests
import poptorch_geometric  # pylint: disable=unused-import


class ModelWW(helpers.ModelWithWeights):
    """Wraps an op with a trainable Linear layer and an MSE loss.

    The forward pass returns the op's own output together with a loss
    computed from a flattened, resized copy of that output.
    """

    def __init__(self, op, first_input_shape):
        """Store a private copy of the op and build the output layer.

        Args:
            op: Callable under test; it is deep-copied before being stored.
            first_input_shape: Shape object of the first input; must
                provide ``numel()``.
        """
        super().__init__(op, first_input_shape)
        # Deep copy so this wrapper does not share state with the caller's op.
        self.op = copy.deepcopy(op)
        self.loss_fn = torch.nn.MSELoss()
        self.first_input_shape = first_input_shape
        self.first_input_numel = first_input_shape.numel()
        # Square Linear layer sized to the flattened first input.
        self.out_fn = torch.nn.Linear(self.first_input_numel,
                                      self.first_input_numel)
        # Snapshot of the initial Linear weights.
        # NOTE(review): presumably read by the base class's weight-change
        # check — confirm against helpers.ModelWithWeights.
        self._weights_before = self.out_fn.weight.detach().clone()

    def forward(self, xs):
        """Run the op on the unpacked inputs and compute a loss.

        Args:
            xs: Sequence of positional arguments for the op.

        Returns:
            A pair ``(x, l)``: the raw op output and the loss (the integer
            0 on the first branch below, an MSE loss tensor otherwise).
        """
        # NOTE(review): this branch needs the return annotation of
        # op.forward to be an instance of tuple; an annotation such as
        # ``Tuple[...]`` or ``tuple`` is not one, so the branch looks rarely
        # taken — confirm the intent.
        if callable(getattr(self.op, "forward", None)) and isinstance(
                inspect.signature(self.op.forward).return_annotation, tuple):
            x = self.op.forward(*xs)
            l = 0
        else:
            x = self.op(*xs)
            # Pick the tensor the loss is computed from.
            if isinstance(x, (Batch, Data)):
                x1 = torch.flatten(x.x)
            elif isinstance(x, tuple):
                x1 = torch.flatten(x[0])
            else:
                x1 = torch.flatten(x)
            # Tile and truncate so the element count matches the Linear
            # layer's input size.
            if x1.shape.numel() != self.first_input_numel:
                ratio = math.ceil(self.first_input_numel / x1.shape.numel())
                x1 = x1.repeat(ratio)[:self.first_input_numel]
            if x1.dtype != torch.float:
                x1 = x1.float()
            x1 = x1 if self.out_fn is None else self.out_fn(x1)
            x1 = x1.reshape(self.first_input_shape)
            # The loss target is an all-ones tensor of the same shape.
            target = torch.ones_like(x1)
            l = self.loss_fn(x1, target)
        return x, l


def op_harness(op, inputs, assert_func=None, inference=False):
    """Run an op natively and through poptorch, optionally comparing them.

    Args:
        op: Callable under test; wrapped in a ModelWW.
        inputs: Sequence of op arguments; the first one fixes the shape
            used to build the wrapper.
        assert_func: Optional callable invoked as
            ``assert_func(native_out, poptorch_out)``.
        inference: Build an inference model instead of a training model.

    Returns:
        The op output produced by the poptorch model.
    """
    first_input = inputs[0]
    wraps_graph = isinstance(first_input, (Batch, Data))
    first_input_shape = first_input.x.shape if wraps_graph \
        else first_input.shape

    model = ModelWW(op, first_input_shape)
    packed_inputs = tuple(inputs)

    # Run on CPU.
    native_out, _ = model(packed_inputs)

    # Run on IPU.
    if not inference:
        # The LR should be large enough that a single training step will
        # definitely cause weights to change
        optim = torch.optim.AdamW(model.parameters(), lr=0.1)
        poptorch_model = poptorch.trainingModel(model, optimizer=optim)
    else:
        poptorch_model = poptorch.inferenceModel(model)

    poptorch_out, _ = poptorch_model(tuple(inputs))

    # Training test - check weights have changed
    poptorch_model.assert_weights_changed()

    if assert_func is not None:
        assert_func(native_out, poptorch_out)

    return poptorch_out


================================================
FILE: tests/gnn/nn/norm/norm_utils.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import torch

import helpers
from torch_geometric.data import Batch, Data

from gnn.nn.nn_utils import op_harness


def assert_(native_out, poptorch_out):
    """Recursively check that the poptorch output matches the native one.

    Tensors are compared with fixed tolerances (atol=rtol=1e-4, NaNs equal),
    lists and tuples element by element, and Batch/Data objects key by key.
    None values are accepted once the type check has passed.

    Args:
        native_out: Output produced by the natively executed model.
        poptorch_out: Output produced by the poptorch model.

    Raises:
        AssertionError: If the types, sequence lengths, keys or tensor
            values differ, or if an unsupported type is encountered.
    """

    def check_inner_field(x, y):
        assert isinstance(x, type(y)), \
            f"x type={type(x)} is different than y type={type(y)}"
        if isinstance(x, torch.Tensor):
            helpers.assert_allclose(actual=x,
                                    expected=y,
                                    atol=1e-04,
                                    rtol=1e-04,
                                    equal_nan=True)
        elif isinstance(x, (list, tuple)):
            # zip() stops at the shorter sequence, so without this check a
            # missing or extra element would go unnoticed.
            assert len(x) == len(y), \
                f"Sequences have different lengths: {len(x)} != {len(y)}"
            for t, ct in zip(x, y):
                check_inner_field(t, ct)
        elif isinstance(x, (Batch, Data)):
            assert x.keys == y.keys, "Objects have different keys."
            for k in x.keys:
                check_inner_field(x[k], y[k])
        elif x is not None:
            assert False, f"Unsupported types: x type={type(x)}, y type=" \
                f"{type(y)}"

    check_inner_field(native_out, poptorch_out)


def norm_harness(op, inputs, assert_func=None, inference=False):
    """Run a normalisation op through op_harness.

    When no assert_func is supplied, the module's assert_ comparison is
    used. Returns the poptorch output reported by op_harness.
    """
    comparator = assert_ if assert_func is None else assert_func
    return op_harness(op, inputs, comparator, inference)


================================================
FILE: tests/gnn/nn/norm/test_batch_norm.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch

from torch_geometric.nn import BatchNorm

from norm_utils import norm_harness


@pytest.mark.parametrize('conf', [True, False])
def test_batch_norm(conf):
    """Run BatchNorm through the norm harness; `conf` toggles both affine
    and track_running_stats."""
    features = torch.randn(100, 16)

    layer = BatchNorm(16, affine=conf, track_running_stats=conf)
    assert str(layer) == 'BatchNorm(16)'

    ipu_out = norm_harness(layer, [features])
    assert ipu_out.size() == (100, 16)


def test_batch_norm_single_element():
    """In inference mode, a single-row input must come back from BatchNorm
    unchanged (within allclose tolerance)."""
    single_row = torch.randn(1, 16)

    layer = BatchNorm(16, track_running_stats=True, allow_single_element=True)
    assert str(layer) == 'BatchNorm(16)'

    ipu_out = norm_harness(layer, [single_row], inference=True)
    assert torch.allclose(ipu_out, single_row)


================================================
FILE: tests/gnn/nn/norm/test_diff_group_norm.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import torch

from norm_utils import norm_harness

from torch_geometric.nn import DiffGroupNorm


def test_diff_group_norm():
    """Run DiffGroupNorm through the norm harness and check that the
    output size matches the input size."""
    features = torch.randn(6, 16)

    layer = DiffGroupNorm(16, groups=4, lamda=0.01)
    assert str(layer) == 'DiffGroupNorm(16, groups=4)'

    ipu_out = norm_harness(layer, [features])
    assert ipu_out.size() == features.size()


================================================
FILE: tests/gnn/nn/norm/test_graph_norm.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import torch

from norm_utils import norm_harness

from torch_geometric.nn import GraphNorm


def test_graph_norm():
    """Run GraphNorm through the norm harness, without and then with a
    batch vector (four graphs of 50 nodes each)."""
    torch.manual_seed(42)
    features = torch.randn(200, 16)
    graph_of_node = torch.arange(4).view(-1, 1).repeat(1, 50).view(-1)
    num_graphs = int(graph_of_node.max() + 1)

    layer = GraphNorm(16)

    norm_harness(layer, [features])
    norm_harness(layer, [features, graph_of_node, num_graphs])


================================================
FILE: tests/gnn/nn/norm/test_graph_size_norm.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import torch

from norm_utils import norm_harness

from torch_geometric.nn import GraphSizeNorm


def test_graph_size_norm():
    """Run GraphSizeNorm through the norm harness on ten graphs of ten
    nodes each."""
    features = torch.randn(100, 16)
    nodes_per_graph = torch.full((10, ), 10, dtype=torch.long)
    graph_of_node = torch.repeat_interleave(nodes_per_graph)
    num_graphs = int(graph_of_node.max()) + 1

    layer = GraphSizeNorm()
    assert str(layer) == 'GraphSizeNorm()'

    ipu_out = norm_harness(layer, [features, graph_of_node, num_graphs])
    assert ipu_out.size() == (100, 16)


================================================
FILE: tests/gnn/nn/norm/test_instance_norm.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch
from torch_geometric.nn import InstanceNorm

import helpers
from gnn.nn.nn_utils import ModelWW
import poptorch


@pytest.mark.parametrize('conf', [True, False])
def test_instance_norm(conf):
    """Compare InstanceNorm outputs between a CPU model and an IPU training
    model.

    Runs without a batch vector, with a single-graph batch vector, and
    with a two-graph batch vector.
    """
    if conf is True:
        # These values are based on torch_nn_test.py file
        # where InstanceNorm is tested from torch package.
        atol, rtol = 1e-3, 0.05
    else:
        atol, rtol = None, None

    nodes_list = torch.randn(5, 100, 16)

    def test_body(inputs):
        samples = inputs[0]
        # Arguments after the samples are shared by every model call.
        shared_args = inputs[1:]

        norm = InstanceNorm(16, affine=conf, track_running_stats=conf)

        sample_shape = samples[0].shape
        cpu_model = ModelWW(norm, sample_shape)
        ipu_model = poptorch.trainingModel(ModelWW(norm, sample_shape))

        for sample in samples:
            model_inputs = [sample] + shared_args
            cpu_out = cpu_model(model_inputs)
            ipu_out = ipu_model(model_inputs)
            helpers.assert_allclose(actual=ipu_out[0],
                                    expected=cpu_out[0],
                                    atol=atol,
                                    rtol=rtol)

    test_body([nodes_list])

    graph_of_node = torch.zeros(100, dtype=torch.long)
    test_body([nodes_list, graph_of_node, 1])

    # Move the first half of the nodes into a second graph.
    graph_of_node[:50] = torch.ones(50, dtype=torch.long)
    test_body([nodes_list, graph_of_node, 2])


================================================
FILE: tests/gnn/nn/norm/test_layer_norm.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch

from norm_utils import norm_harness

from torch_geometric.nn import LayerNorm


@pytest.mark.parametrize('affine', [True, False])
@pytest.mark.parametrize('mode', ['graph', 'node'])
def test_layer_norm(affine, mode):
    """Run LayerNorm through the norm harness without a batch vector, with
    one graph, and with two graphs."""
    features = torch.randn(100, 16)

    layer = LayerNorm(16, affine=affine, mode=mode)

    norm_harness(layer, [features])

    one_graph = torch.zeros(100, dtype=torch.int64)
    norm_harness(layer, [features, one_graph, 1])

    doubled_features = torch.cat([features, features], dim=0)
    two_graphs = torch.cat([one_graph, one_graph + 1], dim=0)
    norm_harness(layer, [doubled_features, two_graphs, 2])


================================================
FILE: tests/gnn/nn/norm/test_mean_subtraction_norm.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import torch

from norm_utils import norm_harness

from torch_geometric.nn import MeanSubtractionNorm


def test_mean_subtraction_norm_no_batch():
    """Without a batch vector, the overall mean of the normalised output
    must be approximately zero."""
    features = torch.randn(6, 16)

    layer = MeanSubtractionNorm()
    assert str(layer) == 'MeanSubtractionNorm()'

    ipu_out = norm_harness(layer, [features])
    assert ipu_out.size() == (6, 16)
    zero = torch.tensor(0.)
    assert torch.allclose(ipu_out.mean(), zero, atol=1e-04)


def test_mean_subtraction_norm():
    """With a batch vector, each graph's normalised rows must have a mean
    of approximately zero.

    The previous version asserted on ``out[0:2]`` twice, so only the first
    graph was ever checked; every graph in the batch vector is now covered.
    """
    x = torch.randn(6, 16)
    batch = torch.tensor([0, 0, 1, 1, 1, 2])

    norm = MeanSubtractionNorm()
    assert str(norm) == 'MeanSubtractionNorm()'

    out = norm_harness(norm, [x, batch, 3])
    assert out.size() == (6, 16)
    # Row ranges of graphs 0, 1 and 2 according to `batch` above.
    for start, stop in ((0, 2), (2, 5), (5, 6)):
        assert torch.allclose(out[start:stop].mean(),
                              torch.tensor(0.),
                              atol=1e-04)


================================================
FILE: tests/gnn/nn/norm/test_msg_norm.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import torch

from norm_utils import norm_harness

from torch_geometric.nn import MessageNorm


def test_message_norm():
    norm = MessageNorm(learn_scale=True)
    assert str(norm) == 'MessageNorm(learn_scale=True)'
    x = torch.randn(100, 16)
    msg = torch.randn(100, 16)

    out = norm_harness(norm, [x, msg])
    assert out.size() == (100, 16)

    norm = MessageNorm(learn_scale=False)
    assert str(norm) == 'MessageNorm(learn_scale=False)'
    out = norm_harness(norm, [x, msg])
    assert out.size() == (100, 16)


================================================
FILE: tests/gnn/nn/norm/test_pair_norm.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch

from norm_utils import norm_harness

from torch_geometric.nn import PairNorm


@pytest.mark.parametrize('scale_individually', [False, True])
def test_pair_norm_no_batch(scale_individually):
    """Check PairNorm's repr and output shape without a batch vector."""
    shape = (100, 16)
    features = torch.randn(*shape)

    layer = PairNorm(scale_individually=scale_individually)
    assert str(layer) == 'PairNorm()'

    result = norm_harness(layer, [features])
    assert result.size() == shape


@pytest.mark.parametrize('scale_individually', [False, True])
def test_pair_norm(scale_individually):
    """Check that PairNorm treats the graphs of a batch independently.

    The same node features are stacked twice as two separate graphs; both
    halves of the batched output must match the single-graph output.
    """
    num_nodes = 100
    x = torch.randn(num_nodes, 16)
    batch = torch.zeros(num_nodes, dtype=torch.long)

    norm = PairNorm(scale_individually=scale_individually)
    assert str(norm) == 'PairNorm()'

    single = norm_harness(norm, [x])

    batch_size = 2
    stacked_x = torch.cat([x, x], dim=0)
    stacked_batch = torch.cat([batch, batch + 1], dim=0)
    stacked = norm_harness(norm, [stacked_x, stacked_batch, batch_size])

    for half in (stacked[:num_nodes], stacked[num_nodes:]):
        assert torch.allclose(single, half, atol=1e-04)


================================================
FILE: tests/gnn/nn/pool/pool_utils.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import dataclasses
import torch

import helpers
from torch_geometric.data import Batch, Data

from gnn.nn.nn_utils import op_harness


def assert_(native_out, poptorch_out):
    """Recursively compare a native output with its PopTorch counterpart.

    Supported values are tensors, lists/tuples, PyG ``Batch``/``Data``
    objects, dataclass instances (only their tensor attributes are
    compared) and ``None``. Any other type triggers an assertion failure.

    Args:
        native_out: Output produced by the native run.
        poptorch_out: Output produced by the PopTorch run.

    Raises:
        AssertionError: If the two outputs differ in type, length, keys or
            tensor values, or if an unsupported type is encountered.
    """
    def check_inner_field(x, y):
        assert isinstance(x, type(y)), \
            f"x type={type(x)} is different than y type={type(y)}"
        if isinstance(x, torch.Tensor):
            helpers.assert_allclose(actual=x,
                                    expected=y,
                                    atol=1e-04,
                                    rtol=1e-04,
                                    equal_nan=True)
        elif isinstance(x, (list, tuple)):
            # zip() alone silently drops the trailing elements of the longer
            # sequence, so a length mismatch must be detected explicitly.
            assert len(x) == len(y), \
                f"Sequences have different lengths: {len(x)} != {len(y)}"
            for t, ct in zip(x, y):
                check_inner_field(t, ct)
        elif isinstance(x, (Batch, Data)):
            assert x.keys == y.keys, "Objects have different keys."
            for k in x.keys:
                check_inner_field(x[k], y[k])
        elif dataclasses.is_dataclass(x):
            # Only tensor-valued attributes of a dataclass are compared.
            for att in dir(x):
                x_field = getattr(x, att, None)
                if not callable(x_field) and isinstance(x_field, torch.Tensor):
                    check_inner_field(x_field, getattr(y, att, None))
        elif x is not None:
            assert False, f"Unsupported types: x type={type(x)}, y type=" \
                f"{type(y)}"

    check_inner_field(native_out, poptorch_out)


def pool_harness(op, inputs, assert_func=None):
    """Forward a pooling op to ``op_harness`` with a default comparator.

    Args:
        op: Operation handed to ``op_harness``.
        inputs: Inputs handed to ``op_harness``.
        assert_func: Comparison callback; ``assert_`` is used when ``None``.

    Returns:
        Whatever ``op_harness`` returns.
    """
    comparator = assert_ if assert_func is None else assert_func
    return op_harness(op, inputs, comparator)


================================================
FILE: tests/gnn/nn/pool/test_asap.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch

from torch_geometric.nn import ASAPooling, GCNConv, GraphConv

from pool_utils import pool_harness


@pytest.mark.skip(reason="TODO(AFS-229, AFS-230, AFS-232, AFS-262)")
def test_asap():
    """Check ASAPooling's repr and output sizes for several configurations.

    Each GNN type is exercised with a fractional ratio (with and without
    self loops) and with an integer ratio.
    """
    in_channels = 16
    edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3],
                               [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2]])
    num_nodes = edge_index.max().item() + 1
    x = torch.randn((num_nodes, in_channels))

    # (ratio, add_self_loops, expected repr, expected x size,
    #  expected edge_index size)
    cases = [
        (0.5, False, 'ASAPooling(16, ratio=0.5)',
         (num_nodes // 2, in_channels), (2, 2)),
        (0.5, True, 'ASAPooling(16, ratio=0.5)',
         (num_nodes // 2, in_channels), (2, 4)),
        (2, False, 'ASAPooling(16, ratio=2)', (2, in_channels), (2, 2)),
    ]

    for GNN in [GraphConv, GCNConv]:
        for ratio, self_loops, expected_repr, x_size, edge_size in cases:
            pool = ASAPooling(in_channels,
                              ratio=ratio,
                              GNN=GNN,
                              add_self_loops=self_loops)
            assert repr(pool) == expected_repr
            out = pool_harness(pool, [x, edge_index])
            assert out[0].size() == x_size
            assert out[1].size() == edge_size


================================================
FILE: tests/gnn/nn/pool/test_avg_pool.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch

from torch_geometric.data import Batch, Data
from torch_geometric.nn import avg_pool, avg_pool_neighbor_x, avg_pool_x

from pool_utils import pool_harness


def test_avg_pool_x():
    """Average node features per cluster with avg_pool_x and a fixed size."""
    x = torch.Tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
    cluster = torch.tensor([0, 1, 0, 1, 2, 2])
    batch = torch.tensor([0, 0, 0, 0, 1, 1])
    batch_size = int(batch.max().item()) + 1

    pooled, _ = pool_harness(avg_pool_x, [cluster, x, batch, batch_size, 2])

    expected = [[3, 4], [5, 6], [10, 11], [0, 0]]
    assert pooled.tolist() == expected


@pytest.mark.skip(
    reason="avg_pool uses torch.unique instruction which produces "
    "tensor with dynamic shape. This is not supported for Mk2.")
def test_avg_pool():
    """Pool a two-graph Batch with avg_pool and check every pooled field."""
    cluster = torch.tensor([0, 1, 0, 1, 2, 2])
    batch = torch.tensor([0, 0, 0, 0, 1, 1])
    edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5],
                               [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4]])
    graph = Batch(
        x=torch.Tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]),
        pos=torch.Tensor([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]),
        edge_index=edge_index,
        edge_attr=torch.Tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
        batch=batch)

    pooled = pool_harness(avg_pool, [cluster, graph, lambda x: x])

    # Expected value of each attribute of the pooled graph.
    expected = {
        'x': [[3, 4], [5, 6], [10, 11]],
        'pos': [[1, 1], [2, 2], [4.5, 4.5]],
        'edge_index': [[0, 1], [1, 0]],
        'edge_attr': [4, 4],
        'batch': [0, 0, 1],
    }
    for name, value in expected.items():
        assert getattr(pooled, name).tolist() == value


@pytest.mark.parametrize('input_type', [Data, Batch])
def test_avg_pool_neighbor_x(input_type):
    """Average each node's features over its neighbourhood.

    The Batch variant is skipped for now (see the TODO in the skip
    message). Edge indices must be returned unchanged.
    """
    if input_type is Batch:
        pytest.skip("TODO(AFS-231, AFS-229, AFS-230)")

    edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5],
                               [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4]])
    graph = input_type(
        x=torch.Tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]),
        edge_index=edge_index,
        batch=torch.tensor([0, 0, 0, 0, 1, 1]))

    pooled = pool_harness(avg_pool_neighbor_x, [graph])

    expected_x = [[4, 5]] * 4 + [[10, 11]] * 2
    assert pooled.x.tolist() == expected_x
    assert pooled.edge_index.tolist() == edge_index.tolist()


================================================
FILE: tests/gnn/nn/pool/test_consecutive.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch

from torch_geometric.nn.pool.consecutive import consecutive_cluster

from pool_utils import pool_harness


@pytest.mark.skip(
    reason="consecutive_cluster uses torch.unique instruction which produces "
    "tensor with dynamic shape. This is not supported for Mk2.")
def test_consecutive_cluster():
    """Relabel arbitrary cluster ids and check the returned permutation."""
    cluster_ids = torch.tensor([8, 2, 10, 15, 100, 1, 100])

    relabelled, perm = pool_harness(consecutive_cluster, [cluster_ids])

    assert relabelled.tolist() == [2, 1, 3, 4, 5, 0, 5]
    assert perm.tolist() == [5, 1, 0, 2, 3, 6]


================================================
FILE: tests/gnn/nn/pool/test_decimation.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch

from torch_geometric.nn.pool.decimation import decimation_indices

from pool_utils import pool_harness


@pytest.mark.skip(reason="Algorithm uses tensors with dynamic shapes "
                  "and reads tensor values during runtime")
def test_decimation_basic():
    """Decimate two point clouds (4 and 6 points) by a factor of 2.

    The native/PopTorch comparison is replaced by a callback that always
    returns True; only the sizes and the new pointer are checked.
    """
    N_1, N_2 = 4, 6
    decimation_factor = 2
    ptr = torch.tensor([0, N_1, N_1 + N_2])

    idx_decim, ptr_decim = pool_harness(decimation_indices,
                                        [ptr, decimation_factor],
                                        assert_func=lambda x, y: True)

    kept_first = N_1 // decimation_factor
    kept_total = kept_first + (N_2 // decimation_factor)
    assert idx_decim.size(0) == kept_total

    expected_ptr = torch.tensor([0, kept_first, kept_total])
    assert torch.equal(ptr_decim, expected_ptr)


@pytest.mark.skip(reason="Algorithm uses tensors with dynamic shapes "
                  "and reads tensor values during runtime")
def test_decimation_single_cloud():
    """Decimate a single 4-point cloud by a factor of 2."""
    N_1 = 4
    decimation_factor = 2
    ptr = torch.tensor([0, N_1])

    idx_decim, ptr_decim = pool_harness(decimation_indices,
                                        [ptr, decimation_factor])

    kept = N_1 // decimation_factor
    assert idx_decim.size(0) == kept

    expected_ptr = torch.tensor([0, kept])
    assert torch.equal(ptr_decim, expected_ptr)


@pytest.mark.skip(reason="Algorithm uses tensors with dynamic shapes "
                  "and reads tensor values during runtime")
def test_decimation_almost_empty():
    """Decimate with a factor larger than the cloud: one point must remain."""
    N_1 = 4
    decimation_factor = 666  # greater than N_1
    ptr = torch.tensor([0, N_1])

    idx_decim, ptr_decim = pool_harness(decimation_indices,
                                        [ptr, decimation_factor])

    assert idx_decim.size(0) == 1

    expected_ptr = torch.tensor([0, 1])
    assert torch.equal(ptr_decim, expected_ptr)


================================================
FILE: tests/gnn/nn/pool/test_edge_pool.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch

from torch_geometric.nn import EdgePooling
from torch_geometric.utils import scatter

from pool_utils import pool_harness


def test_compute_edge_score_softmax():
    """Softmax edge scores lie in [0, 1] and sum to one per target node."""
    edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5],
                               [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4]])
    num_edges = edge_index.size(1)
    raw = torch.randn(num_edges)

    scores = pool_harness(EdgePooling.compute_edge_score_softmax,
                          [raw, edge_index, 6])
    assert torch.all(scores >= 0)
    assert torch.all(scores <= 1)

    # Test whether all incoming edge scores sum up to one.
    incoming_sum = scatter(scores, edge_index[1], reduce='sum')
    assert torch.allclose(incoming_sum, torch.Tensor([1, 1, 1, 1, 1, 1]))


def test_compute_edge_score_tanh():
    """Tanh edge scores lie in [-1, 1] and keep the ordering of the input."""
    edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5],
                               [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4]])
    num_edges = edge_index.size(1)
    raw = torch.randn(num_edges)

    scores = pool_harness(EdgePooling.compute_edge_score_tanh,
                          [raw, edge_index, 6])
    assert torch.all(scores >= -1)
    assert torch.all(scores <= 1)

    same_order = torch.argsort(raw) == torch.argsort(scores)
    assert torch.all(same_order)


def test_compute_edge_score_sigmoid():
    """Sigmoid edge scores lie in [0, 1] and keep the ordering of the input."""
    edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5],
                               [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4]])
    num_edges = edge_index.size(1)
    raw = torch.randn(num_edges)

    scores = pool_harness(EdgePooling.compute_edge_score_sigmoid,
                          [raw, edge_index, 6])
    assert torch.all(scores >= 0)
    assert torch.all(scores <= 1)

    same_order = torch.argsort(raw) == torch.argsort(scores)
    assert torch.all(same_order)


@pytest.mark.skip(
    reason="Currently not possible to run on Mk2 due to the algorithm "
    "used in the __merge_edges__ function")
def test_edge_pooling():
    """Pool two small batches with EdgePooling and check the merged graphs.

    The linear layer's weight and bias are overwritten with constants so
    that the expected result is known in advance.
    """
    pooling = EdgePooling(in_channels=1)
    assert str(pooling) == 'EdgePooling(1)'

    # Setting parameters fixed so we can test the expected outcome:
    pooling.lin.weight.data.fill_(1.)
    pooling.lin.bias.data.fill_(0.)

    # Test pooling:
    x = torch.Tensor([[0], [1], [2], [3], [4], [5], [-1]])
    edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6],
                               [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4, 0]])
    batch = torch.tensor([0, 0, 0, 0, 1, 1, 0])
    pooled_x, pooled_edges, pooled_batch, _ = pool_harness(
        pooling, [x, edge_index, batch])

    assert pooled_x.size(0) == pooled_batch.size(0) == 4
    assert pooled_edges.tolist() == [[0, 1, 1, 2, 2, 3], [0, 1, 2, 1, 2, 2]]
    assert pooled_batch.tolist() == [1, 0, 0, 0]

    # Test edge cases.
    x = torch.Tensor([[0], [1], [2], [3], [4], [5]])
    edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5],
                               [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4]])
    batch = torch.tensor([0, 0, 0, 0, 1, 1])
    pooled_x, pooled_edges, pooled_batch, _ = pool_harness(
        pooling, [x, edge_index, batch])

    assert pooled_x.size(0) == pooled_batch.size(0) == 3
    assert pooled_batch.tolist() == [1, 0, 0]
    assert pooled_edges.tolist() == [[0, 1, 1, 2, 2], [0, 1, 2, 1, 2]]


def test_edge_unpooling():
    """Unpool an EdgePooling result and compare it with the original graph.

    Pooling is executed natively; only ``unpool`` goes through the harness.
    """
    x = torch.Tensor([[0], [1], [2], [3], [4], [5], [-1]])
    edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6],
                               [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4, 0]])
    batch = torch.tensor([0, 0, 0, 0, 1, 1, 0])

    pooling = EdgePooling(in_channels=1)
    assert str(pooling) == 'EdgePooling(1)'

    # Setting parameters fixed so we can test the expected outcome:
    pooling.lin.weight.data.fill_(1.)
    pooling.lin.bias.data.fill_(0.)

    # Test pooling:
    pooled_x, _, _, unpool_info = pooling(x, edge_index, batch)

    restored = pool_harness(pooling.unpool, [pooled_x, unpool_info])
    restored_x, restored_edges, restored_batch = (restored[0], restored[1],
                                                  restored[2])
    assert restored_x.size() == x.size()
    assert restored_x.tolist() == [[1], [1], [5], [5], [9], [9], [-1]]
    assert torch.equal(restored_edges, edge_index)
    assert torch.equal(restored_batch, batch)


================================================
FILE: tests/gnn/nn/pool/test_fps.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from copy import deepcopy
import numpy as np
import torch
import torch_cluster
from torch_geometric.nn import Linear
import pytest
import poptorch


class FpsInferModel(torch.nn.Module):
    """Module whose forward pass simply calls ``poptorch.fps``."""
    def forward(self, x, ptr, ratio, random_start):
        sampled = poptorch.fps(x, ptr, ratio, random_start)
        return sampled


class FpsTrainModel(torch.nn.Module):
    """Feed the result of an fps-like op through a linear layer and a loss.

    Args:
        op: Callable invoked as ``op(x, ptr, ratio, random_start)``.
        linear: Layer applied to the op's result after casting it to float.
    """
    def __init__(self, op, linear):
        super().__init__()
        self.loss_fn = torch.nn.MSELoss()
        self.linear = linear
        self.op = op

    def forward(self, x, ptr, ratio, random_start):
        """Return the linear layer's output and its MSE loss against ones."""
        indices = self.op(x, ptr, ratio, random_start)
        projected = self.linear(indices.float())
        loss = self.loss_fn(projected, torch.ones_like(projected))
        return projected, loss


@pytest.mark.parametrize('src_shape', [(1, 2), (2, 19), (3, 10), (19, 3)])
@pytest.mark.parametrize('ratio', [0.3, 0.5, 1.0])
def test_single_batch(src_shape, ratio):
    """Compare poptorch.fps with torch_cluster.fps on a single graph."""
    num_points = src_shape[0]
    src = torch.rand(src_shape)
    ptr = [0, num_points]
    batch = torch.zeros(num_points, dtype=torch.long)

    inference_model = poptorch.inferenceModel(FpsInferModel())
    ipu_res = inference_model(src, ptr, ratio, random_start=False)
    ref_res = torch_cluster.fps(src, batch, ratio, random_start=False)

    matches = ipu_res == ref_res
    assert all(matches)


@pytest.mark.parametrize('src_shape', [(19, 3)])
@pytest.mark.parametrize(
    'ptr', [[0, 13, 19], [0, 2, 3, 4, 9, 11, 19], [0, 1, 3, 4, 9, 18, 19]])
@pytest.mark.parametrize('ratio', [0.4, 0.6, 1.0])
def test_multi_batch(src_shape, ptr, ratio):
    """Compare poptorch.fps with torch_cluster.fps on several graphs."""
    src = torch.rand(src_shape)

    # Turn the pointer list into the per-node graph index that the
    # torch_cluster reference call takes.
    batch = torch.zeros(src_shape[0], dtype=torch.long)
    for graph_id, (begin, end) in enumerate(zip(ptr[:-1], ptr[1:])):
        batch[begin:end] = graph_id

    inference_model = poptorch.inferenceModel(FpsInferModel())
    ipu_res = inference_model(src, ptr, ratio, random_start=False)
    ref_res = torch_cluster.fps(src, batch, ratio, random_start=False)

    matches = ipu_res == ref_res
    assert all(matches)


@pytest.mark.parametrize('src_shape', [(29, 3)])
@pytest.mark.parametrize('ptr', [[0, 29], [0, 2, 6, 11, 28, 29]])
@pytest.mark.parametrize('ratio', [1.0])
def test_random_start(src_shape, ptr, ratio):
    """Compare fps results with random start as per-graph index sets."""
    src = torch.rand(src_shape)

    segments = list(zip(ptr[:-1], ptr[1:]))
    batch = torch.zeros(src_shape[0], dtype=torch.long)
    for graph_id, (begin, end) in enumerate(segments):
        batch[begin:end] = graph_id

    inference_model = poptorch.inferenceModel(FpsInferModel())
    ipu_res = inference_model(src, ptr, ratio, random_start=True)
    ref_res = torch_cluster.fps(src, batch, ratio, random_start=True)

    # Slices are compared as sets, so the order within a slice is ignored.
    for begin, end in segments:
        ipu_selected = set(ipu_res[begin:end].tolist())
        ref_selected = set(ref_res[begin:end].tolist())
        assert ipu_selected == ref_selected


@pytest.mark.parametrize('src_shape', [(29, 3)])
@pytest.mark.parametrize('ptr', [[0, 29], [0, 2, 6, 11, 28, 29]])
@pytest.mark.parametrize('ratio', [0.15, 0.7, 1.0])
def test_train(src_shape, ptr, ratio):
    """Compare a training step using poptorch.fps with torch_cluster.fps.

    Both models use identical copies of the same linear layer; the layer
    output and the loss must agree within tight tolerances.
    """
    src = torch.rand(src_shape)
    batch = torch.zeros(src_shape[0], dtype=torch.long)
    for graph_id, (begin, end) in enumerate(zip(ptr[:-1], ptr[1:])):
        batch[begin:end] = graph_id

    # The linear layer is sized with the sum over graphs of
    # ceil(nodes_in_graph * ratio).
    nodes_per_graph = np.subtract(ptr[1:], ptr[0:-1])
    sampled_per_graph = np.ceil(nodes_per_graph * ratio).astype(int)
    out_size = np.cumsum(sampled_per_graph, 0)[-1]

    linear_ipu = Linear(out_size, out_size)
    linear_ref = deepcopy(linear_ipu)

    ipu_model = poptorch.trainingModel(FpsTrainModel(poptorch.fps, linear_ipu))
    ipu_res, ipu_loss = ipu_model(src, ptr, ratio, random_start=False)

    ref_model = FpsTrainModel(torch_cluster.fps, linear_ref)
    ref_res, ref_loss = ref_model(src, batch, ratio, random_start=False)

    tolerances = {'rtol': 1e-05, 'atol': 1e-06}
    for ipu_value, ref_value in ((ipu_res, ref_res), (ipu_loss, ref_loss)):
        assert np.allclose(ipu_value.tolist(), ref_value.tolist(),
                           **tolerances)


================================================
FILE: tests/gnn/nn/pool/test_glob.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch

from torch_geometric.nn import (
    global_add_pool,
    global_max_pool,
    global_mean_pool,
)

from pool_utils import pool_harness


def test_global_pool():
    """Check global add/mean/max pooling against plain tensor reductions.

    Add and mean pooling are also run without a batch vector, in which
    case all nodes are reduced into a single row.
    """
    N_1, N_2 = 4, 6
    x = torch.randn(N_1 + N_2, 4)
    batch = torch.tensor([0] * N_1 + [1] * N_2)
    first, second = x[:N_1], x[N_1:]

    summed = pool_harness(global_add_pool, [x, batch, 2])
    assert summed.size() == (2, 4)
    torch.testing.assert_close(summed[0], first.sum(dim=0))
    torch.testing.assert_close(summed[1], second.sum(dim=0))

    summed_all = pool_harness(global_add_pool, [x, None])
    assert summed_all.size() == (1, 4)
    torch.testing.assert_close(summed_all, x.sum(dim=0, keepdim=True))

    averaged = pool_harness(global_mean_pool, [x, batch, 2])
    assert averaged.size() == (2, 4)
    torch.testing.assert_close(averaged[0], first.mean(dim=0))
    torch.testing.assert_close(averaged[1], second.mean(dim=0))

    averaged_all = pool_harness(global_mean_pool, [x, None])
    assert averaged_all.size() == (1, 4)
    torch.testing.assert_close(averaged_all, x.mean(dim=0, keepdim=True))

    maxed = pool_harness(global_max_pool, [x, batch, 2])
    assert maxed.size() == (2, 4)
    torch.testing.assert_close(maxed[0], first.max(dim=0)[0])
    torch.testing.assert_close(maxed[1], second.max(dim=0)[0])


@pytest.mark.skip(reason="TODO(AFS-140)")
def test_global_max_pool_no_batch():
    """Global max pooling without a batch vector reduces to a single row."""
    N_1, N_2 = 4, 6
    x = torch.randn(N_1 + N_2, 4)

    maxed = pool_harness(global_max_pool, [x, None])
    assert maxed.size() == (1, 4)

    expected = x.max(dim=0, keepdim=True)[0]
    torch.testing.assert_close(maxed, expected)


def test_permuted_global_pool():
    """Check global pooling when the nodes of the two graphs are shuffled."""
    N_1, N_2 = 4, 6
    x = torch.randn(N_1 + N_2, 4)
    batch = torch.cat([torch.zeros(N_1), torch.ones(N_2)]).to(torch.long)
    perm = torch.randperm(N_1 + N_2)

    px = x[perm]
    pbatch = batch[perm]
    px1 = px[pbatch == 0]
    px2 = px[pbatch == 1]

    # Each pooling function paired with the equivalent tensor reduction.
    cases = (
        (global_add_pool, lambda t: t.sum(dim=0)),
        (global_mean_pool, lambda t: t.mean(dim=0)),
        (global_max_pool, lambda t: t.max(dim=0)[0]),
    )
    for pool_fn, reduce_fn in cases:
        out = pool_harness(pool_fn, [px, pbatch, 2])
        assert out.size() == (2, 4)
        assert torch.allclose(out[0], reduce_fn(px1))
        assert torch.allclose(out[1], reduce_fn(px2))


def test_dense_global_pool():
    """For a 3-D input without batch vector, add pooling sums over dim 1."""
    dense = torch.randn(3, 16, 32)
    pooled = pool_harness(global_add_pool, [dense, None])

    expected = dense.sum(dim=1)
    assert torch.allclose(pooled, expected)


================================================
FILE: tests/gnn/nn/pool/test_graclus.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch

from torch_geometric.nn import graclus
from torch_geometric.testing import withPackage

from pool_utils import pool_harness


@pytest.mark.skip(reason="TODO(AFS-245)")
@withPackage('torch_cluster')
def test_graclus():
    """Two mutually connected nodes must end up in the same cluster."""
    weight = torch.tensor([1., 1.])
    edge_index = torch.tensor([[0, 1], [1, 0]])

    clusters = pool_harness(graclus, [edge_index, weight, 2])
    assert clusters.tolist() == [0, 0]


================================================
FILE: tests/gnn/nn/pool/test_max_pool.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch

from torch_geometric.data import Batch, Data
from torch_geometric.nn import max_pool, max_pool_neighbor_x, max_pool_x

from pool_utils import pool_harness


def test_max_pool_x():
    """Max-pool node features per cluster with max_pool_x and a fixed size."""
    x = torch.Tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
    cluster = torch.tensor([0, 1, 0, 1, 2, 2])
    batch = torch.tensor([0, 0, 0, 0, 1, 1])
    batch_size = int(batch.max().item()) + 1

    pooled, _ = pool_harness(max_pool_x, [cluster, x, batch, batch_size, 2])

    expected = [[5, 6], [7, 8], [11, 12], [0, 0]]
    assert pooled.tolist() == expected


@pytest.mark.skip(
    reason="max_pool uses torch.unique instruction which produces "
    "tensor with dynamic shape. This is not supported for Mk2.")
def test_max_pool():
    """Pool a two-graph Batch with max_pool and check every pooled field."""
    cluster = torch.tensor([0, 1, 0, 1, 2, 2])
    batch = torch.tensor([0, 0, 0, 0, 1, 1])
    edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5],
                               [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4]])
    graph = Batch(
        x=torch.Tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]),
        pos=torch.Tensor([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]),
        edge_index=edge_index,
        edge_attr=torch.Tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
        batch=batch)

    pooled = pool_harness(max_pool, [cluster, graph, lambda x: x])

    # Expected value of each attribute of the pooled graph.
    expected = {
        'x': [[5, 6], [7, 8], [11, 12]],
        'pos': [[1, 1], [2, 2], [4.5, 4.5]],
        'edge_index': [[0, 1], [1, 0]],
        'edge_attr': [4, 4],
        'batch': [0, 0, 1],
    }
    for name, value in expected.items():
        assert getattr(pooled, name).tolist() == value


@pytest.mark.parametrize('input_type', [Data, Batch])
def test_max_pool_neighbor_x(input_type):
    """Max-pool each node's features over its neighbourhood.

    The Batch variant is skipped for now (see the TODO in the skip
    message). Edge indices must be returned unchanged.
    """
    if input_type is Batch:
        pytest.skip("TODO(AFS-231, AFS-229, AFS-230)")

    edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5],
                               [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4]])
    graph = input_type(
        x=torch.Tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]),
        edge_index=edge_index,
        batch=torch.tensor([0, 0, 0, 0, 1, 1]))

    pooled = pool_harness(max_pool_neighbor_x, [graph])

    expected_x = [[7, 8]] * 4 + [[11, 12]] * 2
    assert pooled.x.tolist() == expected_x
    assert pooled.edge_index.tolist() == edge_index.tolist()


================================================
FILE: tests/gnn/nn/pool/test_mem_pool.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import torch

from torch_geometric.nn import MemPooling
from torch_geometric.utils import to_dense_batch

from pool_utils import pool_harness

import helpers
import poptorch


def test_mem_pool_basic():
    """Run MemPooling on a 5-graph batch and check the outputs.

    The pooled output shape is verified, and the second output ``S`` must
    be zero at padded positions and sum (after rounding) to the number of
    nodes at valid positions.
    """
    torch.manual_seed(42)

    pool = MemPooling(4, 8, heads=3, num_clusters=2)
    assert repr(pool) == 'MemPooling(4, 8, heads=3, num_clusters=2)'

    x = torch.randn(17, 4)
    batch = torch.tensor([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4])
    _, valid = to_dense_batch(x, batch)

    batch_size = int(batch.max() + 1)
    pooled, S = pool_harness(pool, [x, batch, None, 4, batch_size])
    assert pooled.size() == (5, 2, 8)

    padded_total = S[~valid].sum()
    assert padded_total == 0
    valid_total = S[valid].sum().item()
    assert round(valid_total) == x.size(0)


def test_mem_pool_basic_custom_loss():
    """Train MemPooling with its KL loss wrapped in an identity loss.

    The pooled output of the PopTorch training model is compared with the
    output of the same model run natively.
    """
    torch.manual_seed(42)

    x = torch.randn(17, 4)
    batch = torch.tensor([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4])

    class MemPoolWrapper(torch.nn.Module):
        """Return the pooled output together with the summed KL loss."""
        def __init__(self, *args, **kwargs) -> None:
            super().__init__(*args, **kwargs)
            self.op = MemPooling(4, 8, heads=3, num_clusters=2)

        def forward(self, *args, **kwargs):
            pooled, S = self.op.forward(*args, **kwargs)
            kl = MemPooling.kl_loss(S)
            return pooled, poptorch.identity_loss(kl, "sum")

    model = MemPoolWrapper()
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
    poptorch_model = poptorch.trainingModel(model, optimizer=optimizer)

    batch_size = int(batch.max() + 1)
    inputs = (x, batch, None, 4, batch_size)
    expected_out, _ = model(*inputs)
    actual_out, loss = poptorch_model(*inputs)

    assert float(loss) > 0
    assert actual_out.size() == (5, 2, 8)
    helpers.assert_allclose(actual=actual_out, expected=expected_out)


def test_mem_pool_chain():
    """Chain two MemPooling layers; only the second runs via the harness."""
    torch.manual_seed(42)

    first = MemPooling(4, 8, heads=3, num_clusters=2)
    assert repr(first) == 'MemPooling(4, 8, heads=3, num_clusters=2)'
    second = MemPooling(8, 4, heads=2, num_clusters=1)
    assert repr(second) == 'MemPooling(8, 4, heads=2, num_clusters=1)'

    x = torch.randn(17, 4)
    batch = torch.tensor([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4])

    # The first layer is evaluated natively to produce the dense input of
    # the second one.
    intermediate, _ = first(x, batch)
    assert intermediate.size() == (5, 2, 8)

    final, _ = pool_harness(second, [intermediate])
    assert final.size() == (5, 1, 4)


================================================
FILE: tests/gnn/nn/pool/test_pan_pool.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch

from torch_geometric.nn import PANConv, PANPooling

from pool_utils import pool_harness


@pytest.mark.skip(reason="The class is using filter_adj which produces "
                  "tensors with dynamic shapes. It is not supported "
                  "on Mk2.")
def test_pan_pooling():
    """Apply PANPooling to the output of a PANConv and check all sizes."""
    edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3],
                               [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2]])
    num_nodes = edge_index.max().item() + 1
    x = torch.randn((num_nodes, 16))

    conv = PANConv(16, 32, filter_size=2)
    pool = PANPooling(32, ratio=0.5)
    assert str(pool) == 'PANPooling(32, ratio=0.5, multiplier=1.0)'

    # The convolution is evaluated natively; its second output is split
    # into COO components to feed the pooling layer.
    x, M = conv(x, edge_index)
    row, col, edge_weight = M.coo()
    outputs = pool_harness(pool, [x, row, col, edge_weight])
    h, edge_index, edge_weight, _, perm, score = outputs

    assert h.size() == (2, 32)
    assert edge_index.size() == (2, 4)
    assert edge_weight.size() == (4, )
    assert perm.size() == (2, )
    assert score.size() == (2, )


================================================
FILE: tests/gnn/nn/pool/test_pool_knn.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import torch
from torch_geometric.nn import knn, knn_graph

import helpers
import poptorch


class KnnModel(torch.nn.Module):
    """Module that forwards every call to the callable given at creation."""
    def __init__(self, op) -> None:
        super().__init__()
        self.op = op

    def forward(self, *args, **kwargs):
        result = self.op(*args, **kwargs)
        return result


def test_knn():
    """Compare the IPU result of knn with the CPU result, ignoring order."""
    x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
    batch_x = torch.tensor([0, 0, 0, 0])
    y = torch.Tensor([[-1, 0], [1, 0]])
    batch_y = torch.tensor([0, 0])
    inputs = (x, y, 2, batch_x, batch_y)

    assign_index_cpu = knn(*inputs)

    model = poptorch.inferenceModel(KnnModel(knn))
    assign_index_ipu = model(*inputs)

    # knn gives no guarantee about the order of the returned indices when
    # several elements are identical, so one result may be a permutation of
    # the other. Compare the sorted values instead of the raw outputs.
    sorted_ipu = assign_index_ipu.sort()[0]
    sorted_cpu = assign_index_cpu.sort()[0]
    helpers.assert_allequal(actual=sorted_ipu, expected=sorted_cpu)


def test_knn_graph():
    """Compare the IPU result of knn_graph with the CPU result.

    The values are sorted before comparison, so the order in which the
    edges are returned is ignored.
    """
    x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
    batch = torch.tensor([0, 0, 0, 0])

    edge_index_cpu = knn_graph(x, k=2, batch=batch, loop=True)
    model = poptorch.inferenceModel(KnnModel(knn_graph))
    edge_index_ipu = model(x, k=2, batch=batch, loop=True)

    # There is no guarantee that indexes that knn returns must be in any
    # particular order if there are multiple identical elements so we can't
    # compare results directly as one can be permutation of the other.
    # The IPU output is the value under test and the CPU output is the
    # reference, matching the labelling used in test_knn.
    helpers.assert_allequal(actual=edge_index_ipu.sort()[0],
                            expected=edge_index_cpu.sort()[0])


================================================
FILE: tests/gnn/nn/pool/test_radius.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from typing import Optional

import torch
import torch_geometric
from torch import Tensor
import poptorch


def to_set(edge_index):
    """Return the columns of ``edge_index`` as a set of 2-tuples."""
    pairs = set()
    for source, target in edge_index.t().tolist():
        pairs.add((source, target))
    return pairs


def assert_fn(native_out, poptorch_out):
    """Assert both outputs describe the same edge set once -1 is removed.

    Each output is stripped of its -1 entries, reshaped to two rows and
    compared with the other one as a set of pairs, so edge order is
    irrelevant.
    """
    def strip_padding(edges):
        kept = edges[edges != -1]
        return kept.reshape((2, kept.size(0) // 2))

    # The PopTorch output is processed first, as in the original code.
    poptorch_edges = strip_padding(poptorch_out)
    native_edges = strip_padding(native_out)

    assert to_set(poptorch_edges) == to_set(native_edges)


def op_harness(*args, **kwargs):
    """Run radius_graph natively and through PopTorch, then compare both.

    All arguments are forwarded unchanged to the wrapped model.
    """
    class Model(torch.nn.Module):
        """Call radius_graph with r=2.5 and self loops enabled."""
        def forward(self, x: Tensor, batch: Optional[Tensor] = None) -> Tensor:
            return torch_geometric.nn.radius_graph(x,
                                                   r=2.5,
                                                   batch=batch,
                                                   loop=True)

    native_model = Model()
    native_out = native_model(*args, **kwargs)

    poptorch_model = poptorch.inferenceModel(Model())
    poptorch_out = poptorch_model(*args, **kwargs)

    assert_fn(native_out, poptorch_out)


def test_radius_graph():
    """Check radius_graph on the four corners of a square in one graph."""
    corners = [[-1, -1], [-1, 1], [1, -1], [1, 1]]
    x = torch.tensor(corners, dtype=torch.float)
    batch = torch.tensor([0, 0, 0, 0])

    op_harness(x, batch)


================================================
FILE: tests/gnn/nn/pool/test_sag_pool.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch

from torch_geometric.nn import (
    GATConv,
    GCNConv,
    GraphConv,
    SAGEConv,
    SAGPooling,
)

from pool_utils import pool_harness


@pytest.mark.skip(reason="The class is using filter_adj which produces "
                  "tensors with dynamic shapes. It is not supported "
                  "on Mk2.")
@pytest.mark.parametrize('GNN', [GraphConv, GCNConv, GATConv, SAGEConv])
def test_sag_pooling(GNN):
    """Check SAGPooling output sizes for ratio- and score-based selection."""
    conv_kwargs = {'add_self_loops': False}

    in_channels = 16
    edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3],
                               [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2]])
    num_nodes = edge_index.max().item() + 1
    x = torch.randn((num_nodes, in_channels))

    # Fractional ratio: half of the nodes are kept.
    by_fraction = SAGPooling(in_channels, ratio=0.5, GNN=GNN, **conv_kwargs)
    pooled_x, pooled_edges = pool_harness(by_fraction, [x, edge_index])[:2]
    assert pooled_x.size() == (num_nodes // 2, in_channels)
    assert pooled_edges.size() == (2, 2)

    # Score threshold: only upper bounds on the sizes are known.
    by_score = SAGPooling(in_channels,
                          ratio=None,
                          GNN=GNN,
                          min_score=0.1,
                          **conv_kwargs)
    pooled_x, pooled_edges = pool_harness(by_score, [x, edge_index])[:2]
    assert pooled_x.size(0) <= x.size(0)
    assert pooled_x.size(1) == 16
    assert pooled_edges.size(0) == 2
    assert pooled_edges.size(1) <= edge_index.size(1)

    # Integer ratio: exactly that many nodes are kept.
    by_count = SAGPooling(in_channels, ratio=2, GNN=GNN, **conv_kwargs)
    pooled_x, pooled_edges = pool_harness(by_count, [x, edge_index])[:2]
    assert pooled_x.size() == (2, in_channels)
    assert pooled_edges.size() == (2, 2)


================================================
FILE: tests/gnn/nn/pool/test_select_topk.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch

from torch_geometric.nn.pool.select import SelectOutput, SelectTopK
from torch_geometric.nn.pool.select.topk import topk
from pool_utils import pool_harness


@pytest.mark.skip(
    reason=
    "Inside the topk function, an index tensor is created that causes the "
    "output tensor to dynamically shape. It is not supported on MK2.")
def test_topk_ratio():
    """Check the permutation returned by ``topk`` for several ratios."""
    scores = torch.Tensor([2, 4, 5, 6, 2, 9])
    graph_ids = torch.tensor([0, 0, 1, 1, 1, 1])

    # (ratio, expected permutation, expected scores, expected graph ids)
    expectations = (
        (0.5, [1, 5, 3], [4, 9, 6], [0, 1, 1]),
        (2, [1, 0, 5, 3], [4, 2, 9, 6], [0, 0, 1, 1]),
        (3, [1, 0, 5, 3, 2], [4, 2, 9, 6, 5], [0, 0, 1, 1, 1]),
    )

    for ratio, want_perm, want_scores, want_graphs in expectations:
        perm = pool_harness(topk, [scores, ratio, graph_ids])
        assert perm.tolist() == want_perm
        assert scores[perm].tolist() == want_scores
        assert graph_ids[perm].tolist() == want_graphs


@pytest.mark.skip(
    reason=
    "Inside the topk function, an index tensor is created that causes the "
    "output tensor to dynamically shape. It is not supported on MK2.")
@pytest.mark.parametrize('min_score', [None, 2.0])
def test_select_topk(min_score):
    """Check the representation and output invariants of ``SelectTopK``."""
    # NOTE: the ``min_score`` parametrization returns immediately, so only
    # the ratio-based configuration reaches the checks below.
    if min_score is not None:
        return

    x = torch.randn(6, 16)
    graph_ids = torch.tensor([0, 0, 1, 1, 1, 1])

    selector = SelectTopK(16, min_score=min_score)

    expected_repr = ('SelectTopK(16, ratio=0.5)' if min_score is None else
                     'SelectTopK(16, min_score=2.0)')
    assert str(selector) == expected_repr

    selection = pool_harness(selector, [x, graph_ids])
    assert isinstance(selection, SelectOutput)
    assert selection.num_nodes == 6
    assert selection.num_clusters <= selection.num_nodes
    # Node indices must lie in [0, num_nodes).
    assert selection.node_index.min() >= 0
    assert selection.node_index.max() < selection.num_nodes
    # Cluster indices must span 0 .. num_clusters - 1.
    assert selection.cluster_index.min() == 0
    assert selection.cluster_index.max() == selection.num_clusters - 1


================================================
FILE: tests/gnn/nn/pool/test_topk_pool.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch
from torch_geometric.nn.pool import TopKPooling
from torch_geometric.nn.pool.topk_pool import filter_adj

from pool_utils import pool_harness


@pytest.mark.skip(reason="The class is using filter_adj which produces "
                  "tensors with dynamic shapes. It is not supported "
                  "on Mk2.")
def test_filter_adj():
    """Check the edges and attributes ``filter_adj`` returns for ``perm``."""
    sources = [0, 0, 1, 1, 2, 2, 3, 3]
    targets = [1, 3, 0, 2, 1, 3, 0, 2]
    edge_index = torch.tensor([sources, targets])
    edge_attr = torch.Tensor([1, 2, 3, 4, 5, 6, 7, 8])
    kept_nodes = torch.tensor([1, 2, 3])

    filtered = pool_harness(filter_adj, [edge_index, edge_attr, kept_nodes, 4])
    new_edge_index, new_edge_attr = filtered[0], filtered[1]
    assert new_edge_index.tolist() == [[0, 1], [1, 0]]
    assert new_edge_attr.tolist() == [6, 8]


@pytest.mark.skip(reason="The class is using filter_adj which produces "
                  "tensors with dynamic shapes. It is not supported "
                  "on Mk2.")
def test_topk_pooling():
    """Check TopKPooling representations and output shapes.

    A fractional ratio, a ``min_score`` threshold and an integer ratio are
    each run through ``pool_harness``.
    """
    feature_dim = 16
    sources = [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
    targets = [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2]
    edge_index = torch.tensor([sources, targets])
    node_count = edge_index.max().item() + 1
    x = torch.randn((node_count, feature_dim))

    # Fractional ratio: half of the nodes are expected in the output.
    half_pool = TopKPooling(feature_dim, ratio=0.5)
    assert str(half_pool) == 'TopKPooling(16, ratio=0.5, multiplier=1.0)'
    half_out = pool_harness(half_pool, [x, edge_index])
    assert half_out[0].size() == (node_count // 2, feature_dim)
    assert half_out[1].size() == (2, 2)

    # Score threshold: only upper bounds on the output sizes are checked.
    score_pool = TopKPooling(feature_dim, ratio=None, min_score=0.1)
    assert str(score_pool) == 'TopKPooling(16, min_score=0.1, multiplier=1.0)'
    score_out = pool_harness(score_pool, [x, edge_index])
    assert score_out[0].size(0) <= x.size(0)
    assert score_out[0].size(1) == 16
    assert score_out[1].size(0) == 2
    assert score_out[1].size(1) <= edge_index.size(1)

    # Integer ratio: two nodes are expected in the output.
    fixed_pool = TopKPooling(feature_dim, ratio=2)
    assert str(fixed_pool) == 'TopKPooling(16, ratio=2, multiplier=1.0)'
    fixed_out = pool_harness(fixed_pool, [x, edge_index])
    assert fixed_out[0].size() == (2, feature_dim)
    assert fixed_out[1].size() == (2, 2)


================================================
FILE: tests/gnn/nn/pool/test_voxel_grid.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import torch

from torch_geometric.data import Batch
from torch_geometric.nn import avg_pool, voxel_grid
from torch_geometric.testing import withPackage

from pool_utils import pool_harness


@withPackage('torch_cluster')
def test_voxel_grid():
    """Check ``voxel_grid`` cluster ids with and without a batch vector."""
    positions = torch.Tensor([[0, 0], [11, 9], [2, 8], [2, 2], [8, 3]])
    graph_ids = torch.tensor([0, 0, 0, 1, 1])
    voxel_size = 5

    with_batch = pool_harness(voxel_grid, [positions, voxel_size, graph_ids])
    assert with_batch.tolist() == [0, 5, 3, 6, 7]

    without_batch = pool_harness(voxel_grid, [positions, voxel_size])
    assert without_batch.tolist() == [0, 5, 3, 0, 1]


@withPackage('torch_cluster')
def test_voxel_grid_with_optional_args():
    """Check ``voxel_grid`` when the two trailing arguments are supplied."""
    positions = torch.Tensor([[0, 0], [11, 9], [2, 8], [2, 2], [8, 3]])
    graph_ids = torch.tensor([0, 0, 0, 1, 1])

    # A fresh argument list (including the inner list) is built per call.
    batched_args = [positions, 5, graph_ids, -1, [18, 14]]
    cluster = pool_harness(voxel_grid, batched_args)
    assert cluster.tolist() == [0, 10, 4, 16, 17]

    unbatched_args = [positions, 5, None, -1, [18, 14]]
    cluster_no_batch = pool_harness(voxel_grid, unbatched_args)
    assert cluster_no_batch.tolist() == [0, 10, 4, 0, 1]


@withPackage('torch_cluster')
def test_single_voxel_grid():
    """Check ``voxel_grid`` when every point of a graph shares one voxel.

    The resulting clustering is also passed to ``avg_pool`` on a ``Batch``;
    that call is only required to run without raising.
    """
    positions = torch.Tensor([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]])
    edge_index = torch.tensor([[0, 0, 3], [1, 2, 4]])
    graph_ids = torch.tensor([0, 0, 0, 1, 1])
    features = torch.randn(5, 16)

    cluster = pool_harness(voxel_grid, [positions, 5, graph_ids])
    assert cluster.tolist() == [0, 0, 0, 1, 1]

    graphs = Batch(x=features,
                   edge_index=edge_index,
                   pos=positions,
                   batch=graph_ids)
    graphs = avg_pool(cluster, graphs)

    cluster_no_batch = pool_harness(voxel_grid, [positions, 5])
    assert cluster_no_batch.tolist() == [0, 0, 0, 0, 0]


================================================
FILE: tests/gnn/nn/test_linear.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from itertools import product

import pytest
import torch
from torch_geometric.nn import HeteroLinear, Linear

from dense.dense_utils import dense_harness

# Initializer settings combined by the parametrization of test_linear.
weight_inits = [
    'glorot',
    'uniform',
    'kaiming_uniform',
    None,
]
bias_inits = [
    'zeros',
    None,
]


@pytest.mark.parametrize('weight,bias', product(weight_inits, bias_inits))
def test_linear(weight, bias):
    """Run a 16 -> 32 ``Linear`` layer through ``dense_harness``."""
    layer = Linear(16,
                   32,
                   weight_initializer=weight,
                   bias_initializer=bias)
    # The layer is built before the input is sampled.
    features = torch.randn(1, 4, 16)

    dense_harness(layer, features)


@pytest.mark.parametrize('with_bias', [True, False])
def test_hetero_linear(with_bias):
    """Run a three-type ``HeteroLinear`` layer through ``dense_harness``."""
    features = torch.randn(10, 16)
    # One type id (0, 1 or 2) per input row.
    type_ids = torch.tensor([0, 0, 2, 1, 0, 2, 2, 2, 1, 2])

    layer = HeteroLinear(16, 32, num_types=3, bias=with_bias)

    dense_harness(layer, (features, type_ids))


================================================
FILE: tests/gnn/nn/test_loss.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch

from torch.nn import L1Loss, MSELoss

from poptorch_geometric import TrainingStepper


def loss_harness(in_channels,
                 out_channels,
                 cpu_dataloader=None,
                 ipu_dataloader=None,
                 loss_fn=None,
                 num_steps=4,
                 atol=5e-4,
                 rtol=5e-4):
    """Step a small linear model on paired CPU/IPU batches.

    A single ``torch.nn.Linear(in_channels, out_channels)`` followed by
    ``loss_fn`` is wrapped in a ``TrainingStepper`` built with ``atol`` and
    ``rtol``. For up to ``num_steps`` pairs of batches drawn in lockstep
    from the two dataloaders, ``stepper.run`` is called with the CPU inputs
    (no node mask) and the IPU inputs (with ``ipu_batch.nodes_mask``); the
    target is a tensor of ones in both cases.

    Nothing is run unless both dataloaders are provided.

    Args:
        in_channels: Input feature size of the linear layer.
        out_channels: Output feature size of the linear layer.
        cpu_dataloader: Iterable of batches exposing ``x``.
        ipu_dataloader: Iterable of batches exposing ``x`` and
            ``nodes_mask``.
        loss_fn: Loss module; must not be ``None`` and must have a
            ``reduction`` attribute.
        num_steps: Maximum number of batch pairs to process.
        atol: Absolute tolerance forwarded to ``TrainingStepper``.
        rtol: Relative tolerance forwarded to ``TrainingStepper``.
    """
    class LinearModel(torch.nn.Module):
        def __init__(self, loss_fn):
            assert loss_fn is not None
            assert hasattr(loss_fn, 'reduction')
            super().__init__()
            self.loss = loss_fn
            self.linear = torch.nn.Linear(in_channels, out_channels)

        def forward(self, *args):
            # Positional inputs: features, optional node mask, target.
            x = args[0]
            nodes_mask = args[1]
            target = args[2]

            result = self.linear(x)
            # Apply nodes mask, so that the loss may be computed properly
            if nodes_mask is not None:
                result[~nodes_mask] = 0

            if self.training:
                # target = torch.ones_like(result)
                # Zero the target on the same rows as the result so that
                # those rows contribute nothing to the loss.
                if nodes_mask is not None:
                    target[~nodes_mask] = 0

                loss = self.loss(result, target)
                # In case, the loss function applies mean reduction, the result
                # has to be rescaled by the effective size of the batch
                # (excluding padding).
                if nodes_mask is not None and self.loss.reduction == 'mean':
                    size = nodes_mask.shape[0]
                    real_size = torch.count_nonzero(nodes_mask)
                    loss = loss * size / real_size

                return (result, loss)
            return result

    model = LinearModel(loss_fn)
    stepper = TrainingStepper(model, atol=atol, rtol=rtol)

    if cpu_dataloader is not None and ipu_dataloader is not None:
        for step, (cpu_batch,
                   ipu_batch) in enumerate(zip(cpu_dataloader,
                                               ipu_dataloader)):
            # Stop once num_steps batch pairs have been processed.
            if step == num_steps:
                break
            # First tuple: CPU inputs (no mask); second tuple: IPU inputs.
            stepper.run(1, (cpu_batch.x, None,
                            torch.ones(cpu_batch.x.shape[0], out_channels)),
                        (ipu_batch.x, ipu_batch.nodes_mask,
                         torch.ones(ipu_batch.x.shape[0], out_channels)))


@pytest.mark.parametrize('loss_fn', [
    L1Loss,
    MSELoss,
])
@pytest.mark.parametrize('reduction', ['mean', 'sum'])
def test_loss_fixedsize_vs_regular_dataloader(loss_fn, reduction, dataloader,
                                              fixed_size_dataloader):
    """Run ``loss_harness`` with a regular and a fixed-size dataloader.

    The regular dataloader feeds the CPU side and the fixed-size one the IPU
    side. The input width is read from the first regular batch and the
    output width is twice that.
    """
    num_features = next(iter(dataloader)).num_node_features
    criterion = loss_fn(reduction=reduction)

    loss_harness(num_features,
                 num_features * 2,
                 cpu_dataloader=dataloader,
                 ipu_dataloader=fixed_size_dataloader,
                 loss_fn=criterion)


================================================
FILE: tests/gnn/nn/test_mish.py
================================================
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

from copy import deepcopy
import pytest
import torch

from torch_geometric.nn import Linear
from poptorch import inferenceModel, trainingModel


class MishReference(torch.nn.Module):
    """Mish activation composed from ``softplus`` and ``tanh``."""

    def forward(self, x):
        """Return ``x * tanh(softplus(x))``."""
        gate = torch.tanh(torch.nn.functional.softplus(x))
        return x * gate


class MishTrainModel(torch.nn.Module):
    """Apply ``op`` then ``linear`` and score the result against ones.

    The loss is the mean squared error between the linear output and a
    tensor of ones of the same shape.
    """

    def __init__(self, op, linear):
        super().__init__()
        # Assignment order is kept: it fixes submodule registration order.
        self.loss_fn = torch.nn.MSELoss()
        self.linear = linear
        self.op = op

    def forward(self, x):
        """Return ``(prediction, loss)`` for the input ``x``."""
        activated = self.op(x).float()
        prediction = self.linear(activated)
        loss = self.loss_fn(prediction, torch.ones_like(prediction))
        return prediction, loss


@pytest.mark.parametrize('size', [(13, ), (1, 64, 320, 320)])
def test_mish(size):
    """Compare ``torch.nn.Mish`` on the IPU with two references.

    The IPU result is checked against ``MishReference`` run through
    PopTorch and against ``torch.nn.Mish`` run eagerly.
    """
    x = torch.rand(size)
    ipu_model = inferenceModel(torch.nn.Mish())
    ipu_res = ipu_model(x)

    ref_ipu_model = inferenceModel(MishReference())
    ref_ipu_res = ref_ipu_model(x)

    ref_model = torch.nn.Mish()
    ref_res = ref_model(x)

    # torch.allclose only returns a bool; without the assert a numerical
    # mismatch would go unnoticed. Default tolerances are used.
    assert torch.allclose(ipu_res, ref_ipu_res)
    assert torch.allclose(ipu_res, ref_res)


@pytest.mark.parametrize('size', [(11, ), (1, 64, 128)])
def test_mish_training(size):
    """Compare one training step of a Mish model across three runs.

    The same input is run eagerly with ``torch.nn.Mish``, through a
    PopTorch training model with ``torch.nn.Mish``, and through a PopTorch
    training model with ``MishReference``. The linear layers of the two
    models start from identical weights (``deepcopy`` before any run).
    """
    x = torch.rand(size)
    linear_ipu = Linear(size[-1], size[-1])
    linear_ref = deepcopy(linear_ipu)
    model = MishTrainModel(torch.nn.Mish(), linear_ipu)

    # Eager reference, taken before the model is handed to PopTorch.
    ref_res, ref_loss = model(x)

    ipu_model = trainingModel(model)
    ipu_res, ipu_loss = ipu_model(x)

    model = MishTrainModel(MishReference(), linear_ref)
    ref_ipu_model = trainingModel(model)
    ref_ipu_res, ref_ipu_loss = ref_ipu_model(x)

    # torch.allclose only returns a bool; without the assert a numerical
    # mismatch would go unnoticed. Default tolerances are used.
    assert torch.allclose(ipu_res, ref_res)
    assert torch.allclose(ipu_loss, ref_loss)
    assert torch.allclose(ipu_res, ref_ipu_res)
    assert torch.allclose(ipu_loss, ref_ipu_loss)


================================================
FILE: tests/gnn/nn/test_sequential.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from collections import OrderedDict
from torch.nn import ReLU
from torch_geometric.nn import Sequential, GCNConv, Linear

from conv.conv_utils import conv_harness

# Keyword arguments passed to every GCNConv built in this module.
conv_kwargs = dict(add_self_loops=False)


def test_sequential(dataset):
    """Run a list-defined ``Sequential`` GCN stack through ``conv_harness``.

    The stack is two GCN layers, each followed by an in-place ReLU, and a
    final linear layer mapping back to the dataset's feature width.
    """
    num_features = dataset.num_node_features
    hidden = 64
    conv_signature = 'x, edge_index -> x'

    # Layers are constructed in the order in which they are listed.
    layers = [
        (GCNConv(num_features, hidden, **conv_kwargs), conv_signature),
        ReLU(inplace=True),
        (GCNConv(hidden, hidden, **conv_kwargs), conv_signature),
        ReLU(inplace=True),
        Linear(hidden, num_features),
    ]
    model = Sequential('x, edge_index', layers)

    conv_harness(model, dataset)


def test_sequential_with_ordered_dict(dataset):
    """Run an ``OrderedDict``-defined ``Sequential`` through the harness."""
    num_features = dataset.num_node_features
    conv_signature = 'x, edge_index -> x'

    # Named layers, constructed in listing order.
    named_layers = OrderedDict([
        ('conv1', (GCNConv(num_features, 32, **conv_kwargs),
                   conv_signature)),
        ('conv2', (GCNConv(32, 64, **conv_kwargs), conv_signature)),
    ])
    model = Sequential('x, edge_index', modules=named_layers)

    conv_harness(model, dataset)


================================================
FILE: tests/gnn/nn/unpool/test_interpolate.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import helpers

import torch
import torch_geometric

import poptorch
import poptorch_geometric  # pylint: disable=unused-import


def test_knn_interpolate():
    """Compare ``torch_geometric.nn.knn_interpolate`` on IPU and eagerly."""

    class Model(torch.nn.Module):
        def forward(self, *args, **kwargs):
            return torch_geometric.nn.knn_interpolate(*args, **kwargs)

    features = torch.Tensor([[1], [10], [100], [-1], [-10], [-100]])
    pos_x = torch.Tensor([[-1, 0], [0, 0], [1, 0], [-2, 0], [0, 0], [2, 0]])
    pos_y = torch.Tensor([[-1, -1], [1, 1], [-2, -2], [2, 2]])
    batch_x = torch.tensor([0, 0, 0, 1, 1, 1])
    batch_y = torch.tensor([0, 0, 1, 1])
    num_neighbours = 2
    inputs = (features, pos_x, pos_y, batch_x, batch_y, num_neighbours)

    ipu_model = poptorch.inferenceModel(Model())

    # The PopTorch run comes first, then the eager reference.
    ipu_result = ipu_model(*inputs)
    reference = torch_geometric.nn.knn_interpolate(*inputs)

    helpers.assert_allclose(actual=ipu_result, expected=reference)


================================================
FILE: tests/gnn/ops/test_knn.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch
import torch_cluster

import helpers
from poptorch_geometric.ops.knn import knn
import poptorch


def assert_fn(native_out, poptorch_out, x, y):
    """Assert that two kNN results agree, allowing equidistant swaps.

    The row indices must match. Column indices may differ, but where they
    do, the two selected ``x`` points must lie at the same distance from
    the corresponding ``y`` point.
    """
    ref_rows, ref_cols = native_out
    ipu_rows, ipu_cols = poptorch_out

    helpers.assert_allclose(actual=ipu_rows, expected=ref_rows)
    assert ref_cols.shape == ipu_cols.shape

    for ref_col, ipu_col, query_idx in zip(ref_cols, ipu_cols, ref_rows):
        if ref_col == ipu_col:
            continue
        # Different neighbour chosen: accept it only if it is a tie.
        query_point = y[query_idx]
        ref_dist = torch.norm(x[ref_col] - query_point, dim=-1)
        ipu_dist = torch.norm(x[ipu_col] - query_point, dim=-1)
        helpers.assert_allclose(actual=ref_dist, expected=ipu_dist)


def op_harness(op, reference_op, x, y, k, batch_x=None, batch_y=None):
    """Run ``reference_op`` eagerly and ``op`` via PopTorch, then compare.

    Both callables receive ``(x, y, k, batch_x, batch_y)`` positionally.
    """
    inputs = (x, y, k, batch_x, batch_y)

    # The reference is evaluated before the PopTorch model is built.
    reference = reference_op(*inputs)

    class Model(torch.nn.Module):
        def forward(self, *args):
            return op(*args)

    ipu_model = poptorch.inferenceModel(Model())
    ipu_result = ipu_model(*inputs)

    assert_fn(reference, ipu_result, x, y)


@pytest.mark.parametrize("with_batch", [True, False])
def test_knn_basic(with_batch):
    """Check ``knn`` against itself and against ``torch_cluster.knn``."""
    pos_x = torch.Tensor([[-1, 0], [0, 0], [1, 0], [-2, 0], [0, 0], [2, 0]])
    pos_y = torch.Tensor([[-1, -1], [1, 1], [-2, -2], [2, 2]])
    k = 2
    batch_x = torch.Tensor([0, 0, 0, 1, 1, 1]) if with_batch else None
    batch_y = torch.Tensor([0, 0, 1, 1]) if with_batch else None

    for reference in (knn, torch_cluster.knn):
        op_harness(knn, reference, pos_x, pos_y, k, batch_x, batch_y)


def test_knn():
    """Check ``knn`` on two copies of a unit square, with and without batch.

    Each configuration is compared against ``torch_cluster.knn`` first and
    against the eager ``knn`` second.
    """
    square = [
        [-1, -1],
        [-1, +1],
        [+1, +1],
        [+1, -1],
    ]
    x = torch.Tensor(square + square)
    y = torch.Tensor([
        [1, 0],
        [-1, 0],
    ])
    batch_x = torch.Tensor([0, 0, 0, 0, 1, 1, 1, 1])
    batch_y = torch.Tensor([0, 1])
    k = 2

    # First with explicit batch vectors, then relying on the defaults.
    for batches in ((batch_x, batch_y), ()):
        for reference in (torch_cluster.knn, knn):
            op_harness(knn, reference, x, y, k, *batches)


def test_knn_batch_skip():
    """Check batched ``knn`` on two copies of a unit square.

    The result is compared against ``torch_cluster.knn`` first and against
    the eager ``knn`` second.
    """
    square = [
        [-1, -1],
        [-1, +1],
        [+1, +1],
        [+1, -1],
    ]
    x = torch.Tensor(square + square)
    y = torch.Tensor([
        [1, 0],
        [-1, 0],
    ])
    batch_x = torch.Tensor([0, 0, 0, 0, 1, 1, 1, 1])
    batch_y = torch.Tensor([0, 1])
    k = 2

    for reference in (torch_cluster.knn, knn):
        op_harness(knn, reference, x, y, k, batch_x, batch_y)


@pytest.mark.parametrize("with_batch", [True, False])
def test_knn_override(with_batch):
    """Call ``torch_cluster.knn`` inside a PopTorch model and eagerly.

    The model's ``forward`` calls ``torch_cluster.knn`` directly; its IPU
    result is compared with the eager call on the same inputs.
    """

    class Model(torch.nn.Module):
        def forward(self, *args):
            return torch_cluster.knn(*args)

    pos_x = torch.Tensor([[-1, 0], [0, 0], [1, 0], [-2, 0], [0, 0], [2, 0]])
    pos_y = torch.Tensor([[-1, -1], [1, 1], [-2, -2], [2, 2]])
    k = 2
    batch_x = torch.Tensor([0, 0, 0, 1, 1, 1]) if with_batch else None
    batch_y = torch.Tensor([0, 0, 1, 1]) if with_batch else None
    inputs = (pos_x, pos_y, k, batch_x, batch_y)

    ipu_model = poptorch.inferenceModel(Model())
    # The PopTorch run comes first, then the eager reference.
    ipu_result = ipu_model(*inputs)
    reference = torch_cluster.knn(*inputs)
    assert_fn(reference, ipu_result, pos_x, pos_y)


================================================
FILE: tests/gnn/ops/test_knn_graph.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch
import helpers

from torch_geometric.nn import knn_graph
from poptorch_geometric.ops.knn_graph import knn_graph as pyg_knn_graph

import poptorch


@pytest.mark.parametrize('flow', ['source_to_target', 'target_to_source'])
def test_knn_graph(flow):
    """Compare the ``poptorch_geometric`` ``knn_graph`` with two references.

    The IPU result is checked against the same op run eagerly and against
    ``torch_geometric.nn.knn_graph``.
    """

    class Model(torch.nn.Module):
        def forward(self, *args, **kwargs):
            return pyg_knn_graph(*args, **kwargs)

    x = torch.Tensor([[1], [10], [100], [-1], [-10], [-100]])
    batch = torch.tensor([0, 0, 0, 1, 1, 1])
    k = 2
    # Positional arguments shared by all three calls.
    call_args = (x, k, batch, True, flow)

    ipu_model = poptorch.inferenceModel(Model())

    ipu_result = ipu_model(*call_args)
    upstream_result = knn_graph(*call_args)
    eager_result = pyg_knn_graph(*call_args)

    helpers.assert_allclose(actual=ipu_result, expected=eager_result)
    helpers.assert_allclose(actual=ipu_result, expected=upstream_result)


================================================
FILE: tests/gnn/ops/test_knn_interpolate.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import helpers

import torch
from torch_geometric.nn import knn_interpolate

import poptorch

from poptorch_geometric.ops.knn_interpolate import knn_interpolate as pyg_knn_interpolate


def test_knn_interpolate():
    """Compare the ``poptorch_geometric`` ``knn_interpolate`` with references.

    The IPU result is checked against the same op run eagerly and against
    ``torch_geometric.nn.knn_interpolate``.
    """

    class Model(torch.nn.Module):
        def forward(self, *args, **kwargs):
            return pyg_knn_interpolate(*args, **kwargs)

    features = torch.Tensor([[1], [10], [100], [-1], [-10], [-100]])
    pos_x = torch.Tensor([[-1, 0], [0, 0], [1, 0], [-2, 0], [0, 0], [2, 0]])
    pos_y = torch.Tensor([[-1, -1], [1, 1], [-2, -2], [2, 2]])
    batch_x = torch.tensor([0, 0, 0, 1, 1, 1])
    batch_y = torch.tensor([0, 0, 1, 1])
    k = 2
    call_args = (features, pos_x, pos_y, batch_x, batch_y, k)

    ipu_model = poptorch.inferenceModel(Model())

    ipu_result = ipu_model(*call_args)
    upstream_result = knn_interpolate(*call_args)
    eager_result = pyg_knn_interpolate(*call_args)

    helpers.assert_allclose(actual=ipu_result, expected=eager_result)
    helpers.assert_allclose(actual=ipu_result, expected=upstream_result)


================================================
FILE: tests/gnn/ops/test_nearest.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch

from torch_cluster import nearest as nearest_or
from poptorch import nearest
import poptorch


def op_harness(op, reference_op, x, y, batch_x=None, batch_y=None):
    """Run ``reference_op`` eagerly and ``op`` via PopTorch, then compare.

    Batch vectors given as Python lists are converted to ``torch.long``
    tensors for the reference call only; the PopTorch model receives them
    exactly as passed in. Every element of the two results must be equal.
    """

    def as_reference_batch(batch):
        # Lists become long tensors; anything else is passed through.
        if isinstance(batch, list):
            return torch.tensor(batch, dtype=torch.long)
        return batch

    reference = reference_op(x, y, as_reference_batch(batch_x),
                             as_reference_batch(batch_y))

    class Model(torch.nn.Module):
        def forward(self, *args):
            return op(*args)

    ipu_model = poptorch.inferenceModel(Model())
    ipu_result = ipu_model(x, y, batch_x, batch_y)

    assert all(reference == ipu_result)


@pytest.mark.parametrize('dtype', [torch.half, torch.float, torch.double])
def test_nearest(dtype):
    """Check ``nearest`` against ``torch_cluster.nearest``.

    Valid inputs cover batch vectors given as lists and as tensors, only one
    of the two batch vectors given, and no batch vectors at all. Invalid
    batch vectors must raise ``ValueError``.
    """
    x = torch.tensor([
        [-1, -1],
        [-1, +1],
        [+1, +1],
        [+1, -1],
        [-2, -2],
        [-2, +2],
        [+2, +2],
        [+2, -2],
    ],
                     dtype=dtype)
    y = torch.tensor([
        [-1, 0],
        [+1, 0],
        [-2, 0],
        [+2, 0],
    ], dtype=dtype)

    batch_x_lst = [0, 0, 0, 0, 1, 1, 1, 1]
    batch_x = torch.tensor(batch_x_lst, dtype=torch.long)
    batch_y_lst = [0, 0, 1, 1]
    batch_y = torch.tensor(batch_y_lst, dtype=torch.long)
    op_harness(nearest, nearest_or, x, y, batch_x_lst, batch_y_lst)
    op_harness(nearest, nearest_or, x, y, batch_x, batch_y)

    # Only one of the two batch vectors is supplied (all zeros).
    batch_x_lst_zeros = [0] * x.shape[0]
    batch_x_zeros = torch.tensor(batch_x_lst_zeros, dtype=torch.long)
    batch_y_lst_zeros = [0] * y.shape[0]
    batch_y_zeros = torch.tensor(batch_y_lst_zeros, dtype=torch.long)
    op_harness(nearest, nearest_or, x, y, batch_x=batch_x_zeros)
    op_harness(nearest, nearest_or, x, y, batch_y=batch_y_zeros)

    op_harness(nearest, nearest_or, x, y)

    # Invalid input: instance 1 only in batch_x
    batch_x = [0, 0, 0, 0, 1, 1, 1, 1]
    batch_y = [0, 0, 0, 0]
    with pytest.raises(ValueError):
        op_harness(nearest, nearest_or, x, y, batch_x, batch_y)

    # Invalid input: instance 1 only in batch_x (implicitly as batch_y=None)
    with pytest.raises(ValueError):
        op_harness(nearest, nearest_or, x, y, batch_x, None)

    # Invalid input: instance 2 only in batch_x
    # (i.e. instance in the middle missing)
    batch_x = [0, 0, 1, 1, 2, 2, 3, 3]
    batch_y = [0, 1, 3, 3]
    with pytest.raises(ValueError):
        op_harness(nearest, nearest_or, x, y, batch_x, batch_y)

    # Invalid input: batch_x unsorted.
    # One entry per row of x, so that the ordering is the only thing wrong
    # with this input.
    batch_x = [0, 0, 1, 0, 0, 0, 0, 0]
    batch_y = [0, 0, 1, 1]
    with pytest.raises(ValueError):
        op_harness(nearest, nearest_or, x, y, batch_x, batch_y)

    # Invalid input: batch_y unsorted
    batch_x = [0, 0, 0, 0, 1, 1, 1, 1]
    batch_y = [0, 0, 1, 0]
    with pytest.raises(ValueError):
        op_harness(nearest, nearest_or, x, y, batch_x, batch_y)


================================================
FILE: tests/gnn/ops/test_radius_op.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch
import torch_cluster

from poptorch_geometric.ops.radius import radius, radius_graph
import poptorch


def to_set(edge_index):
    """Return the edges of ``edge_index`` as a set of ``(i, j)`` tuples.

    The tensor is transposed first, so every column of the input becomes one
    pair. Ordering and duplicates are discarded by the set.
    """
    pairs = set()
    for row, col in edge_index.t().tolist():
        pairs.add((row, col))
    return pairs


def assert_fn(native_out, poptorch_out):
    """Assert that both outputs describe the same set of edges.

    Entries equal to -1 are dropped from each output (presumably padding;
    TODO confirm), the remaining values are reshaped to ``(2, num_edges)``
    and the resulting edge sets are compared irrespective of order.
    """
    ipu_kept = poptorch_out[poptorch_out != -1]
    ipu_edges = ipu_kept.reshape((2, ipu_kept.size(0) // 2))

    ref_kept = native_out[native_out != -1]
    ref_edges = ref_kept.reshape((2, ref_kept.size(0) // 2))

    assert to_set(ipu_edges) == to_set(ref_edges)


def op_harness(op, reference_op, *args, **kwargs):
    """Run ``reference_op`` eagerly and ``op`` via PopTorch, then compare.

    All remaining positional and keyword arguments are forwarded unchanged
    to both callables.
    """
    # The reference is evaluated before the PopTorch model is built.
    reference = reference_op(*args, **kwargs)

    class Model(torch.nn.Module):
        def forward(self, *args, **kwargs):
            return op(*args, **kwargs)

    ipu_model = poptorch.inferenceModel(Model())
    ipu_result = ipu_model(*args, **kwargs)

    assert_fn(reference, ipu_result)


@pytest.mark.parametrize("with_batch", [True, False])
def test_radius_basic(with_batch):
    """Check ``radius`` against ``torch_cluster.radius`` with ``r=1.5``."""
    x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
    y = torch.Tensor([[-1, 0], [1, 0]])
    batch_x = torch.tensor([0, 0, 0, 1]) if with_batch else None
    batch_y = torch.tensor([0, 1]) if with_batch else None

    op_harness(radius, torch_cluster.radius, x, y, 1.5, batch_x, batch_y)


def test_radius_upstream():
    """Check ``radius`` against ``torch_cluster.radius`` on eight points.

    Three configurations are run with ``r=2`` and ``max_num_neighbors=4``:
    no batch vectors, consecutive batch ids, and batch ids that skip 1.
    """
    x = torch.tensor([
        [-1, -1],
        [-1, +1],
        [+1, +1],
        [+1, -1],
        [-1, -1],
        [-1, +1],
        [+1, +1],
        [+1, -10],
    ])
    y = torch.tensor([
        [0, 0],
        [0, 1],
    ])

    def check(*batches):
        # Shared call; only the optional batch vectors vary between cases.
        op_harness(radius,
                   torch_cluster.radius,
                   x,
                   y,
                   2,
                   *batches,
                   max_num_neighbors=4)

    batch_x = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1], dtype=torch.long)
    batch_y = torch.tensor([0, 1], dtype=torch.long)

    check()
    check(batch_x, batch_y)

    # Skipping a batch
    batch_x = torch.tensor([0, 0, 0, 0, 2, 2, 2, 2], dtype=torch.long)
    batch_y = torch.tensor([0, 2], dtype=torch.long)
    check(batch_x, batch_y)


@pytest.mark.parametrize('flow', ['source_to_target', 'target_to_source'])
def test_radius_graph(flow):
    """Check ``radius_graph`` against ``torch_cluster.radius_graph``.

    The input is the four corners of a square, with self loops enabled and
    the parametrized edge direction.
    """
    corners = [
        [-1, -1],
        [-1, +1],
        [+1, +1],
        [+1, -1],
    ]
    x = torch.tensor(corners)
    options = {'r': 2.5, 'loop': True, 'flow': flow}

    op_harness(radius_graph, torch_cluster.radius_graph, x, **options)


@pytest.mark.ipuHardwareRequired
def test_radius_graph_large():
    """Check ``radius_graph`` on 1000 seeded random 3-D points."""
    # Fixed seed so that the random point cloud is reproducible.
    torch.manual_seed(40)
    points = torch.randn(1000, 3)
    options = {
        'r': 2.5,
        'loop': True,
        'flow': 'target_to_source',
        'max_num_neighbors': 2000,
    }

    op_harness(radius_graph, torch_cluster.radius_graph, points, **options)


================================================
FILE: tests/gnn/ops/test_spline_conv_ops.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

# Tests for PyG torch_spline_conv ops integration with PopTorch
from collections import namedtuple
from copy import deepcopy
import torch
import pytest
import helpers
import poptorch

# The real ops are imported only when helpers.is_running_tests is truthy;
# otherwise no-op placeholders with the same names are defined
# (presumably so that this module can be imported without
# torch_spline_conv -- TODO confirm).
if not helpers.is_running_tests:

    def spline_basis():
        """Placeholder used when the real op is not imported."""

    def spline_weighting():
        """Placeholder used when the real op is not imported."""

else:
    from torch_spline_conv import spline_basis, spline_weighting


def gen_basis_input_data(num_edges, num_dims, max_kernel_size, dtype):
    """Generate seeded random inputs for ``spline_basis``.

    Returns:
        A tuple ``(pseudo, kernel_size, is_open_spline)``: uniform values of
        shape ``(num_edges, num_dims)`` in ``dtype``, integers in
        ``[1, max_kernel_size)`` per dimension, and ``uint8`` flags in
        ``{0, 1}`` per dimension.
    """
    # Seed first; the three draws below must stay in this order.
    torch.manual_seed(0)
    per_dim = (num_dims, )
    pseudo = torch.rand((num_edges, num_dims), dtype=dtype)
    kernel_size = torch.randint(low=1, high=max_kernel_size, size=per_dim)
    is_open_spline = torch.randint(low=0,
                                   high=2,
                                   size=per_dim,
                                   dtype=torch.uint8)
    return (pseudo, kernel_size, is_open_spline)


# Parameter sets for test_spline_basis.
BasisParams = namedtuple('BasisParams',
                         ['edges', 'dims', 'max_kernel_size', 'degree'])
test_params_b = (
    BasisParams(edges=6, dims=2, max_kernel_size=6, degree=1),
    BasisParams(edges=64, dims=3, max_kernel_size=16, degree=3),
)


@pytest.mark.parametrize("params", test_params_b)
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
def test_spline_basis(params, dtype):
    """Compare ``spline_basis`` on the IPU with an eager float32 reference.

    The reference is computed on a float32 copy of the inputs and its basis
    output is cast to ``dtype`` before the comparison.
    """

    class Model(torch.nn.Module):
        def __init__(self, degree):
            super().__init__()
            self.degree = degree

        def forward(self, p, ks, ios):
            return spline_basis(p, ks, ios, self.degree)

    # The last entry is the spline degree; the rest size the input data.
    *data_args, degree = params
    pseudo, kernel_size, is_open_spline = gen_basis_input_data(
        *data_args, dtype)

    model = Model(degree)
    basis, weight_index = model(pseudo.type(torch.float32), kernel_size,
                                is_open_spline)
    reference_output = (basis.type(dtype), weight_index)

    poptorch_model = poptorch.inferenceModel(deepcopy(model))
    poptorch_output = poptorch_model(pseudo, kernel_size, is_open_spline)

    # Looser tolerances for half precision.
    if dtype == torch.float16:
        atol, rtol = 1e-3, 1e-5
    else:
        atol, rtol = 1e-5, 1e-8
    helpers.assert_allclose(actual=poptorch_output,
                            expected=reference_output,
                            atol=atol,
                            rtol=rtol)


def gen_weighting_input_data(edges, in_ch, out_ch, kernel_size, num_splines,
                             dtype):
    """Generate seeded random inputs for ``spline_weighting``.

    Returns:
        A tuple ``(x, weights, basis, weight_index)``: uniform tensors of
        shapes ``(edges, in_ch)``, ``(kernel_size, in_ch, out_ch)`` and
        ``(edges, num_splines)`` in ``dtype``, plus integer indices in
        ``[0, kernel_size)`` of shape ``(edges, num_splines)``.
    """
    # Seed first; the four draws below must stay in this order.
    torch.manual_seed(0)
    spline_shape = (edges, num_splines)
    x = torch.rand((edges, in_ch), dtype=dtype)
    weights = torch.rand((kernel_size, in_ch, out_ch), dtype=dtype)
    basis = torch.rand(spline_shape, dtype=dtype)
    weight_index = torch.randint(low=0, high=kernel_size, size=spline_shape)
    return (x, weights, basis, weight_index)


# Parameter sets for test_spline_weighting.
WeightingParams = namedtuple(
    'WeightingParams',
    ['edges', 'in_ch', 'out_ch', 'kernel_size', 'num_splines'])
test_params_w = (
    WeightingParams(edges=6, in_ch=4, out_ch=4, kernel_size=10,
                    num_splines=8),
    WeightingParams(edges=24, in_ch=5, out_ch=6, kernel_size=3,
                    num_splines=10),
)


@pytest.mark.parametrize("params", test_params_w)
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
def test_spline_weighting(params, dtype):
    """Compare ``spline_weighting`` on the IPU with a float32 reference.

    The reference is computed eagerly on float32 copies of the floating
    point inputs and cast to ``dtype`` before the comparison. The PopTorch
    model receives ``weight_index`` converted to ``int32``.
    """

    class Model(torch.nn.Module):
        def forward(self, x, weight, basis, weight_index):
            return spline_weighting(x, weight, basis, weight_index)

    x, weight, basis, weight_index = gen_weighting_input_data(*params, dtype)

    model = Model()
    float_inputs = [t.type(torch.float32) for t in (x, weight, basis)]
    reference_output = model(*float_inputs, weight_index)

    poptorch_model = poptorch.inferenceModel(deepcopy(model))
    ipu_weight_index = weight_index.type(torch.int32)
    poptorch_output = poptorch_model(x, weight, basis, ipu_weight_index)

    # Looser tolerances for half precision.
    if dtype == torch.float16:
        atol, rtol = 1e-2, 1e-3
    else:
        atol, rtol = 1e-5, 1e-8
    helpers.assert_allclose(actual=poptorch_output,
                            expected=reference_output.type(dtype),
                            atol=atol,
                            rtol=rtol)


================================================
FILE: tests/gnn/ops/test_to_dense_batch.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch
import torch_geometric
from torch_geometric.utils import to_dense_batch

import helpers
import poptorch


def op_harness(reference_op, *args, **kwargs):
    """Run to_dense_batch through poptorch and compare with `reference_op`.

    The wrapped module always calls torch_geometric.utils.to_dense_batch;
    `reference_op` is called directly with the same arguments to produce
    the expected result.
    """
    class DenseBatchModule(torch.nn.Module):
        def forward(self, *args, **kwargs):
            return torch_geometric.utils.to_dense_batch(*args, **kwargs)

    # The poptorch model is evaluated first, then the reference op.
    ipu_model = poptorch.inferenceModel(DenseBatchModule())
    actual = ipu_model(*args, **kwargs)
    expected = reference_op(*args, **kwargs)

    helpers.assert_allclose(actual=actual, expected=expected)


def test_basic():
    """to_dense_batch without a batch vector, with both sizes provided."""
    features = torch.arange(12).reshape(6, 2)

    op_harness(to_dense_batch, features, batch_size=1, max_num_nodes=11)


def test_batch_size_not_set():
    """Omitting `batch_size` is expected to raise a ValueError."""
    features = torch.arange(12).reshape(6, 2)
    assignment = torch.tensor([0, 0, 1, 2, 2, 2])
    expected_error = (
        "Dynamic shapes disabled. Argument 'batch_size' needs to be set")

    with pytest.raises(ValueError, match=expected_error):
        op_harness(to_dense_batch, features, assignment)


def test_batch_size_set():
    """Passing `batch_size` but not `max_num_nodes` is expected to raise a
    ValueError."""
    features = torch.arange(12).reshape(6, 2)
    assignment = torch.tensor([0, 0, 1, 2, 2, 2])
    expected_error = (
        "Dynamic shapes disabled. Argument 'max_num_nodes' needs to be set")

    with pytest.raises(ValueError, match=expected_error):
        op_harness(to_dense_batch, features, assignment, batch_size=3)


def test_batch_size_and_max_num_nodes_set():
    """With both `batch_size` and `max_num_nodes` given the poptorch result
    must match the native one."""
    features = torch.arange(12).reshape(6, 2)
    assignment = torch.tensor([0, 0, 1, 2, 2, 2])

    # The batch size is derived from the largest index in the batch vector.
    sizes = {
        'max_num_nodes': 11,
        'batch_size': int(assignment.max()) + 1,
    }
    op_harness(to_dense_batch, features, assignment, **sizes)


================================================
FILE: tests/gnn/test_basic_gnn.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import unittest.mock

import pytest
import torch
import torch.nn.functional as F
from torch_geometric import seed_everything
from torch_geometric.datasets import FakeDataset
from torch_geometric.nn.models import GAT, GCN, GIN, PNA, EdgeCNN, GraphSAGE
from torch_geometric.transforms import Compose, GCNNorm, NormalizeFeatures
from torch_geometric.utils import degree
from torch_scatter import scatter_add

import helpers
from poptorch_geometric import TrainingStepper, set_aggregation_dim_size


@pytest.fixture
def data():
    """Return the first graph of a seeded FakeDataset, extended with
    `num_classes` and an index-based `train_mask`."""
    seed_everything(0)
    fake_dataset = FakeDataset(transform=Compose(
        [GCNNorm(), NormalizeFeatures()]),
                               num_channels=64)
    graph = fake_dataset[0]
    graph.num_classes = fake_dataset.num_classes

    # Add a train_mask property that contains indices: a random 80% of
    # the node ids.
    train_count = int(0.8 * graph.num_nodes)
    shuffled_nodes = torch.randperm(graph.num_nodes)
    graph.train_mask = shuffled_nodes[:train_count]
    return graph


def node_classification_harness(gnn,
                                dataset,
                                num_steps=40,
                                atol=1e-4,
                                rtol=1e-5):
    """Run `gnn` on `dataset` for `num_steps` steps of a TrainingStepper
    built with the given tolerances."""
    class Wrapper(torch.nn.Module):
        """Bundles a GNN with a loss evaluated on the training nodes."""
        def __init__(self, model, loss_fn):
            super().__init__()
            self.model = model
            self.loss_fn = loss_fn

        def forward(self, x, edge_index, train_mask, y):
            log_probs = F.log_softmax(self.model(x, edge_index), dim=1)
            # Only the nodes selected by train_mask contribute to the loss.
            loss = self.loss_fn(log_probs[train_mask], y[train_mask])
            return log_probs, loss

    # The aggregation size is the largest node id in edge_index plus one.
    dim_size = int(dataset.edge_index.max()) + 1
    set_aggregation_dim_size(gnn, dim_size)

    wrapped = Wrapper(gnn, F.cross_entropy)
    stepper = TrainingStepper(wrapped, atol=atol, rtol=rtol)
    inputs = (dataset.x, dataset.edge_index, dataset.train_mask, dataset.y)
    stepper.run(num_steps, inputs)


@unittest.mock.patch.dict("os.environ", helpers.disableSmallModel())
def test_node_classification_GCN(data):
    """Node classification with a two-layer GCN built with
    normalize=False."""
    gcn_kwargs = {
        'in_channels': data.num_node_features,
        'hidden_channels': 32,
        'num_layers': 2,
        'out_channels': data.num_classes,
        'normalize': False,
    }

    node_classification_harness(GCN(**gcn_kwargs), data)


@unittest.mock.patch.dict("os.environ", helpers.disableSmallModel())
def test_node_classification_GraphSAGE(data):
    """Node classification with a two-layer GraphSAGE, compared with
    looser tolerances than the harness defaults."""
    sage_kwargs = {
        'in_channels': data.num_node_features,
        'hidden_channels': 32,
        'num_layers': 2,
        'out_channels': data.num_classes,
    }

    node_classification_harness(GraphSAGE(**sage_kwargs),
                                data,
                                atol=1e-3,
                                rtol=1e-2)


@unittest.mock.patch.dict("os.environ", helpers.disableSmallModel())
def test_node_classification_GIN(data):
    """Node classification with a two-layer GIN."""
    gin_kwargs = {
        'in_channels': data.num_node_features,
        'hidden_channels': 32,
        'num_layers': 2,
        'out_channels': data.num_classes,
    }

    node_classification_harness(GIN(**gin_kwargs), data)


@unittest.mock.patch.dict("os.environ", helpers.disableSmallModel())
def test_node_classification_GAT(data):
    """Node classification with a two-layer GAT built with
    add_self_loops=False."""
    gat_kwargs = {
        'in_channels': data.num_node_features,
        'hidden_channels': 32,
        'num_layers': 2,
        'out_channels': data.num_classes,
        'add_self_loops': False,
    }

    node_classification_harness(GAT(**gat_kwargs), data)


@unittest.mock.patch.dict("os.environ", helpers.disableSmallModel())
def test_node_classification_PNA(data):
    """Node classification with a two-layer PNA, run for a single step."""
    # Calculate the in-degree histogram: count how many nodes share each
    # in-degree value.
    in_degree = degree(data.edge_index[1]).long()
    histogram = scatter_add(torch.ones_like(in_degree), in_degree)

    pna_kwargs = {
        'in_channels': data.num_node_features,
        'hidden_channels': 32,
        'num_layers': 2,
        'out_channels': data.num_classes,
        'aggregators': ['sum', 'mean'],
        'scalers': ['linear'],
        'deg': histogram,
    }

    # TODO: investigate numerical drift with PNAConv
    node_classification_harness(PNA(**pna_kwargs), data, num_steps=1)


@unittest.mock.patch.dict("os.environ", helpers.disableSmallModel())
@pytest.mark.parametrize('act', [torch.nn.ReLU(), torch.relu_])
def test_node_classification_EdgeCNN(data, act):
    """Node classification with a two-layer EdgeCNN, run for a single step.

    The in-place `torch.relu_` activation variant is currently skipped.
    """
    # Identity check: `act` is either an nn.ReLU instance or the
    # torch.relu_ function object itself.
    if act is torch.relu_:
        # TODO: enable testing with the inplace relu_ op when this is supported
        pytest.skip(
            "Skipping testing inplace activation with dispatcher: "
            "RuntimeError: a leaf Variable that requires grad is being used "
            "in an in-place operation.")

    gnn = EdgeCNN(in_channels=data.num_node_features,
                  hidden_channels=32,
                  num_layers=2,
                  out_channels=data.num_classes,
                  dropout=0,
                  act=act,
                  norm=None,
                  jk=None)

    node_classification_harness(gnn, data, num_steps=1)


================================================
FILE: tests/gnn/test_cluster_loader.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import pytest
from torch_geometric import seed_everything
from torch_geometric.datasets import FakeDataset
from torch_geometric.loader import ClusterData

from poptorch_geometric.cluster_loader import \
    FixedSizeClusterLoader as IPUFixedSizeClusterLoader
from poptorch_geometric.fixed_size_options import FixedSizeOptions
from poptorch_geometric.pyg_cluster_loader import FixedSizeClusterLoader
import poptorch


@pytest.mark.parametrize('loader_cls',
                         [FixedSizeClusterLoader, IPUFixedSizeClusterLoader])
@pytest.mark.parametrize('batch_size', [1, 2, 4])
@pytest.mark.parametrize('task', ['graph', 'node'])
def test_fixed_size_dataloader_with_cluster_data(loader_cls, batch_size,
                                                 benchmark, task):
    """Check that both cluster loaders produce batches padded to the
    requested sizes, then benchmark a full pass over the loader."""
    avg_degree = 3
    num_parts = 8
    seed_everything(42)

    fake_dataset = FakeDataset(
        num_graphs=1,
        avg_num_nodes=128,
        avg_degree=avg_degree,
        num_channels=4,
        task=task,
    )
    dataset = fake_dataset[0]

    # Get a sensible value for the maximum number of nodes and edges.
    padded_num_nodes = dataset.num_nodes // num_parts * batch_size + 10
    padded_num_edges = (avg_degree + 5) * padded_num_nodes

    cluster_data = ClusterData(dataset, num_parts=num_parts, log=False)

    # Define the expected tensor sizes in the output as
    # attribute -> (size, dimension holding that size).
    data = cluster_data.data
    expected_sizes = {}
    for key, _ in data:
        if data.is_node_attr(key):
            expected_sizes[key] = (padded_num_nodes, 0)
        elif data.is_edge_attr(key):
            expected_sizes[key] = (padded_num_edges, 0)

    # Special case for edge_index which is of shape [2, num_edges].
    expected_sizes['edge_index'] = (padded_num_edges, 1)

    # Special case for `y` being graph-lvl label
    if not data.is_node_attr('y'):
        expected_sizes['y'] = (2, 0)

    # Create a fixed size dataloader.
    fixed_size_options = FixedSizeOptions(num_nodes=padded_num_nodes,
                                          num_edges=padded_num_edges,
                                          num_graphs=2)
    kwargs = {
        'cluster_data': cluster_data,
        'fixed_size_options': fixed_size_options,
        'batch_size': batch_size,
    }
    # Only the IPU loader is given poptorch options.
    if loader_cls is IPUFixedSizeClusterLoader:
        kwargs['options'] = poptorch.Options()

    loader = loader_cls(**kwargs)

    # Check that each batch matches the expected size.
    for batch in loader:
        for key, (size, dim) in expected_sizes.items():
            assert getattr(batch, key).shape[dim] == size

    def loop():
        for _ in loader:
            pass

    benchmark(loop)


================================================
FILE: tests/gnn/test_collate.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest

import torch

import torch_geometric as pyg
from torch_geometric.data import Data, HeteroData
from torch.utils.data import BatchSampler, RandomSampler, SequentialSampler

from utils import is_data
from poptorch_geometric.collate import FixedSizeCollater
from poptorch_geometric.fixed_size_options import FixedSizeOptions

# pylint: disable=protected-access


@pytest.fixture(params=[Data, HeteroData])
def _get_test_data(request, molecule, fake_hetero_data):
    """Return ``(data type, sample)`` for each of Data and HeteroData.

    The node and edge counts of the sample are asserted here because the
    tests using this fixture pick their budgets around those sizes.
    """
    if is_data(request.param):
        assert molecule.num_nodes == 29
        assert molecule.num_edges == 56
        return request.param, molecule

    # The heterogeneous sample additionally gets a `name` entry.
    fake_hetero_data['name'] = 'gdb_57518'
    assert fake_hetero_data.num_nodes == 103
    assert fake_hetero_data.num_edges == 2391
    return request.param, fake_hetero_data


@pytest.mark.parametrize('num_graphs,num_real_graphs', [(10, 8), (2, 1)])
@pytest.mark.parametrize('num_edges', [300, None])
@pytest.mark.parametrize('set_pad_values', [True, False])
def test_batch_masks(num_graphs, num_real_graphs, num_edges, set_pad_values):
    """Check the graph, node and edge masks and the padding values of
    batches collated by FixedSizeCollater from homogeneous graphs."""
    dataset = pyg.datasets.FakeDataset(num_graphs=16,
                                       avg_num_nodes=10,
                                       avg_degree=2,
                                       num_channels=8,
                                       edge_dim=2,
                                       task='graph')

    # Padding values the batch is checked against; 0.0 is expected when
    # no explicit value is passed to FixedSizeOptions.
    if set_pad_values:
        node_pad_value, edge_pad_value, graph_pad_value = 22.0, 34.0, 55.0
    else:
        node_pad_value = edge_pad_value = graph_pad_value = 0.0

    num_batch_nodes = 100
    # When num_edges is not given the test expects
    # num_nodes * (num_nodes - 1) edge slots in the batch.
    if num_edges is None:
        num_batch_edges = num_batch_nodes * (num_batch_nodes - 1)
    else:
        num_batch_edges = num_edges
    num_batch_graphs = num_graphs

    options_kwargs = {
        'num_nodes': num_batch_nodes,
        'num_edges': num_edges,
        'num_graphs': num_graphs,
    }
    if set_pad_values:
        options_kwargs.update(node_pad_value=node_pad_value,
                              edge_pad_value=edge_pad_value,
                              graph_pad_value=graph_pad_value)
    fixed_size_collater = FixedSizeCollater(
        fixed_size_options=FixedSizeOptions(**options_kwargs),
        add_masks_to_batch=True)

    batch_sampler = BatchSampler(SequentialSampler(dataset),
                                 num_real_graphs,
                                 drop_last=False)

    for batch_idx, sample in enumerate(batch_sampler):
        num_real_nodes = sum(dataset[idx].num_nodes for idx in sample)
        num_real_edges = sum(dataset[idx].num_edges for idx in sample)
        result = fixed_size_collater([dataset[idx] for idx in sample])

        # Check graph values
        assert len(result.graphs_mask) == num_batch_graphs
        assert int(result.graphs_mask.sum()) == num_real_graphs

        for slot, mask in enumerate(result.graphs_mask):
            if mask.item() is not True:
                assert result.y[slot] == graph_pad_value
                continue
            # The sampler is sequential, so the dataset index follows from
            # the batch index and the slot inside the batch.
            source = dataset[batch_idx * num_real_graphs + slot]
            assert source.y[0] == result.y[slot]

        # Check nodes values
        assert len(result.nodes_mask) == num_batch_nodes
        assert int(result.nodes_mask.sum()) == num_real_nodes

        node_offset = 0
        for idx in sample:
            node_stop = node_offset + dataset[idx].num_nodes
            assert torch.all(result.nodes_mask[node_offset:node_stop])
            assert torch.equal(result.x[node_offset:node_stop],
                               dataset[idx].x)
            node_offset = node_stop

        # Everything after the real nodes must be masked out and padded.
        assert not torch.any(result.nodes_mask[node_offset:])
        assert torch.all(result.x[node_offset:] == node_pad_value)

        # Check edges values
        assert len(result.edges_mask) == num_batch_edges
        assert int(result.edges_mask.sum()) == num_real_edges

        edge_offset = 0
        for idx in sample:
            edge_stop = edge_offset + dataset[idx].num_edges
            assert torch.all(result.edges_mask[edge_offset:edge_stop])
            assert torch.equal(result.edge_attr[edge_offset:edge_stop],
                               dataset[idx].edge_attr)
            edge_offset = edge_stop

        # Everything after the real edges must be masked out and padded.
        assert not torch.any(result.edges_mask[edge_offset:])
        assert torch.all(result.edge_attr[edge_offset:] == edge_pad_value)


@pytest.mark.parametrize('num_graphs,num_real_graphs', [(6, 2), (4, 2),
                                                        (2, 1)])
@pytest.mark.parametrize('num_edges', [1200, None])
@pytest.mark.parametrize('set_pad_values', [True, False])
def test_batch_masks_heterodata(num_graphs, num_real_graphs, num_edges,
                                set_pad_values, fake_hetero_dataset):
    """Check the graph, node and edge masks of batches collated by
    FixedSizeCollater from heterogeneous graphs."""
    dataset = fake_hetero_dataset
    num_node_types = 2
    num_edge_types = 5

    node_pad_value = 22.0 if set_pad_values else 0.0
    edge_pad_value = 34.0 if set_pad_values else 0.0
    graph_pad_value = 55.0 if set_pad_values else 0.0

    # Budget handed to FixedSizeOptions; the checks below expect it to
    # apply to every node type separately.
    nodes_per_type = 150

    options_kwargs = {
        'num_nodes': nodes_per_type,
        'num_edges': num_edges,
        'num_graphs': num_graphs,
    }
    if set_pad_values:
        options_kwargs.update(node_pad_value=node_pad_value,
                              edge_pad_value=edge_pad_value,
                              graph_pad_value=graph_pad_value)
    fixed_size_collater = FixedSizeCollater(
        fixed_size_options=FixedSizeOptions(**options_kwargs),
        add_masks_to_batch=True)

    # When num_edges is not given the test expects
    # num_nodes * (num_nodes - 1) edge slots per edge type.
    edges_per_type = (nodes_per_type * (nodes_per_type - 1)
                      if num_edges is None else num_edges)
    num_batch_edges = edges_per_type * num_edge_types
    num_batch_graphs = num_graphs
    num_batch_nodes = nodes_per_type * num_node_types

    batch_sampler = BatchSampler(SequentialSampler(dataset),
                                 num_real_graphs,
                                 drop_last=False)

    for i, sample in enumerate(batch_sampler):
        result = fixed_size_collater([dataset[idx] for idx in sample])

        # Check graph values
        assert len(result.graphs_mask) == num_batch_graphs
        assert int(result.graphs_mask.sum()) == num_real_graphs

        for j, mask in enumerate(result.graphs_mask):
            if mask.item() is True:
                assert dataset[i * num_real_graphs + j].y[0] == result.y[j]
            else:
                assert result.y[j] == graph_pad_value

        # Check nodes values
        assert sum(node_type.nodes_mask.shape[0]
                   for node_type in result.node_stores) == num_batch_nodes

        for key in result.node_types:
            num_real_nodes = sum(
                dataset[idx]._node_store_dict[key]['x'].shape[0]
                for idx in sample)
            nodes_mask = result._node_store_dict[key]['nodes_mask']
            assert torch.all(nodes_mask[0:num_real_nodes])
            assert not torch.any(nodes_mask[num_real_nodes:])
            x = result._node_store_dict[key]['x']
            # NOTE(review): this only requires that at least one padded
            # entry equals node_pad_value; a check that every padded entry
            # equals it may have been intended - confirm before tightening.
            assert not torch.all(x[num_real_nodes:] - node_pad_value)

        # Check edges values
        assert sum(edge_type.edges_mask.shape[0]
                   for edge_type in result.edge_stores) == num_batch_edges

        for key in result.edge_types:
            num_real_edges = sum(
                dataset[idx]._edge_store_dict[key]['edge_index'].shape[1]
                for idx in sample)
            edges_mask = result._edge_store_dict[key]['edges_mask']
            assert torch.all(edges_mask[0:num_real_edges])
            assert not torch.any(edges_mask[num_real_edges:])


def test_prune_nodes_single_input(_get_test_data):
    """Pruning the nodes of a single sample must leave exactly the
    requested number of nodes."""
    type_, dataset = _get_test_data
    homogeneous = is_data(type_)

    if homogeneous:
        fixed_size_options = FixedSizeOptions(num_nodes=10, num_graphs=2)
    else:
        fixed_size_options = FixedSizeOptions(num_nodes={
            'v0': 10,
            'v1': 5
        },
                                              num_graphs=2)
        fixed_size_options.to_hetero(dataset.node_types, dataset.edge_types)

    fixed_size_collater = FixedSizeCollater(fixed_size_options)
    result = fixed_size_collater._prune_nodes([dataset])
    assert len(result) == 1
    pruned = result[0]

    if homogeneous:
        assert pruned.num_nodes == fixed_size_options.num_nodes
        assert pruned.x.shape[0] == fixed_size_options.num_nodes
        assert pruned.pos.shape[0] == fixed_size_options.num_nodes
        return

    # Heterogeneous case: check the total and every node type separately.
    assert pruned.num_nodes == fixed_size_options.total_num_nodes
    for node_type, expected_val in fixed_size_options.num_nodes.items():
        assert pruned[node_type].num_nodes == expected_val
        assert pruned[node_type].x.shape[0] == expected_val


def test_prune_nodes_multiple_inputs(_get_test_data):
    """Pruning the nodes of several samples must leave the requested total
    number of nodes, with every sample keeping at least one node."""
    type_, dataset = _get_test_data

    num_inputs = 4
    input = [dataset] * num_inputs
    if is_data(type_):
        fixed_size_options = FixedSizeOptions(num_nodes=80,
                                              num_graphs=num_inputs + 1)
    else:
        fixed_size_options = FixedSizeOptions(num_nodes=dict(v0=80, v1=40),
                                              num_graphs=num_inputs + 1)
        fixed_size_options.to_hetero(dataset.node_types, dataset.edge_types)

    fixed_size_collater = FixedSizeCollater(fixed_size_options)
    result = fixed_size_collater._prune_nodes(input)
    num_nodes = 0
    for data in result:
        # Check each sample on its own: asserting on the running total
        # would be trivially true after the first non-empty sample.
        assert data.num_nodes > 0
        num_nodes += data.num_nodes

    assert num_nodes == fixed_size_options.total_num_nodes


def test_prune_nodes_multiple_inputs_minimal_num_node(_get_test_data):
    """Prune three samples down to a three-node budget (per node type in
    the heterogeneous case); every sample must keep at least one node."""
    type_, dataset = _get_test_data

    num_inputs = 3
    samples = [dataset] * num_inputs
    num_graphs = num_inputs + 1

    if is_data(type_):
        fixed_size_options = FixedSizeOptions(num_nodes=3,
                                              num_graphs=num_graphs)
    else:
        fixed_size_options = FixedSizeOptions(num_nodes={
            'v0': 3,
            'v1': 3
        },
                                              num_graphs=num_graphs)
        fixed_size_options.to_hetero(dataset.node_types, dataset.edge_types)

    fixed_size_collater = FixedSizeCollater(fixed_size_options)

    result = fixed_size_collater._prune_nodes(samples)
    assert len(result) == num_inputs

    total_nodes = 0
    for pruned in result:
        total_nodes += pruned.num_nodes
        assert pruned.num_nodes > 0

    assert total_nodes == fixed_size_options.total_num_nodes


def test_prune_edges_single_input(_get_test_data):
    """Pruning the edges of a single sample must leave exactly the
    requested number of edges and the full node budget."""
    type_, dataset = _get_test_data
    homogeneous = is_data(type_)

    if homogeneous:
        fixed_size_options = FixedSizeOptions(num_nodes=dataset.num_nodes,
                                              num_edges=40)
    else:
        # The node budgets equal the sample's own node counts.
        nodes_per_type = {
            'v0': dataset["v0"].num_nodes,
            'v1': dataset["v1"].num_nodes,
        }
        edges_per_type = {
            ("v0", "e0", "v1"): 40,
            ("v0", "e0", "v0"): 30,
            ("v1", "e0", "v0"): 30,
            ("v0", "e1", "v1"): 40,
            ("v1", "e0", "v1"): 50,
        }
        fixed_size_options = FixedSizeOptions(num_nodes=nodes_per_type,
                                              num_edges=edges_per_type)

    fixed_size_collator = FixedSizeCollater(fixed_size_options)

    result = fixed_size_collator._prune_edges([dataset])

    assert len(result) == 1
    pruned = result[0]
    assert pruned.num_nodes == fixed_size_options.total_num_nodes
    assert pruned.num_edges == fixed_size_options.total_num_edges

    if homogeneous:
        assert pruned.x.shape[0] == fixed_size_options.num_nodes
        assert pruned.pos.shape[0] == fixed_size_options.num_nodes
        assert pruned.edge_attr.shape[0] == fixed_size_options.num_edges
        assert pruned.edge_index.shape[1] == fixed_size_options.num_edges
        return

    for edge_type, expected_num in fixed_size_options.num_edges.items():
        assert pruned[edge_type].edge_index.shape[1] == expected_num

def test_prune_edges_multiple_inputs(_get_test_data):
    """Pruning the edges of several samples must leave the requested total
    edge count, with every sample keeping some nodes and edges."""
    type_, dataset = _get_test_data

    num_inputs = 4
    samples = [dataset] * num_inputs

    if is_data(type_):
        fixed_size_options = FixedSizeOptions(
            num_nodes=dataset.num_nodes * num_inputs,
            num_edges=80,
            num_graphs=num_inputs + 1)
    else:
        # The node budgets cover every node of all the copies.
        nodes_per_type = {
            'v0': dataset["v0"].num_nodes * num_inputs,
            'v1': dataset["v1"].num_nodes * num_inputs,
        }
        edges_per_type = {
            ("v0", "e0", "v1"): 80,
            ("v0", "e0", "v0"): 120,
            ("v1", "e0", "v0"): 90,
            ("v0", "e1", "v1"): 100,
            ("v1", "e0", "v1"): 80,
        }
        fixed_size_options = FixedSizeOptions(num_nodes=nodes_per_type,
                                              num_edges=edges_per_type)

    fixed_size_collator = FixedSizeCollater(fixed_size_options)

    result = fixed_size_collator._prune_edges(samples)
    assert len(result) == num_inputs

    total_nodes = 0
    total_edges = 0
    for pruned in result:
        assert pruned.num_nodes > 0
        total_nodes += pruned.num_nodes

        assert pruned.num_edges > 0
        total_edges += pruned.num_edges

    assert total_nodes == fixed_size_options.total_num_nodes
    assert total_edges == fixed_size_options.total_num_edges


def test_prune_nodes_multiple_inputs_minimal_num_edges(_get_test_data):
    """Prune the edges of three samples and check the resulting totals.

    Unlike test_prune_edges_multiple_inputs, individual samples are not
    required to keep any edge.
    """
    type_, dataset = _get_test_data

    num_inputs = 3
    samples = [dataset] * num_inputs

    if is_data(type_):
        fixed_size_options = FixedSizeOptions(
            num_nodes=dataset.num_nodes * num_inputs,
            num_edges=80,
            num_graphs=num_inputs + 1)
    else:
        # The node budgets cover every node of all the copies.
        nodes_per_type = {
            'v0': dataset["v0"].num_nodes * num_inputs,
            'v1': dataset["v1"].num_nodes * num_inputs,
        }
        edges_per_type = {
            ("v0", "e0", "v1"): 80,
            ("v0", "e0", "v0"): 120,
            ("v1", "e0", "v0"): 90,
            ("v0", "e1", "v1"): 100,
            ("v1", "e0", "v1"): 80,
        }
        fixed_size_options = FixedSizeOptions(num_nodes=nodes_per_type,
                                              num_edges=edges_per_type)

    fixed_size_collator = FixedSizeCollater(fixed_size_options)

    result = fixed_size_collator._prune_edges(samples)
    assert len(result) == num_inputs

    total_nodes = 0
    total_edges = 0
    for pruned in result:
        assert pruned.num_nodes > 0
        total_nodes += pruned.num_nodes
        total_edges += pruned.num_edges

    assert total_nodes == fixed_size_options.total_num_nodes
    assert total_edges == fixed_size_options.total_num_edges


def test_prune_nodes_multiple_inputs_should_throw_exception(_get_test_data):
    """A node budget smaller than the number of samples is expected to make
    _prune_nodes raise a RuntimeError."""
    type_, dataset = _get_test_data

    num_inputs = 3
    samples = [dataset] * num_inputs

    # One node fewer than there are samples.
    fixed_size_options = FixedSizeOptions(num_nodes=num_inputs - 1,
                                          num_graphs=num_inputs + 1)
    if not is_data(type_):
        fixed_size_options.to_hetero(dataset.node_types, dataset.edge_types)
    fixed_size_collater = FixedSizeCollater(fixed_size_options)

    with pytest.raises(RuntimeError):
        fixed_size_collater._prune_nodes(samples)


@pytest.mark.parametrize('data_type,fixed_size_hetero', [(Data, False),
                                                         (HeteroData, False),
                                                         (HeteroData, True)])
def test_prune_nodes_fixed_size_collater(data_type, fixed_size_hetero,
                                         fake_hetero_dataset):
    """Collate random batches with trim_nodes=True and check that every
    batch has the sizes requested in the FixedSizeOptions."""
    batch_size = 10
    homogeneous = is_data(data_type)

    if homogeneous:
        dataset = pyg.datasets.FakeDataset(num_graphs=99,
                                           avg_num_nodes=30,
                                           avg_degree=5,
                                           num_channels=16,
                                           edge_dim=8)
    else:
        dataset = fake_hetero_dataset

    # The node budget is either one number or one number per node type.
    node_budget = dict(v0=800, v1=800) if fixed_size_hetero else 800
    fixed_size_options = FixedSizeOptions(num_nodes=node_budget,
                                          num_graphs=batch_size + 1)

    fixed_size_collater = FixedSizeCollater(fixed_size_options,
                                            trim_nodes=True)
    batch_sampler = BatchSampler(RandomSampler(dataset),
                                 batch_size,
                                 drop_last=False)
    for sample in batch_sampler:
        result = fixed_size_collater([dataset[idx] for idx in sample])
        total_nodes = fixed_size_options.total_num_nodes
        total_edges = fixed_size_options.total_num_edges
        assert result.num_nodes == total_nodes
        assert result.num_edges == total_edges

        if homogeneous:
            assert result.batch.shape[0] == total_nodes
            assert result.x.shape[0] == total_nodes
            assert result.edge_attr.shape[0] == total_edges
            assert result.edge_index.shape[1] == total_edges
            continue

        # Heterogeneous batches are checked per node and per edge type.
        for node_type, expected_val in fixed_size_options.num_nodes.items():
            assert result[node_type].num_nodes == expected_val
            assert result[node_type].x.shape[0] == expected_val
        for edge_type, expected_num in fixed_size_options.num_edges.items():
            assert result[edge_type].edge_index.shape[1] == expected_num


@pytest.mark.parametrize('data_type,fixed_size_hetero', [(Data, False),
                                                         (HeteroData, False),
                                                         (HeteroData, True)])
def test_prune_edges_fixed_size_collator(data_type, fixed_size_hetero,
                                         fake_hetero_dataset):
    """Collate random batches with trim_edges=True and check that every
    batch has the sizes requested in the FixedSizeOptions."""
    batch_size = 10
    homogeneous = is_data(data_type)

    if homogeneous:
        avg_num_nodes = 30
        dataset = pyg.datasets.FakeDataset(num_graphs=99,
                                           avg_num_nodes=avg_num_nodes,
                                           avg_degree=5,
                                           num_channels=16,
                                           edge_dim=8)
    else:
        avg_num_nodes = 60
        dataset = fake_hetero_dataset

    # Node budget: twice the batch size times the average used above.
    node_budget = avg_num_nodes * (batch_size * 2)
    if fixed_size_hetero:
        edges_per_type = {
            ("v0", "e0", "v1"): 80,
            ("v0", "e0", "v0"): 120,
            ("v1", "e0", "v0"): 90,
            ("v0", "e1", "v1"): 100,
            ("v1", "e0", "v1"): 80,
        }
        fixed_size_options = FixedSizeOptions(num_nodes={
            'v0': node_budget,
            'v1': node_budget
        },
                                              num_edges=edges_per_type,
                                              num_graphs=batch_size + 1)
    else:
        fixed_size_options = FixedSizeOptions(num_nodes=node_budget,
                                              num_edges=30,
                                              num_graphs=batch_size + 1)

    fixed_size_collator = FixedSizeCollater(fixed_size_options,
                                            trim_edges=True)
    batch_sampler = BatchSampler(RandomSampler(dataset),
                                 batch_size,
                                 drop_last=False)
    for sample in batch_sampler:
        result = fixed_size_collator([dataset[idx] for idx in sample])
        total_nodes = fixed_size_options.total_num_nodes
        total_edges = fixed_size_options.total_num_edges

        assert result.num_nodes == total_nodes
        assert result.num_edges == total_edges

        if homogeneous:
            assert result.batch.shape[0] == total_nodes
            assert result.x.shape[0] == total_nodes
            assert result.edge_attr.shape[0] == total_edges
            assert result.edge_index.shape[1] == total_edges
            continue

        # Heterogeneous batches are checked per node and per edge type.
        for node_type, expected_val in fixed_size_options.num_nodes.items():
            assert result[node_type].num_nodes == expected_val
            assert result[node_type].x.shape[0] == expected_val
        for edge_type, expected_num in fixed_size_options.num_edges.items():
            assert result[edge_type].edge_index.shape[1] == expected_num


@pytest.mark.parametrize('data_type,fixed_size_hetero', [(Data, False),
                                                         (HeteroData, False),
                                                         (HeteroData, True)])
def test_prune_data_fixed_size_collator(data_type, fixed_size_hetero,
                                        fake_hetero_dataset):
    """Collate random mini-batches with both node and edge trimming enabled
    and check that every result has exactly the configured fixed size.

    Args:
        data_type: ``Data`` or ``HeteroData``; selects a generated
            homogeneous ``FakeDataset`` or the ``fake_hetero_dataset``
            fixture.
        fixed_size_hetero: When true the fixed sizes are given per node type
            and per edge type, otherwise as single integers.
        fake_hetero_dataset: Fixture used as the dataset for the
            heterogeneous cases.
    """
    batch_size = 10

    if is_data(data_type):
        dataset = pyg.datasets.FakeDataset(num_graphs=99,
                                           avg_num_nodes=30,
                                           avg_degree=5,
                                           num_channels=16,
                                           edge_dim=8)
    else:
        dataset = fake_hetero_dataset

    if fixed_size_hetero:
        fixed_size_options = FixedSizeOptions(num_nodes=dict(v0=200, v1=100),
                                              num_edges={
                                                  ("v0", "e0", "v1"): 80,
                                                  ("v0", "e0", "v0"): 120,
                                                  ("v1", "e0", "v0"): 90,
                                                  ("v0", "e1", "v1"): 100,
                                                  ("v1", "e0", "v1"): 80,
                                              },
                                              num_graphs=batch_size + 1)
    else:
        fixed_size_options = FixedSizeOptions(num_nodes=200,
                                              num_edges=30,
                                              num_graphs=batch_size + 1)

    # Precondition: every graph must have at least one edge (of every edge
    # type for heterogeneous data) before it is handed to the collater.
    for data in dataset:
        if is_data(data_type):
            assert data.edge_index.shape[1] > 0
        else:
            for edge_store in data.edge_stores:
                assert edge_store['edge_index'].shape[1] > 0

    fixed_size_collator = FixedSizeCollater(fixed_size_options,
                                            trim_nodes=True,
                                            trim_edges=True)
    batch_sampler = BatchSampler(RandomSampler(dataset),
                                 batch_size,
                                 drop_last=False)
    for sample in batch_sampler:
        # `idx` rather than `id`, which would shadow the builtin.
        result = fixed_size_collator([dataset[idx] for idx in sample])

        assert result.num_nodes == fixed_size_options.total_num_nodes
        assert result.num_edges == fixed_size_options.total_num_edges

        if is_data(data_type):
            assert result.batch.shape[0] == fixed_size_options.total_num_nodes
            assert result.x.shape[0] == fixed_size_options.total_num_nodes
            assert result.edge_attr.shape[
                0] == fixed_size_options.total_num_edges
            assert result.edge_index.shape[
                1] == fixed_size_options.total_num_edges
        else:
            # Heterogeneous results are checked per node and per edge type.
            for node_type, expected_val in fixed_size_options.num_nodes.items(
            ):
                assert result[node_type].num_nodes == expected_val
                assert result[node_type].x.shape[0] == expected_val
            for edge_type, expected_num in fixed_size_options.num_edges.items(
            ):
                assert result[edge_type].edge_index.shape[1] == expected_num


def test_valid_args_fixed_size_collater(_get_test_data):
    """A list of graphs is collated without error, while the same graphs
    passed as a tuple are rejected with a ``TypeError``."""
    _, graph = _get_test_data

    copies = 3
    options = FixedSizeOptions(num_nodes=graph.num_nodes * copies,
                               num_graphs=copies + 1)
    collater = FixedSizeCollater(options)
    graphs = [graph] * copies

    # The list form must simply succeed.
    collater(graphs)

    with pytest.raises(TypeError, match='Expected list, got tuple.'):
        collater(tuple(graphs))


def test_fixed_size_collater_should_include_non_tensor_keys_in_pad_graph(
        _get_test_data):
    """Collating a single graph must yield two entries for the non-tensor
    ``name`` key and for the scalar ``scalar_key``, both repeating the value
    of the input graph."""
    _, graph = _get_test_data

    graph['scalar_key'] = 2
    options = FixedSizeOptions(num_nodes=graph.num_nodes * 3)
    collater = FixedSizeCollater(options)
    collated = collater([graph])

    assert collated.name == ['gdb_57518', 'gdb_57518']
    expected_scalars = torch.Tensor([2, 2])
    assert torch.equal(collated.scalar_key, expected_scalars)


def test_fixed_size_collater_should_assign_default_pad_values(_get_test_data):
    """Values given through ``pad_graph_defaults`` must appear as the second
    entry of the corresponding keys in the collated result, after the value
    taken from the single input graph."""
    _, graph = _get_test_data

    padded_num_nodes = graph.num_nodes * 3
    graph['scalar_key'] = 2

    options = FixedSizeOptions(num_nodes=padded_num_nodes,
                               pad_graph_defaults={
                                   'name': 'pad_graph',
                                   'scalar_key': 3
                               })
    collated = FixedSizeCollater(options)([graph])

    assert collated.name == ['gdb_57518', 'pad_graph']
    expected_scalars = torch.Tensor([2, 3])
    assert torch.equal(collated.scalar_key, expected_scalars)


@pytest.mark.parametrize('num_nodes,num_edges,error_type',
                         [(10, 10000, 'nodes'), (10000, 10, 'edges')])
def test_fixed_size_collater_wrong_size_exceptions(_get_test_data, num_nodes,
                                                   num_edges, error_type):
    """Collating with a node or edge budget of only 10 must raise a
    ``RuntimeError`` whose message names the exhausted resource.

    Args:
        _get_test_data: Fixture whose second element is the graph to collate.
        num_nodes: Fixed number of nodes requested for the batch.
        num_edges: Fixed number of edges requested for the batch.
        error_type: ``'nodes'`` or ``'edges'``; the word expected at the end
            of the error message.
    """
    _, dataset = _get_test_data

    num_inputs = 4
    # Named `input_list` (as in the sibling tests) instead of `input`, which
    # would shadow the builtin.
    input_list = [dataset] * num_inputs
    fixed_size_options = FixedSizeOptions(num_nodes=num_nodes,
                                          num_edges=num_edges,
                                          num_graphs=num_inputs + 1)

    fixed_size_collater = FixedSizeCollater(fixed_size_options)

    error_contains = (
        r"The fixed sizes given don't allocate enough space for the"
        fr" number of .* {error_type}")

    with pytest.raises(RuntimeError, match=error_contains):
        # TODO: Be more specific about error
        fixed_size_collater(input_list)


================================================
FILE: tests/gnn/test_dataloader.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import inspect
import pickle

from functools import singledispatch

import pytest
import torch
from torch_geometric.data import Batch, Data, HeteroData
from torch_geometric.datasets import FakeDataset
from torch_geometric.transforms import Pad

import utils
from utils import is_data
from poptorch_geometric.stream_packing_sampler import StreamPackingSampler
from poptorch_geometric.collate import CombinedBatchingCollater, make_exclude_keys
from poptorch_geometric.dataloader import DataLoader as IPUDataLoader
from poptorch_geometric.dataloader import \
    FixedSizeDataLoader as IPUFixedSizeDataLoader
from poptorch_geometric.dataloader import FixedSizeStrategy, OverSizeStrategy
from poptorch_geometric.fixed_size_options import FixedSizeOptions
from poptorch_geometric.pyg_collate import Collater
from poptorch_geometric.pyg_dataloader import (DataLoader, FixedSizeDataLoader)
from poptorch_geometric.types import PyGArgsParser
from poptorch_geometric.common import DataBatch, HeteroDataBatch

import poptorch

# pylint: disable=protected-access


@singledispatch
def _compare_batches(batch_actual, batch_expected):
    raise ValueError(f'Unsupported data type: {type(batch_actual)}')


@_compare_batches.register
def _(batch_actual: DataBatch, batch_expected: DataBatch):
    """Assert that every key of ``batch_expected`` holds an equal value in
    ``batch_actual``.

    Tensors are compared with ``torch.equal``; everything else with ``==``.
    """
    for key in batch_expected.keys:
        expected_value = batch_expected[key]
        actual_value = batch_actual[key]
        if not isinstance(expected_value, torch.Tensor):
            assert actual_value == expected_value
            continue
        assert torch.equal(actual_value, expected_value)


@_compare_batches.register
def _(batch_actual: HeteroDataBatch, batch_expected: HeteroDataBatch):
    """Assert that two heterogeneous batches hold equal values.

    Global store values are compared with ``==``; node and edge store values
    are compared through their ``tolist()`` representation. Stores and values
    are paired positionally with ``zip``.
    """

    def assert_stores_match(actual_stores, expected_stores):
        for actual_store, expected_store in zip(actual_stores,
                                                expected_stores):
            value_pairs = zip(actual_store.values(), expected_store.values())
            for actual_value, expected_value in value_pairs:
                assert actual_value.tolist() == expected_value.tolist()

    global_pairs = zip(batch_actual._global_store.values(),
                       batch_expected._global_store.values())
    for actual_value, expected_value in global_pairs:
        assert actual_value == expected_value

    assert_stores_match(batch_actual.node_stores, batch_expected.node_stores)
    assert_stores_match(batch_actual.edge_stores, batch_expected.edge_stores)


@pytest.mark.parametrize('dataset',
                         ['fake_small_dataset', 'fake_hetero_dataset'])
def test_batch_serialization(dataset, request):
    """A batch built from one sample must survive a pickle round trip
    unchanged."""
    samples = request.getfixturevalue(dataset)
    original = Batch.from_data_list([samples[0]])
    restored = pickle.loads(pickle.dumps(original))
    _compare_batches(restored, original)


@pytest.mark.parametrize('dataset',
                         ['fake_small_dataset', 'fake_hetero_dataset'])
def test_custom_batch_parser(dataset, request):
    """Feeding the tensors yielded by ``PyGArgsParser.yieldTensors`` back
    into ``reconstruct`` must reproduce the original batch."""
    samples = request.getfixturevalue(dataset)
    original = Batch.from_data_list([samples[0]])

    parser = PyGArgsParser()
    tensors = parser.yieldTensors(original)
    rebuilt = parser.reconstruct(original, tensors)

    _compare_batches(rebuilt, original)


@pytest.mark.parametrize('data', ['molecule', 'fake_hetero_data'])
def test_collater(data, request):
    """Collate a single sample while excluding every key except a chosen
    set, then check that only those keys remain and that their values are
    unchanged.

    Args:
        data: Name of the fixture providing the sample to collate.
        request: pytest ``request`` fixture used to resolve ``data``.
    """
    data = request.getfixturevalue(data)
    if isinstance(data, Data):
        include_keys = ('x', 'y', 'z')
    else:
        # One-element tuple: a bare ('x') is just the string 'x'.
        include_keys = ('x', )

    exclude_keys = make_exclude_keys(include_keys, data)
    collate_fn = Collater(exclude_keys=exclude_keys)
    batch = collate_fn([data])
    data_type = type(data)
    assert isinstance(batch, type(Batch(_base_cls=data_type)))
    # 'ptr', 'batch' and 'edge_index' are ignored when counting the keys
    # that survived the exclusion.
    batch_keys = [
        key for key in batch.keys
        if key not in ('ptr', 'batch', 'edge_index')
    ]

    assert len(batch_keys) == len(include_keys)

    for key in include_keys:
        if is_data(data_type):
            # Item access and attribute access must both return the value.
            utils.assert_equal(actual=batch[key], expected=getattr(data, key))
            utils.assert_equal(actual=getattr(batch, key),
                               expected=getattr(data, key))
        else:
            for b_store, d_store in zip(batch.node_stores, data.node_stores):
                utils.assert_equal(actual=b_store[key],
                                   expected=getattr(d_store, key))
                utils.assert_equal(actual=getattr(b_store, key),
                                   expected=getattr(d_store, key))


@pytest.mark.parametrize('data', ['molecule', 'fake_hetero_data'])
def test_multiple_collater(data, request):
    r"""Two collaters configured with different excluded keys must not
    influence each other: each batch exposes only its own included key."""
    data = request.getfixturevalue(data)

    first_keys = ('x', )
    second_keys = ('z', )
    first_exclude = make_exclude_keys(first_keys, data)
    second_exclude = make_exclude_keys(second_keys, data)

    first_batch = Collater(exclude_keys=first_exclude)([data])
    second_batch = Collater(exclude_keys=second_exclude)([data])

    homogeneous = is_data(type(data))
    for first_key, second_key in zip(first_keys, second_keys):
        assert first_key in first_batch.keys
        assert second_key not in first_batch.keys
        assert first_key not in second_batch.keys
        # Presence of the second key is only asserted for homogeneous data.
        if homogeneous:
            assert second_key in second_batch.keys


@pytest.mark.parametrize('data', ['molecule', 'fake_hetero_data'])
def test_collater_invalid_keys(data, request):
    """Excluding the keys 'v' and 'name' must leave all remaining keys of
    the sample in the collated batch with unchanged values."""
    data = request.getfixturevalue(data)
    if isinstance(data, Data):
        expected_keys = [
            'edge_index', 'pos', 'y', 'idx', 'z', 'edge_attr', 'x'
        ]
    else:
        # Give the heterogeneous sample a graph-level attribute to check.
        data['y'] = torch.zeros(1)
        expected_keys = ['edge_index', 'x', 'y']

    data_type = type(data)

    collate_fn = Collater(exclude_keys=('v', 'name'))
    batch = collate_fn([data])
    assert isinstance(batch, type(Batch(_base_cls=data_type)))

    batch_keys = [key for key in batch.keys if key not in ('ptr', 'batch')]
    assert len(expected_keys) == len(batch_keys)

    def assert_stores_equal(batch_stores, data_stores, key):
        # Item access and attribute access are both exercised.
        for batch_store, data_store in zip(batch_stores, data_stores):
            utils.assert_equal(actual=batch_store[key],
                               expected=getattr(data_store, key))
            utils.assert_equal(actual=getattr(batch_store, key),
                               expected=getattr(data_store, key))

    if is_data(data_type):
        for key in expected_keys:
            utils.assert_equal(actual=batch[key], expected=getattr(data, key))
            utils.assert_equal(actual=getattr(batch, key),
                               expected=getattr(data, key))
    else:
        # Each key lives in a different kind of store.
        assert_stores_equal(batch.edge_stores, data.edge_stores, 'edge_index')
        assert_stores_equal(batch.node_stores, data.node_stores, 'x')
        assert_stores_equal((batch._global_store, ), (data._global_store, ),
                            'y')


@pytest.mark.parametrize('data', ['molecule', 'fake_hetero_data'])
@pytest.mark.parametrize('mini_batch_size', [1, 16])
def test_combined_batching_collater(mini_batch_size, data, request):
    """Collate enough copies of one sample for four mini-batches and check
    the leading sizes of every tensor in the combined batch."""
    data = request.getfixturevalue(data)

    # Simulates 4 replicas.
    num_replicas = 4
    data_list = [data] * (num_replicas * mini_batch_size)

    collate_fn = CombinedBatchingCollater(mini_batch_size=mini_batch_size,
                                          collater=Collater())
    batch = collate_fn(data_list)

    for key, value in batch.items():
        if not isinstance(value, torch.Tensor):
            continue

        if key == 'batch':
            assert value.shape[0] == sum(d.num_nodes for d in data_list)
        elif key == 'ptr':
            assert value.shape[0] == (mini_batch_size + 1) * num_replicas
        elif key == 'edge_index':
            # The two index rows of each mini-batch are stacked along dim 0.
            assert value.shape[0] == num_replicas * 2
            assert value.shape[
                1] == data.edge_index.shape[1] * mini_batch_size
        else:
            assert value.shape[0] == sum(d[key].shape[0] for d in data_list)


def test_combined_batching_collater_invalid(molecule):
    """Nine samples with a mini-batch size of eight must be rejected with an
    'Invalid batch size' assertion."""
    mini_batch_size = 8
    collate_fn = CombinedBatchingCollater(mini_batch_size=mini_batch_size,
                                          collater=Collater())
    too_many = [molecule] * 9

    with pytest.raises(AssertionError, match='Invalid batch size'):
        collate_fn(too_many)


def test_simple_fixed_size_data_loader_mro(num_graphs=2, num_nodes=40):
    """Pin the leading entries of the method resolution order of both fixed
    size dataloaders.

    Other classes inherit from `FixedSizeDataLoader` and would be affected
    if the MRO changed. The full MRO is longer; only its head is compared.
    """

    def assert_mro_starts_with(instance, expected_head):
        mro = inspect.getmro(type(instance))
        assert mro[:len(expected_head)] == expected_head

    dataset = FakeDataset(num_graphs=num_graphs, avg_num_nodes=30)
    fixed_size_options = FixedSizeOptions(num_nodes=num_nodes,
                                          num_graphs=num_graphs)

    pyg_dataloader = FixedSizeDataLoader(dataset,
                                         fixed_size_options=fixed_size_options,
                                         batch_size=num_graphs)
    assert_mro_starts_with(
        pyg_dataloader, (FixedSizeDataLoader, torch.utils.data.DataLoader))

    ipu_dataloader = IPUFixedSizeDataLoader(
        dataset=dataset,
        fixed_size_options=fixed_size_options,
        batch_size=num_graphs)
    assert_mro_starts_with(
        ipu_dataloader,
        (IPUFixedSizeDataLoader, FixedSizeDataLoader, poptorch.DataLoader,
         torch.utils.data.DataLoader))


@pytest.mark.parametrize('loader', [
    FixedSizeDataLoader,
    dict(loader_cls=IPUFixedSizeDataLoader, device_iterations=3),
    dict(loader_cls=IPUFixedSizeDataLoader)
])
@pytest.mark.parametrize(
    'fixed_size_strategy',
    [FixedSizeStrategy.PadToMax, FixedSizeStrategy.StreamPack])
@pytest.mark.parametrize('dataset', ['pyg_qm9', 'fake_node_task_dataset'])
def test_fixed_size_dataloader(loader,
                               fixed_size_strategy,
                               benchmark,
                               dataset,
                               request,
                               batch_size=10):
    """Check the tensor sizes of the first ten batches produced by a fixed
    size dataloader, then benchmark fetching ten batches.

    ``loader`` is either the plain ``FixedSizeDataLoader`` class or a dict
    holding the IPU loader class and, optionally, the device iterations.
    """
    dataset = request.getfixturevalue(dataset)

    is_ipu_loader = loader is not FixedSizeDataLoader
    if is_ipu_loader:
        device_iterations = loader.get('device_iterations',
                                       poptorch.Options().device_iterations)
    else:
        device_iterations = 1

    # Every expected size below is checked along this dimension.
    leading_dim = 0

    # Derive generous padded sizes from the first sample of the dataset.
    sample = dataset[0]
    padded_num_nodes = sample.num_nodes * (batch_size + 20)
    padded_num_edges = sample.num_edges * padded_num_nodes

    # Expected size of each node-level and edge-level attribute.
    expected_sizes = {}
    for key, _ in sample():
        if sample.is_node_attr(key):
            padded_size = padded_num_nodes
        elif sample.is_edge_attr(key):
            padded_size = padded_num_edges
        else:
            continue
        expected_sizes[key] = (padded_size * device_iterations, leading_dim)

    # `edge_index` has shape [2, num_edges], so its leading size differs.
    expected_sizes['edge_index'] = (device_iterations * 2, leading_dim)

    # When `y` is not a node attribute it is treated as a graph-level label.
    if not sample.is_node_attr('y'):
        expected_sizes['y'] = (batch_size * device_iterations, leading_dim)

    loader_kwargs = dict(
        dataset=dataset,
        batch_size=batch_size,
        fixed_size_options=FixedSizeOptions(num_nodes=padded_num_nodes,
                                            num_edges=padded_num_edges,
                                            num_graphs=batch_size),
        fixed_size_strategy=fixed_size_strategy,
    )

    if is_ipu_loader:
        options = poptorch.Options()
        options.deviceIterations(device_iterations=device_iterations)
        loader_kwargs['options'] = options
        loader_cls = loader['loader_cls']
    else:
        loader_cls = loader

    data_loader = loader_cls(**loader_kwargs)

    repeats = 10
    # `device_iterations` is 1 for the plain loader, so these expectations
    # cover both loader kinds.
    expected_batch_size = [device_iterations * padded_num_nodes]
    expected_ptr_size = [device_iterations * (batch_size + 1)]
    # The size of `ptr` is not checked for the stream packing strategy.
    check_ptr_size = fixed_size_strategy != FixedSizeStrategy.StreamPack

    batches = iter(data_loader)
    for _ in range(repeats):
        batch = next(batches)
        assert hasattr(batch, 'batch')
        assert hasattr(batch, 'ptr')

        assert list(batch.batch.size()) == expected_batch_size
        if check_ptr_size:
            assert list(batch.ptr.size()) == expected_ptr_size

        assert all(
            getattr(batch, key).shape[dim] == size
            for key, (size, dim) in expected_sizes.items())

    def fetch_batches():
        fresh_batches = iter(data_loader)
        for _ in range(repeats):
            next(fresh_batches)

    benchmark(fetch_batches)


@pytest.mark.parametrize('loader', [
    FixedSizeDataLoader,
    dict(loader_cls=IPUFixedSizeDataLoader, device_iterations=3),
    dict(loader_cls=IPUFixedSizeDataLoader)
])
@pytest.mark.parametrize(
    'fixed_size_strategy',
    [FixedSizeStrategy.PadToMax, FixedSizeStrategy.StreamPack])
@pytest.mark.parametrize(
    'dataset', ['fake_hetero_dataset', 'fake_node_task_hetero_dataset'])
@pytest.mark.parametrize('fixed_size_options,requires_trimming',
                         [(FixedSizeOptions(
                             num_nodes={
                                 "v0": 500,
                                 "v1": 1000,
                             },
                             num_edges={
                                 ("v0", "e0", "v1"): 5000,
                                 ("v0", "e0", "v0"): 6000,
                                 ("v1", "e0", "v0"): 7000,
                                 ("v0", "e1", "v1"): 8000,
                                 ("v1", "e0", "v1"): 9000,
                             },
                             num_graphs=10,
                         ), False),
                          (FixedSizeOptions(
                              num_nodes=1000,
                              num_edges={
                                  ("v0", "e0", "v1"): 5000,
                                  ("v0", "e0", "v0"): 6000,
                                  ("v1", "e0", "v0"): 7000,
                                  ("v0", "e1", "v1"): 8000,
                                  ("v1", "e0", "v1"): 9000,
                              },
                              num_graphs=10,
                          ), False),
                          (FixedSizeOptions(
                              num_nodes={
                                  "v0": 500,
                                  "v1": 1000,
                              },
                              num_edges=8000,
                              num_graphs=10,
                          ), False),
                          (FixedSizeOptions(
                              num_nodes={
                                  "v0": 100,
                                  "v1": 200,
                              },
                              num_edges={
                                  ("v0", "e0", "v1"): 2000,
                                  ("v0", "e0", "v0"): 300,
                                  ("v1", "e0", "v0"): 1000,
                                  ("v0", "e1", "v1"): 100,
                                  ("v1", "e0", "v1"): 3000,
                              },
                              num_graphs=10,
                          ), True)])
def test_fixed_size_heterodataloader(
        loader,
        fixed_size_strategy,
        benchmark,
        dataset,
        fixed_size_options,
        requires_trimming,
        request,
):
    """Check the sizes of every batch produced by a fixed size dataloader
    over heterogeneous data, then benchmark a full pass over the loader.

    When ``requires_trimming`` is set, the loader built without an over-size
    strategy must raise a ``RuntimeError`` on its first batch; the checks
    then run on a loader that trims nodes and edges.
    """
    dataset = request.getfixturevalue(dataset)
    is_ipu_loader = loader is not FixedSizeDataLoader

    batch_size = fixed_size_options.num_graphs

    if is_ipu_loader:
        device_iterations = loader.get('device_iterations',
                                       poptorch.Options().device_iterations)
    else:
        device_iterations = 1

    loader_kwargs = dict(
        dataset=dataset,
        batch_size=batch_size,
        fixed_size_options=fixed_size_options,
        fixed_size_strategy=fixed_size_strategy,
    )

    if is_ipu_loader:
        options = poptorch.Options()
        options.deviceIterations(device_iterations=device_iterations)
        loader_kwargs['options'] = options
        loader_cls = loader['loader_cls']
    else:
        loader_cls = loader

    fixed_size_loader = loader_cls(**loader_kwargs)

    if requires_trimming:
        with pytest.raises(RuntimeError):
            next(iter(fixed_size_loader))
        fixed_size_loader = loader_cls(
            over_size_strategy=OverSizeStrategy.TrimNodesAndEdges,
            **loader_kwargs)

    check_ptr_size = fixed_size_strategy != FixedSizeStrategy.StreamPack

    for batch in fixed_size_loader:
        for node_store in filter(is_iterable, batch.node_stores):
            check_batch_and_ptr(node_store)

        assert batch.num_nodes == fixed_size_options.total_num_nodes
        assert batch.num_edges == fixed_size_options.total_num_edges
        assert 'num_nodes' not in batch.node_types
        assert 'num_edges' not in batch.edge_types

        if 'y' in batch._node_store_dict.keys():
            assert batch.y.shape[0] == batch_size * device_iterations
        assert batch.graphs_mask.shape[0] == batch_size * device_iterations

        # The per-type `batch` vectors together cover all padded nodes.
        batch_vector_total = sum(
            node_store.batch.shape[0]
            for node_store in filter(is_iterable, batch.node_stores))
        assert batch_vector_total == (fixed_size_options.total_num_nodes *
                                      device_iterations)

        if check_ptr_size:
            ptr_sizes = {
                node_store.ptr.shape[0]
                for node_store in filter(is_iterable, batch.node_stores)
            }
            assert ptr_sizes == {device_iterations * (batch_size + 1)}

        # Check sizes for some of the items in the batch
        for node_type in fixed_size_options.num_nodes:
            expected_nodes = (fixed_size_options.num_nodes[node_type] *
                              device_iterations)
            node_store = batch[node_type]
            assert node_store.x.shape[0] == expected_nodes
            assert node_store.batch.shape[0] == expected_nodes
            assert node_store.nodes_mask.shape[0] == expected_nodes

        for edge_type in fixed_size_options.num_edges:
            expected_edges = fixed_size_options.num_edges[edge_type]
            # The number of edges is the second dimension of `edge_index`,
            # so it is not a multiple of device iterations.
            assert batch[edge_type].edge_index.shape[1] == expected_edges
            assert batch[edge_type].edges_mask.shape[
                0] == expected_edges * device_iterations

    def exhaust_loader():
        for _ in fixed_size_loader:
            pass

    benchmark(exhaust_loader)


@pytest.mark.parametrize('num_edges', [None, 500])
@pytest.mark.parametrize('num_graphs', [2, 10])
@pytest.mark.parametrize(
    'fixed_size_strategy',
    [FixedSizeStrategy.PadToMax, FixedSizeStrategy.StreamPack])
def test_dataloader_trims_to_fixed_sizes(num_edges, num_graphs,
                                         fixed_size_strategy,
                                         fake_molecular_dataset):
    """With the trimming over-size strategy every batch of the loader must
    have the same tensor shapes as the first batch."""
    dataset = fake_molecular_dataset[:123]

    fixed_size_options = FixedSizeOptions(num_nodes=num_graphs * 30,
                                          num_edges=num_edges,
                                          num_graphs=num_graphs)

    train_dataloader = FixedSizeDataLoader(
        dataset,
        fixed_size_options=fixed_size_options,
        batch_size=num_graphs,
        fixed_size_strategy=fixed_size_strategy,
        over_size_strategy=OverSizeStrategy.TrimNodesAndEdges)

    # The first batch is the reference all other batches are compared to.
    reference = next(iter(train_dataloader))
    tensor_attrs = [
        attr for attr in reference.keys
        if isinstance(reference[attr], torch.Tensor)
    ]

    for current in train_dataloader:
        for attr in tensor_attrs:
            assert reference[attr].shape == current[attr].shape


def is_iterable(src):
    """Return ``True`` when ``src`` exposes an ``__iter__`` attribute."""
    missing = object()
    return getattr(src, '__iter__', missing) is not missing


def check_batch_and_ptr(src):
    """Assert that ``src`` contains both the 'batch' and the 'ptr' key."""
    for required_key in ('batch', 'ptr'):
        assert required_key in src


@pytest.mark.parametrize('dataset',
                         ['fake_molecular_dataset', 'fake_hetero_dataset'])
def test_dataloader(dataset, request, batch_size=10):
    """Every batch of the dataloader must report the totals of the samples
    it was built from and split back into those samples via
    ``to_data_list``."""
    dataset = request.getfixturevalue(dataset)
    loader = DataLoader(dataset=dataset, batch_size=batch_size)

    def assert_same_dtypes(original, new):
        # Recurses through nested mappings down to the tensors.
        if isinstance(original, torch.Tensor):
            assert original.dtype == new.dtype
            return
        for original_item, new_item in zip(original.values(), new.values()):
            assert_same_dtypes(original_item, new_item)

    for idx, batch in enumerate(loader):
        if isinstance(batch, HeteroDataBatch):
            for node_store in filter(is_iterable, batch.node_stores):
                check_batch_and_ptr(node_store)
        else:
            check_batch_and_ptr(batch)

        # Samples of the dataset this batch is expected to be built from.
        idx_range = slice(idx * batch_size, (idx + 1) * batch_size)
        assert batch.num_graphs == batch_size
        assert batch.num_nodes == sum(d.num_nodes for d in dataset[idx_range])
        assert batch.num_edges == sum(d.num_edges for d in dataset[idx_range])

        # Split the batch back into samples and compare each one with the
        # matching sample of the dataset.
        data_list = batch.to_data_list()

        for original, new in zip(dataset[idx_range], data_list):
            assert set(new.keys) == set(original.keys)

            value_pairs = zip(original.to_dict().values(),
                              new.to_dict().values())
            for original_value, new_value in value_pairs:
                assert_same_dtypes(original_value, new_value)

            for key in original.keys:
                if isinstance(original[key], torch.Tensor):
                    assert torch.all(torch.eq(new[key], original[key]))
                else:
                    assert new[key] == original[key]


@pytest.mark.parametrize('dataset',
                         ['fake_molecular_dataset', 'fake_hetero_dataset'])
@pytest.mark.parametrize('device_iterations', [None, 3])
def test_pad_transform_with_dataloader(
        device_iterations,
        dataset,
        request,
        batch_size=3,
):
    """Tests the pattern of using a Pad transform and a non-fixed-size
       data loader as an approach to achieve fixed size batches.

       Each batch of the IPU dataloader is compared against the
       corresponding batches of PyG's dataloader over the same dataset."""
    dataset = request.getfixturevalue(dataset)
    is_hetero = isinstance(dataset[0], HeteroData)

    def rows_of(b_idx, tensor):
        # Rows of the combined tensor that belong to PyG batch `b_idx`.
        num_rows = tensor.shape[0]
        return slice(b_idx * num_rows, (b_idx + 1) * num_rows)

    def check_hetero(b_idx, torch_batch, batch):
        store_pairs = zip(torch_batch.node_stores, batch.node_stores)
        for torch_store, store in store_pairs:
            assert set(torch_store.keys()) == set(store.keys())
            for key in torch_store.keys():
                if not isinstance(torch_store[key], torch.Tensor):
                    assert store[key] == torch_store[key]
                    continue
                rows = rows_of(b_idx, torch_store[key])
                assert all((store[key][rows] == torch_store[key]).tolist())

    def check_homogeneous(b_idx, torch_batch, batch):
        assert set(torch_batch.keys) <= set(batch.keys)
        for key in torch_batch.keys:
            if not isinstance(torch_batch[key], torch.Tensor):
                assert batch[key] == torch_batch[key]
                continue
            rows = rows_of(b_idx, torch_batch[key])
            if isinstance(batch[key], torch.Tensor):
                assert all((batch[key][rows] == torch_batch[key]).tolist())
            else:
                # A tensor in PyG's batch may be a plain value here; it is
                # then compared against the sum of the tensor's elements.
                assert sum(torch_batch[key].tolist()) == batch[key]

    if is_hetero:
        max_num_nodes = 300
        max_num_edges = 1500
        check = check_hetero
    else:
        max_num_nodes = 30
        max_num_edges = 150
        dataset = dataset[:123]
        check = check_homogeneous

    dataset.transform = Pad(max_num_nodes=max_num_nodes,
                            max_num_edges=max_num_edges)

    options = poptorch.Options()
    if device_iterations is not None:
        options.deviceIterations(device_iterations=device_iterations)

    loader = IPUDataLoader(dataset=dataset,
                           batch_size=batch_size,
                           options=options)

    # PyG's own dataloader provides the batches to compare against.
    pyg_loader = DataLoader(dataset=dataset, batch_size=batch_size)
    torch_loader_iter = iter(pyg_loader)

    # Number of PyG batches consumed per batch of the IPU loader.
    num_batches = device_iterations or 1

    for idx, batch in enumerate(loader):
        if is_hetero:
            for node_store in filter(is_iterable, batch.node_stores):
                check_batch_and_ptr(node_store)
        else:
            check_batch_and_ptr(batch)

        # Check that each batch matches the expected size.
        idx_range = slice(idx * batch_size, (idx + 1) * batch_size)
        assert batch.num_graphs == batch_size
        assert batch.num_nodes == sum(d.num_nodes for d in dataset[idx_range])
        assert batch.num_edges == sum(d.num_edges for d in dataset[idx_range])

        # Compare batches from PyG's and PopPyG's dataloaders.
        torch_batches = [next(torch_loader_iter) for _ in range(num_batches)]

        for b_idx, torch_batch in enumerate(torch_batches):
            check(b_idx, torch_batch, batch)


@pytest.mark.parametrize('dataset',
                         ['fake_molecular_dataset', 'fake_hetero_dataset'])
@pytest.mark.parametrize('allow_skip_data', [True, False])
def test_dataloader_with_sampler_num_nodes(allow_skip_data, dataset, request):
    """Batches drawn through a ``StreamPackingSampler`` must be padded to
    the number of nodes in the fixed size options, once per node type."""
    # Resolved from the fixture name, before it is replaced by the dataset.
    num_node_types = 2 if dataset == 'fake_hetero_dataset' else 1
    dataset = request.getfixturevalue(dataset)

    if isinstance(dataset[0], Data):
        dataset = dataset[:10]
        sampler_num_nodes = 100
    else:
        sampler_num_nodes = 1000

    sampler = StreamPackingSampler(dataset,
                                   max_num_graphs=1,
                                   max_num_nodes=sampler_num_nodes,
                                   allow_skip_data=allow_skip_data)

    # The fixed size allows one node more than the sampler may pack.
    padded_num_nodes = sampler_num_nodes + 1
    fixed_size_options = FixedSizeOptions(num_nodes=padded_num_nodes)

    dataloader = FixedSizeDataLoader(dataset,
                                     fixed_size_options=fixed_size_options,
                                     batch_sampler=sampler)

    expected_num_nodes = padded_num_nodes * num_node_types
    for batch in dataloader:
        assert batch.num_nodes == expected_num_nodes


@pytest.mark.parametrize('create_loader',
                         [FixedSizeDataLoader, IPUFixedSizeDataLoader])
def test_fixed_size_dataloader_num_created_batches_stream_packing(
        create_loader):
    """Check how many batches stream packing creates under different limits.

    A 100-graph fake dataset is loaded with several combinations of graph,
    node and edge limits; for each combination the number of batches the
    loader yields is compared with the expected count.
    """
    graph_count = 100
    ds = FakeDataset(num_graphs=graph_count, avg_num_nodes=10)
    node_total = sum(graph.num_nodes for graph in ds)
    edge_total = sum(graph.num_edges for graph in ds)

    # Each scenario is (batch size, node/edge limits, expected batch count).
    # The batch size is also used as the `num_graphs` limit.
    scenarios = (
        # Loader should create 10 batches of 11 graphs each (10 real + 1
        # padding graph).
        (11, {
            'num_nodes': node_total
        }, 10),
        # Loader should create only 1 batch since there is space for all
        # graphs and one padding graph.
        (101, {
            'num_nodes': node_total + 1,
            'num_edges': edge_total + 1
        }, 1),
        # There is no space for padding graph in the first batch (not
        # enough graphs) so loader should create two batches.
        (100, {
            'num_nodes': node_total + 1,
            'num_edges': edge_total + 1
        }, 2),
        # There is no space for padding graph in the first batch (not
        # enough nodes) so loader should create two batches.
        (101, {
            'num_nodes': node_total,
            'num_edges': edge_total + 1
        }, 2),
        # There is no space for padding graph in the first batch (not
        # enough edges) so loader should create two batches.
        (101, {
            'num_nodes': node_total + 1,
            'num_edges': edge_total
        }, 2),
    )

    for batch_size, limits, expected_num_batches in scenarios:
        fixed_size_options = FixedSizeOptions(num_graphs=batch_size, **limits)
        loader = create_loader(
            ds,
            batch_size=batch_size,
            fixed_size_options=fixed_size_options,
            fixed_size_strategy=FixedSizeStrategy.StreamPack)
        batches_created = sum(1 for _ in loader)

        assert batches_created == expected_num_batches


def test_fixed_size_dataloader_with_default_values(fake_large_dataset):
    """Exercise `FixedSizeDataLoader` without explicit fixed-size options.

    Two configurations are checked: the loader on its own, and the loader
    driven by a `StreamPackingSampler`. Both must yield 10 batches, and in
    the sampler case every batch must hold one node more than the sampler's
    `max_num_nodes`.
    """
    graphs_per_batch = 10
    padded_batch_size = graphs_per_batch + 1
    expected_batches = 10

    # The default value of `num_nodes` should be large enough so it's
    # possible to always pick 10 graphs and create additional padding graph.
    default_loader = FixedSizeDataLoader(fake_large_dataset,
                                         batch_size=padded_batch_size)
    assert expected_batches == sum(1 for _ in default_loader)

    # DataLoader should correctly capture the number of nodes from sampler.
    sampler = StreamPackingSampler(fake_large_dataset,
                                   max_num_graphs=graphs_per_batch)
    sampler_loader = FixedSizeDataLoader(fake_large_dataset,
                                         batch_size=padded_batch_size,
                                         batch_sampler=sampler)

    seen_batches = 0
    for seen_batches, batch in enumerate(sampler_loader, start=1):
        assert batch.num_nodes == sampler.max_num_nodes + 1
    assert expected_batches == seen_batches


@pytest.mark.parametrize('create_loader',
                         [FixedSizeDataLoader, IPUFixedSizeDataLoader])
def test_fixed_size_dataloader_with_custom_batch_sampler(create_loader):
    """Check loader behaviour when the caller supplies a batch sampler.

    Passing a custom batch sampler together with the stream packing
    strategy must raise `ValueError`. Without the strategy argument,
    `FixedSizeDataLoader` must accept the sampler and yield one batch per
    sampler entry.
    """
    total_num_graphs = 20
    batch_size = 5
    ds = FakeDataset(num_graphs=total_num_graphs, avg_num_nodes=10)

    class DummySampler:
        """Minimal batch sampler that always selects graph 0."""

        def __init__(self, data_source, batch_size):
            self.data_source = data_source
            self.batch_size = batch_size

        def __len__(self):
            return len(self.data_source) // self.batch_size

        def __iter__(self):
            yield from ([0] * self.batch_size for _ in range(len(self)))

    # 20 graphs in groups of 4 gives 5 sampler entries.
    sampler = DummySampler(ds, batch_size - 1)

    with pytest.raises(ValueError):
        create_loader(ds,
                      batch_size=batch_size,
                      batch_sampler=sampler,
                      fixed_size_strategy=FixedSizeStrategy.StreamPack)

    loader = FixedSizeDataLoader(ds,
                                 batch_size=batch_size,
                                 batch_sampler=sampler)

    assert sum(1 for _ in loader) == 5


================================================
FILE: tests/gnn/test_encoding.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import torch
from torch_geometric.nn import PositionalEncoding, TemporalEncoding
from gnn.nn.nn_utils import op_harness


def test_positional_encoding():
    """Run a 64-channel `PositionalEncoding` through the op harness."""
    op_harness(PositionalEncoding(64), [torch.tensor([1.0, 2.0, 3.0])])


def test_temporal_encoding():
    """Run a 64-channel `TemporalEncoding` through the op harness."""
    op_harness(TemporalEncoding(64), [torch.tensor([1.0, 2.0, 3.0])])


================================================
FILE: tests/gnn/test_fixed_size_options.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest

import torch_geometric as pyg
from torch_geometric.data import HeteroData
from torch_geometric.loader import DataLoader, NeighborLoader

from poptorch_geometric.fixed_size_options import FixedSizeOptions


@pytest.mark.parametrize('dataset,expected_result',
                         [('fake_large_dataset',
                           FixedSizeOptions(
                               num_nodes=109,
                               num_edges=1099,
                               num_graphs=10,
                           )),
                          ('fake_hetero_dataset',
                           FixedSizeOptions(
                               num_nodes={
                                   "v0": 559,
                                   "v1": 559
                               },
                               num_edges={
                                   ("v0", "e0", "v0"): 5212,
                                   ("v1", "e0", "v1"): 5176,
                                   ("v0", "e0", "v1"): 5239,
                                   ("v1", "e0", "v0"): 5149,
                                   ("v0", "e1", "v1"): 5176,
                               },
                               num_graphs=10,
                           ))])
def test_fixed_size_options_from_dataset(dataset, expected_result, request):
    """Compare `FixedSizeOptions.from_dataset` against known-good values.

    The options are derived twice, once with default arguments and once
    with an explicit `sample_limit`, and both results must match the
    expected node, edge and graph counts.
    """
    graphs = request.getfixturevalue(dataset)
    batch_size = 10

    # First pass uses the defaults, second pass sets a sample limit.
    for extra_kwargs in ({}, {'sample_limit': 10000}):
        derived = FixedSizeOptions.from_dataset(graphs, batch_size,
                                                **extra_kwargs)

        assert derived.num_nodes == expected_result.num_nodes
        assert derived.num_edges == expected_result.num_edges
        assert derived.num_graphs == expected_result.num_graphs


@pytest.mark.parametrize('dataset,expected_result',
                         [('fake_large_dataset',
                           FixedSizeOptions(
                               num_nodes=116,
                               num_edges=1015,
                               num_graphs=11,
                           )),
                          ('fake_hetero_dataset',
                           FixedSizeOptions(
                               num_nodes={
                                   "v0": 543,
                                   "v1": 523
                               },
                               num_edges={
                                   ("v0", "e0", "v0"): 4950,
                                   ("v1", "e0", "v1"): 4766,
                                   ("v0", "e0", "v1"): 4897,
                                   ("v1", "e0", "v0"): 4667,
                                   ("v0", "e1", "v1"): 4914,
                               },
                               num_graphs=11,
                           ))])
def test_fixed_size_options_from_dataloader(dataset, expected_result, request):
    """Compare `FixedSizeOptions.from_loader` against known-good values.

    A fresh, unshuffled PyG `DataLoader` is built for each pass. The first
    pass uses default arguments and the second sets an explicit
    `sample_limit`; both must produce the expected counts.
    """
    graphs = request.getfixturevalue(dataset)
    batch_size = 10

    # First pass uses the defaults, second pass sets a sample limit.
    for extra_kwargs in ({}, {'sample_limit': 1000}):
        dataloader = DataLoader(graphs, batch_size=batch_size, shuffle=False)
        derived = FixedSizeOptions.from_loader(dataloader, **extra_kwargs)

        assert derived.num_nodes == expected_result.num_nodes
        assert derived.num_edges == expected_result.num_edges
        assert derived.num_graphs == expected_result.num_graphs


@pytest.mark.parametrize('dataset,expected_result',
                         [('fake_node_task_dataset',
                           FixedSizeOptions(
                               num_nodes=13,
                               num_edges=61,
                               num_graphs=2,
                           )),
                          ('fake_node_task_hetero_dataset',
                           FixedSizeOptions(
                               num_nodes={
                                   "v0": 62,
                                   "v1": 43
                               },
                               num_edges={
                                   ("v0", "e0", "v0"): 146,
                                   ("v1", "e0", "v1"): 115,
                                   ("v0", "e0", "v1"): 116,
                                   ("v1", "e0", "v0"): 139,
                                   ("v0", "e1", "v1"): 116,
                               },
                               num_graphs=2,
                           ))])
def test_fixed_size_options_from_sample_dataloader(dataset, expected_result,
                                                   request):
    """Compare `FixedSizeOptions.from_loader` on a `NeighborLoader`.

    The first graph of the fixture is sampled with a seeded
    `NeighborLoader`. The options are derived once with default arguments
    and once with an explicit `sample_limit`; the seed is reset before each
    pass, and both passes must produce the expected counts.
    """
    graphs = request.getfixturevalue(dataset)
    # Hetero graphs need the seed node type spelled out.
    if isinstance(graphs[0], HeteroData):
        input_nodes = ("v0", None)
    else:
        input_nodes = None

    # First pass uses the defaults, second pass sets a sample limit.
    for extra_kwargs in ({}, {'sample_limit': 1000}):
        pyg.seed_everything(42)
        dataloader = NeighborLoader(graphs[0], [5, 5],
                                    batch_size=5,
                                    shuffle=False,
                                    input_nodes=input_nodes)

        derived = FixedSizeOptions.from_loader(dataloader, **extra_kwargs)

        assert derived.num_nodes == expected_result.num_nodes
        assert derived.num_edges == expected_result.num_edges
        assert derived.num_graphs == expected_result.num_graphs


def test_fixed_size_options_to_hetero(request):
    """Check that `to_hetero` spreads scalar limits over every type.

    After the conversion, each entry of `num_nodes` and `num_edges` must
    equal the original scalar value, and `num_graphs` must be unchanged.
    """
    hetero_graphs = request.getfixturevalue("fake_hetero_dataset")

    batch_size = 10
    num_nodes = 20
    num_edges = 40
    options = FixedSizeOptions(num_nodes=num_nodes,
                               num_edges=num_edges,
                               num_graphs=batch_size)
    node_types = hetero_graphs[0].node_types
    edge_types = hetero_graphs[0].edge_types
    options.to_hetero(node_types, edge_types)

    for per_type_nodes in options.num_nodes.values():
        assert per_type_nodes == num_nodes
    for per_type_edges in options.num_edges.values():
        assert per_type_edges == num_edges
    assert options.num_graphs == batch_size


================================================
FILE: tests/gnn/test_masker.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import pytest
import torch
import torch_geometric as pyg

from poptorch_geometric import masker


@pytest.fixture(params=[True, False])
def entries(request) -> masker.Entries:
    """Returns something which looks like an entry.

    Depending on the fixture parameter this is either a single seeded
    random tensor of shape (2, 3, 4) or a pair holding that tensor twice.
    """
    pyg.seed_everything(1)
    tensor = torch.rand([2, 3, 4])
    if request.param:
        return (tensor, tensor)
    return tensor


class TestNoOpMasker:
    """Tests the No Op masker, makes sure it does nothing."""

    @pytest.mark.parametrize("masker_name", ["node", "graph", "edge"])
    def test_masker_does_not_change_the_object(self, masker_name: str,
                                               entries: masker.Entries):
        """Each masking method must hand back the very same object."""
        masking_method = getattr(masker.NoMasker(), f"{masker_name}_masker")
        assert entries is masking_method(entries)


class TestNoOpLayerMasker:
    """Tests that wrapping a layer with the No Op masker changes nothing."""

    @pytest.fixture
    def layer(self):
        """Provide a stand-in layer that adds up the sums of its inputs."""

        def layer_function(*args):
            return sum(torch.sum(arg) for arg in args)

        return layer_function

    @pytest.mark.parametrize("masker_name", ["node", "graph", "edge"])
    def test_masker_does_not_change_the_layer_result(
            self,
            masker_name: str,
            entries: masker.Entries,
            layer: masker.Layer,
    ):
        """The wrapped layer must return the same value as the bare layer."""
        pre_layer_masker = masker.PreLayerMasker(masker=masker.NoMasker())
        wrap = getattr(pre_layer_masker, f"{masker_name}_masker")
        masked_layer = wrap(layer)

        # A single entry is passed as a one-element argument list.
        if isinstance(entries, (tuple, list)):
            layer_inputs = entries
        else:
            layer_inputs = (entries, )

        reference_output = layer(*layer_inputs)
        masked_output = masked_layer(*layer_inputs)
        assert reference_output == masked_output, (
            "For the No-op layer masker," +
            " the result of a layer should be unchanged")


================================================
FILE: tests/gnn/test_model_args.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import pytest
import torch
import torch.nn.functional as F
from torch_geometric import seed_everything
from torch_geometric.data import Batch
from torch_geometric.datasets import FakeDataset
from torch_geometric.nn.models import MLP

from utils import assert_equal
# Need to import poptorch_geometric to ensure that our arg parser implementation is
# registered with poptorch ahead of running these tests
import poptorch_geometric  # pylint: disable=unused-import
import poptorch


class Model(torch.nn.Module):
    """MLP model that writes its results back onto the input object.

    `forward` stores the MLP output as `h` and its log-softmax as `out` on
    the object it receives. In training mode it also stores a `loss`
    computed over the entries selected by `train_mask`. The same object is
    returned.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.mlp = MLP([in_channels, out_channels])

    def forward(self, example):
        hidden = self.mlp(example.x)
        example.h = hidden
        example.out = F.log_softmax(hidden, dim=1)

        # The loss is only attached while training.
        if not self.training:
            return example

        selected = example.train_mask
        example.loss = F.cross_entropy(example.out[selected],
                                       example.y[selected])
        return example


def add_train_mask(data):
    """Attach a `train_mask` of random node indices to `data`.

    The mask holds the first 80% (rounded down) of a random permutation of
    the node indices, so it contains indices rather than booleans. `data`
    is modified in place and returned.
    """
    shuffled_nodes = torch.randperm(data.num_nodes)
    train_count = int(0.8 * data.num_nodes)
    data.train_mask = shuffled_nodes[:train_count]
    return data


def data():
    """Build a seeded single-graph test input.

    Returns a tuple of the first graph of a `FakeDataset` (with a
    `train_mask` added by the transform), the size of the last dimension of
    its `x`, and the dataset's number of classes.
    """
    seed_everything(0)
    dataset = FakeDataset(transform=add_train_mask,
                          avg_num_nodes=32,
                          num_channels=8)
    sample = dataset[0]
    return sample, sample.x.shape[-1], dataset.num_classes


def batch():
    """Build a seeded batched test input.

    Returns a tuple of a `Batch` made from all four graphs of a
    `FakeDataset` (each with a `train_mask` added by the transform), the
    size of the last dimension of `x`, and the dataset's number of classes.
    """
    seed_everything(0)
    dataset = FakeDataset(num_graphs=4,
                          transform=add_train_mask,
                          avg_num_nodes=12,
                          num_channels=8)
    in_channels = dataset[0].x.shape[-1]
    out_channels = dataset.num_classes
    merged = Batch.from_data_list(dataset[:])
    return merged, in_channels, out_channels


@pytest.fixture
def dispatcher_options():
    """Provide a default-constructed `poptorch.Options` instance."""
    return poptorch.Options()


@pytest.mark.parametrize('arg', [data(), batch()], ids=['data', 'batch'])
def test_args(arg, dispatcher_options):
    """Pass a PyG object through a PopTorch training model and inspect it.

    The output must have the same type as the input, keep every input key,
    and carry the `h`, `out` and `loss` keys added by the model. The
    `Batch` variant is currently skipped.
    """
    sample, in_channels, out_channels = arg
    is_batch = isinstance(sample, Batch)

    if is_batch:
        pytest.skip("Known issue. Unblock when AFS-97 will be completed.")

    net = Model(in_channels, out_channels)
    net.train()
    adam = poptorch.optim.Adam(net.parameters(), lr=0.001)
    training_model = poptorch.trainingModel(model=net,
                                            options=dispatcher_options,
                                            optimizer=adam)

    output = training_model(sample)
    assert isinstance(output, type(sample)), \
        "Model output must have the same type as input argument"

    # Every key of the input argument, followed by the keys that the model
    # adds, must be present on the output.
    for key in [*sample.keys, 'h', 'out', 'loss']:
        assert key in output

    if is_batch:
        # Check that the batch vector is preserved but omit the dtype since
        # the PopTorch dispatcher will coerce long -> int32
        assert_equal(output.batch, sample.batch, check_dtype=False)
        assert output.batch.dtype == torch.int32


================================================
FILE: tests/gnn/test_neighbor_loader.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import numpy as np
import pytest
import torch

from torch_geometric.data import Data, HeteroData
from torch_geometric.loader import NeighborLoader
from torch_geometric.sampler.base import SubgraphType
from torch_geometric.testing import (
    get_random_edge_index,
    onlyNeighborSampler,
)
from torch_geometric.utils import (
    is_undirected, )

from poptorch_geometric.fixed_size_options import FixedSizeOptions
from poptorch_geometric.neighbor_loader import FixedSizeNeighborLoader


def validate_fixed_data_format(loader: FixedSizeNeighborLoader,
                               fixed_size_options: FixedSizeOptions,
                               is_hetero_data: bool,
                               debug_print: bool = False) -> None:
    """Assert that fixed-size collation only appends padding.

    For a series of index windows the loader's `nativeCollate` output (the
    dynamically sized batch) is compared with the `fixedSizeCollate` output
    built from it. For every stored attribute except `batch_size`, the
    leading entries must be equal and the remaining entries of the fixed
    batch must be padding.

    Args:
        loader: Loader providing `nativeCollate`, `fixedSizeCollate` and
            `batch_size`.
        fixed_size_options: Source of the expected pad values and, for
            homogeneous data, the expected padded number of edges.
        is_hetero_data: Selects the heterogeneous storage layout
            (`_node_store_dict` / `_edge_store_dict`) instead of the single
            `_store` of homogeneous data.
        debug_print: When true, print the stores, groups, keys and tensors
            as they are compared.

    Raises:
        AssertionError: If any compared entry or padding entry differs from
            what is expected.
    """

    # NOTE(review): `len(loader)` is used as the upper bound of the node
    # index range here - presumably this is the number of batches, so only
    # the first few windows are checked; confirm that this is intended.
    for index in range(0, len(loader), loader.batch_size):  # pylint: disable=too-many-nested-blocks

        indices = list(range(index, index + loader.batch_size))
        dynamic = loader.nativeCollate([indices])
        fixed = loader.fixedSizeCollate(dynamic)

        # Gather the raw attribute dictionaries of both batches in a common
        # {storage type: {group: {key: value}}} layout.
        if is_hetero_data:
            dynamic_dict_store = {
                '_node_store_dict': dynamic.__dict__['_node_store_dict'],
                '_edge_store_dict': dynamic.__dict__['_edge_store_dict']
            }
            fixed_dict_store = {
                '_node_store_dict': fixed.__dict__['_node_store_dict'],
                '_edge_store_dict': fixed.__dict__['_edge_store_dict']
            }
        else:
            # Homogeneous data has a single store; it is wrapped in a
            # one-entry group so the loops below work for both layouts.
            dynamic_dict_store = {
                '_store': {
                    "Data:": dynamic.__dict__['_store'].__dict__['_mapping']
                }
            }
            fixed_dict_store = {
                '_store': {
                    "Data:": fixed.__dict__['_store'].__dict__['_mapping']
                }
            }

        for storage_type in dynamic_dict_store:
            if debug_print:
                print(f"Store [{storage_type}]")

            # Only the hetero edge store uses the edge pad value; the node
            # store and the homogeneous store use the node pad value.
            if storage_type == '_edge_store_dict':
                pad_value = fixed_size_options.edge_pad_value
            else:
                pad_value = fixed_size_options.node_pad_value

            dynamic_dict_group = dynamic_dict_store[storage_type]
            fixed_dict_group = fixed_dict_store[storage_type]

            for group in dynamic_dict_group:
                if debug_print:
                    print(f"Group [{group}]")

                dynamic_dict = dynamic_dict_group[group]
                fixed_dict = fixed_dict_group[group]

                # check if values are padded as expected
                for key in dynamic_dict:

                    # Batch size is used only for sampling
                    if key == 'batch_size':
                        continue

                    dynamic_tensor = dynamic_dict[key]
                    fixed_tensor = fixed_dict[key]

                    if debug_print:
                        print(f"Key: [{key}]")
                        print("Dynamic:", dynamic_tensor)
                        print("Fixed  :", fixed_tensor)

                    # Wrap tensors with fewer than two dimensions so the
                    # loop below can treat every value as a list of rows.
                    if dynamic_tensor.dim() < 2:
                        dynamic_tensor = [dynamic_tensor]
                        fixed_tensor = [fixed_tensor]

                    for i in range(0, len(dynamic_tensor)):  # pylint: disable=consider-using-enumerate
                        dynamic_dim = dynamic_tensor[i]
                        fixed_dim = fixed_tensor[i]
                        # `valid_range` covers the entries present in both
                        # rows; `fixed_range` covers the entries that only
                        # exist in the fixed-size row.
                        valid_range = range(
                            0, min(len(dynamic_dim), len(fixed_dim)))
                        fixed_range = range(len(valid_range), len(fixed_dim))

                        for j in valid_range:
                            assert dynamic_dim[j] == fixed_dim[j]

                        # Dummy (padded) edge_index should point to dummy node
                        if key == 'edge_index':
                            if is_hetero_data:
                                # Row 0 is looked up in the node store of
                                # the first element of the edge type, any
                                # other row in that of the last element.
                                n_id_tensor = fixed_dict_store[  # pylint: disable=line-too-long
                                    '_node_store_dict'][
                                        group[0 if i < 1 else -1]]['n_id']
                            else:
                                assert fixed_size_options.num_edges == len(
                                    fixed_dim), f"Incorrect padding for {key}"
                                n_id_tensor = fixed_dict['n_id']
                            for j in fixed_range:
                                assert n_id_tensor[fixed_dim[j]] == pad_value
                        # Dummy (padded) value check
                        else:
                            for j in fixed_range:
                                assert fixed_dim[j] == pad_value


def is_subset(subedge_index, edge_index, src_idx, dst_idx):
    """Return whether every edge of a subgraph exists in the full graph.

    Each edge is encoded as a single integer, `stride * source + target`,
    where `stride` is one more than the largest value in `edge_index`. The
    endpoints in `subedge_index` are first mapped through `src_idx` (row 0)
    and `dst_idx` (row 1). The result is True only if every encoded
    subgraph edge also appears among the encoded full-graph edges.
    """
    stride = int(edge_index.max()) + 1

    full_sources, full_targets = edge_index[0], edge_index[1]
    known_edges = stride * full_sources + full_targets

    mapped_sources = src_idx[subedge_index[0]]
    mapped_targets = dst_idx[subedge_index[1]]
    candidate_edges = stride * mapped_sources + mapped_targets

    found = torch.from_numpy(np.isin(candidate_edges, known_edges))
    return bool(found.all())


@onlyNeighborSampler
@pytest.mark.parametrize('subgraph_type', list(SubgraphType))
def test_homo_neighbor_loader_basic(subgraph_type):
    """Check `FixedSizeNeighborLoader` on a small homogeneous graph.

    Fixed-size options are derived from a plain `NeighborLoader` with the
    same sampling settings. The fixed-size loader's padding is validated
    first, then every produced batch is checked for value ranges, seed node
    ordering and, unless the subgraph is bidirectional, edge consistency
    with the source graph.
    """
    torch.manual_seed(12345)

    data = Data()
    data.x = torch.arange(15)
    data.edge_index = get_random_edge_index(15, 15, 75, torch.int64)
    data.edge_attr = torch.arange(75)
    use_batch_size = 5

    def sampling_kwargs():
        # Fresh arguments for every loader so nothing is shared.
        return dict(num_neighbors=[5] * 2,
                    batch_size=use_batch_size,
                    subgraph_type=subgraph_type)

    default_loader = NeighborLoader(data, **sampling_kwargs())
    fixed_size_options = FixedSizeOptions.from_loader(default_loader)

    loader = FixedSizeNeighborLoader(data,
                                     fixed_size_options=fixed_size_options,
                                     **sampling_kwargs())

    validate_fixed_data_format(loader=loader,
                               fixed_size_options=fixed_size_options,
                               is_hetero_data=False)

    assert len(loader) == len(data.x) // use_batch_size

    first_batch = next(iter(loader))

    assert isinstance(first_batch, Data)
    assert first_batch.n_id[:1].tolist() == [0]

    for step, batch in enumerate(loader):
        assert isinstance(batch, Data)
        assert batch.x.size(0) <= 101
        assert batch.n_id.size() == (batch.num_nodes, )
        assert batch.x.min() >= 0
        assert batch.x.max() < 101
        assert batch.edge_index.min() >= 0
        assert batch.edge_index.max() < batch.num_nodes

        # Input nodes are always sampled first:
        seed_start = step * use_batch_size
        expected_seeds = torch.arange(seed_start, seed_start + use_batch_size)
        assert torch.equal(batch.x[:use_batch_size], expected_seeds)

        # The edge checks below are skipped for bidirectional subgraphs.
        if subgraph_type == SubgraphType.bidirectional:
            continue

        assert batch.edge_attr.min() >= 0
        assert batch.edge_attr.max() < 500

        assert is_subset(
            batch.edge_index.to(torch.int64),
            data.edge_index.to(torch.int64),
            batch.x,
            batch.x,
        )


@onlyNeighborSampler
@pytest.mark.parametrize('subgraph_type', list(SubgraphType))
def test_hetero_neighbor_loader_basic(subgraph_type):
    """Check `FixedSizeNeighborLoader` on a small heterogeneous graph.

    Per-edge-type neighbour lists with differing hop counts must raise
    `ValueError` at some point between building the loaders and drawing
    the first batch. With a uniform neighbour list, the fixed-size loader
    must be non-empty and its padding must pass
    `validate_fixed_data_format`.
    """
    dtype = torch.int64

    torch.manual_seed(12345)

    data = HeteroData()
    data['paper'].x = torch.arange(15)
    data['author'].x = torch.arange(15, 45)

    # (edge type, arguments for `get_random_edge_index`, edge attributes)
    relations = (
        (('paper', 'paper'), (15, 15, 45), torch.arange(45)),
        (('paper', 'author'), (15, 30, 90), torch.arange(45, 135)),
        (('author', 'paper'), (30, 15, 150), torch.arange(200, 250)),
    )
    for edge_type, sizes, edge_attr in relations:
        data[edge_type].edge_index = get_random_edge_index(*sizes, dtype)
        data[edge_type].edge_attr = edge_attr

    batch_size = 2

    def uneven_hops():
        # One edge type has a single hop while the others have two.
        return {
            ('paper', 'to', 'paper'): [-1],
            ('paper', 'to', 'author'): [-1, -1],
            ('author', 'to', 'paper'): [-1, -1],
        }

    def loader_kwargs(num_neighbors):
        return dict(num_neighbors=num_neighbors,
                    input_nodes='paper',
                    batch_size=batch_size,
                    subgraph_type=subgraph_type)

    with pytest.raises(ValueError, match="hops must be the same across all"):
        default_loader = NeighborLoader(data, **loader_kwargs(uneven_hops()))

        fixed_size_options = FixedSizeOptions.from_loader(default_loader)

        loader = FixedSizeNeighborLoader(
            data,
            fixed_size_options=fixed_size_options,
            **loader_kwargs(uneven_hops()))
        next(iter(loader))

    default_loader = NeighborLoader(data, **loader_kwargs([10] * 2))

    fixed_size_options = FixedSizeOptions.from_loader(default_loader)

    loader = FixedSizeNeighborLoader(data,
                                     fixed_size_options=fixed_size_options,
                                     **loader_kwargs([10] * 2))
    assert len(loader) > 0

    validate_fixed_data_format(loader=loader,
                               fixed_size_options=fixed_size_options,
                               is_hetero_data=True)


@onlyNeighborSampler
@pytest.mark.parametrize('subgraph_type', list(SubgraphType))
def test_hetero_neighbor_loader_large(subgraph_type):
    """Exercise `FixedSizeNeighborLoader` on a heterogeneous graph.

    The test first expects a `ValueError` mentioning "hops must be the same
    across all" when the per-edge-type `num_neighbors` lists differ in
    length. It then builds a loader with a uniform `[10] * 2` setting, with
    pad masks enabled, and checks the format and the index ranges of every
    batch it yields.
    """
    dtype = torch.int64

    # Seed set before the random edge indices are created; presumably this
    # makes `get_random_edge_index` deterministic - TODO confirm.
    torch.manual_seed(12345)

    data = HeteroData()

    data['paper'].x = torch.arange(20)
    data['author'].x = torch.arange(20, 220)

    edge_index = get_random_edge_index(20, 20, 40, dtype)
    data['paper', 'paper'].edge_index = edge_index
    data['paper', 'paper'].edge_attr = torch.arange(40)
    # NOTE(review): the third argument (250) is presumably the edge count,
    # yet the edge_attr tensors below hold 260 and 100 values respectively.
    # Confirm that this mismatch is intended.
    edge_index = get_random_edge_index(20, 50, 250, dtype)
    data['paper', 'author'].edge_index = edge_index
    data['paper', 'author'].edge_attr = torch.arange(40, 300)
    edge_index = get_random_edge_index(50, 20, 250, dtype)
    data['author', 'paper'].edge_index = edge_index
    data['author', 'paper'].edge_attr = torch.arange(300, 400)

    batch_size = 2

    # Edge types with a different number of hops ([-1] versus [-1, -1]) must
    # be rejected somewhere within this block.
    with pytest.raises(ValueError, match="hops must be the same across all"):
        default_loader = NeighborLoader(
            data,
            num_neighbors={
                ('paper', 'to', 'paper'): [-1],
                ('paper', 'to', 'author'): [-1, -1],
                ('author', 'to', 'paper'): [-1, -1],
            },
            input_nodes='paper',
            batch_size=batch_size,
            subgraph_type=subgraph_type,
        )

        fixed_size_options = FixedSizeOptions.from_loader(default_loader)

        loader = FixedSizeNeighborLoader(
            data,
            num_neighbors={
                ('paper', 'to', 'paper'): [-1],
                ('paper', 'to', 'author'): [-1, -1],
                ('author', 'to', 'paper'): [-1, -1],
            },
            input_nodes='paper',
            batch_size=batch_size,
            subgraph_type=subgraph_type,
            fixed_size_options=fixed_size_options,
        )
        next(iter(loader))

    # Valid configuration: the same two hops for every edge type. The plain
    # loader is only used to derive the fixed-size options.
    default_loader = NeighborLoader(
        data,
        num_neighbors=[10] * 2,
        input_nodes='paper',
        batch_size=batch_size,
        subgraph_type=subgraph_type,
    )

    fixed_size_options = FixedSizeOptions.from_loader(default_loader)

    loader = FixedSizeNeighborLoader(data,
                                     num_neighbors=[10] * 2,
                                     input_nodes='paper',
                                     batch_size=batch_size,
                                     subgraph_type=subgraph_type,
                                     add_pad_masks=True,
                                     fixed_size_options=fixed_size_options)
    assert len(loader) > 0

    validate_fixed_data_format(loader=loader,
                               fixed_size_options=fixed_size_options,
                               is_hetero_data=True)

    for batch in loader:
        assert isinstance(batch, HeteroData)

        # Test node type selection:
        assert set(batch.node_types) == {'paper', 'author'}

        assert batch['paper'].n_id.size() == (batch['paper'].num_nodes, )
        # The `+ 1` slack presumably accounts for a padding node - TODO
        # confirm.
        assert batch['paper'].x.size(0) <= 20 + 1
        assert batch['paper'].x.min() >= 0 and batch['paper'].x.max() < 40 + 1

        assert batch['author'].n_id.size() == (batch['author'].num_nodes, )
        assert batch['author'].x.size(0) <= 50
        assert batch['author'].x.max() < 220

        # Test edge type selection:
        assert set(batch.edge_types) == {('paper', 'to', 'paper'),
                                         ('paper', 'to', 'author'),
                                         ('author', 'to', 'paper')}

        # Edge endpoints must index into the nodes held by this batch.
        row, col = batch['paper', 'paper'].edge_index
        assert row.min() >= 0 and row.max() < batch['paper'].num_nodes
        assert col.min() >= 0 and col.max() < batch['paper'].num_nodes

        if subgraph_type != SubgraphType.bidirectional:
            assert batch['paper', 'paper'].e_id.size() == (row.numel(), )
            value = batch['paper', 'paper'].edge_attr
            assert value.min() >= 0 and value.max() < 40

            # Only the edges selected by `edges_mask` are compared against
            # the edges of the original graph.
            assert is_subset(
                batch['paper', 'paper'].edge_index.to(
                    torch.int64)[:, batch['paper', 'paper'].edges_mask],
                data['paper', 'paper'].edge_index.to(torch.int64),
                batch['paper'].x,
                batch['paper'].x,
            )
        # This branch is only reached when `subgraph_type` equals
        # `SubgraphType.bidirectional`, so the condition acts as an `else`.
        elif subgraph_type != SubgraphType.directional:
            assert 'e_id' not in batch['paper', 'paper']  # pylint: disable=no-value-for-parameter
            assert 'edge_attr' not in batch['paper', 'paper']  # pylint: disable=no-value-for-parameter

            assert is_undirected(batch['paper', 'paper'].edge_index)  # pylint: disable=no-value-for-parameter

        row, col = batch['paper', 'author'].edge_index
        assert row.min() >= 0 and row.max() < batch['paper'].num_nodes
        assert col.min() >= 0 and col.max() < batch['author'].num_nodes


================================================
FILE: tests/gnn/test_register_custom_args.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import unittest

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

import helpers
import poptorch


class GCN(torch.nn.Module):
    """Two stacked `GCNConv` layers followed by a log-softmax.

    The hidden width is 16 and both convolutions are created with
    `add_self_loops=False`.
    """

    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        hidden_channels = 16
        self.conv1 = GCNConv(in_channels,
                             hidden_channels,
                             add_self_loops=False)
        self.conv2 = GCNConv(hidden_channels,
                             out_channels,
                             add_self_loops=False)

    def forward(self, data):
        """Return log-probabilities computed from `data.x` and
        `data.edge_index`."""
        features, edges = data.x, data.edge_index

        hidden = self.conv1(features, edges).relu()
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        hidden = self.conv2(hidden, edges).relu()

        return F.log_softmax(hidden, dim=1)


@unittest.mock.patch.dict("os.environ", helpers.disableSmallModel())
def test_register_custom_parsers(planetoid_cora):
    """Run the GCN through `poptorch.inferenceModel` with a graph object as
    the only argument and check that a result comes back."""
    graph = planetoid_cora[0]
    net = GCN(planetoid_cora.num_node_features, planetoid_cora.num_classes)
    net.eval()
    ipu_net = poptorch.inferenceModel(net)
    assert ipu_net(graph) is not None


================================================
FILE: tests/gnn/test_stream_packing_sampler.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import itertools
import math
from statistics import mean

import pytest
import torch
from utils import FakeDatasetEqualGraphs, is_data
from torch.utils.data.sampler import RandomSampler, SequentialSampler
from torch_geometric.data import Data, HeteroData
from torch_geometric.datasets import FakeDataset

from poptorch_geometric.stream_packing_sampler import StreamPackingSampler
from poptorch_geometric.collate import CombinedBatchingCollater, FixedSizeCollater
from poptorch_geometric.dataloader import FixedSizeDataLoader
from poptorch_geometric.fixed_size_options import FixedSizeOptions
from poptorch_geometric.pyg_dataloader import FixedSizeStrategy, OverSizeStrategy


def test_stream_packing_sampler_default_params():
    """With at most one graph per batch, the sampler must yield exactly one
    batch per graph in the dataset."""
    num_graphs = 10
    dataset = FakeDataset(num_graphs=num_graphs,
                          avg_num_nodes=30,
                          avg_degree=5)
    sampler = StreamPackingSampler(dataset, max_num_graphs=1)

    num_batches = 0
    for _ in sampler:
        num_batches += 1

    assert num_batches == num_graphs


def test_stream_packing_sampler_should_throw_exception():
    """Iterating must raise `RuntimeError` when skipping is disabled and the
    node limit, or the edge limit, is only 2."""
    dataset = FakeDataset(num_graphs=3, avg_num_nodes=30, avg_degree=5)

    # One case limits the nodes, the other limits the edges.
    for tight_limit in ({'max_num_nodes': 2}, {'max_num_edges': 2}):
        sampler = StreamPackingSampler(dataset,
                                       max_num_graphs=2,
                                       allow_skip_data=False,
                                       **tight_limit)
        with pytest.raises(RuntimeError):
            for _ in sampler:
                pass


def test_stream_packing_sampler_should_not_throw_exception():
    """With skipping enabled, a node or edge limit of only 2 must produce no
    batches at all instead of raising."""
    dataset = FakeDataset(num_graphs=4, avg_num_nodes=30, avg_degree=5)

    # One case limits the nodes, the other limits the edges.
    for tight_limit in ({'max_num_nodes': 2}, {'max_num_edges': 2}):
        sampler = StreamPackingSampler(dataset,
                                       max_num_graphs=2,
                                       allow_skip_data=True,
                                       **tight_limit)
        num_batches = 0
        for _ in sampler:
            num_batches += 1
        assert num_batches == 0


@pytest.mark.parametrize('data_type', [Data, HeteroData])
@pytest.mark.parametrize('shuffle', [True, False])
@pytest.mark.parametrize('batch_num_graphs', [2, 10])
@pytest.mark.parametrize('allow_skip_data', [True, False])
def test_stream_packing_should_return_valid_samples(data_type, shuffle,
                                                    batch_num_graphs,
                                                    allow_skip_data,
                                                    fake_hetero_dataset):
    """The sampler must yield at least one batch, and when skipping is
    disabled no batch may hold more than `batch_num_graphs` samples."""
    if is_data(data_type):
        avg_num_nodes = 30
        dataset = FakeDataset(num_graphs=100,
                              avg_num_nodes=avg_num_nodes,
                              avg_degree=5,
                              num_channels=16,
                              edge_dim=8)
    else:
        avg_num_nodes = 50
        dataset = fake_hetero_dataset
    avg_num_edges = int(math.ceil(mean(item.num_edges for item in dataset)))

    # Budget: the average size per graph plus one extra slot per graph.
    batch_num_nodes = (avg_num_nodes + 1) * batch_num_graphs
    batch_num_edges = (avg_num_edges + 1) * batch_num_graphs
    if not allow_skip_data:
        # Nothing may be skipped, so the largest graph has to fit as well.
        largest_nodes = max(item.num_nodes for item in dataset)
        batch_num_nodes = max(batch_num_nodes,
                              largest_nodes + batch_num_graphs)
        largest_edges = max(item.num_edges for item in dataset)
        batch_num_edges = max(batch_num_edges,
                              largest_edges + batch_num_graphs)

    if shuffle:
        base_sampler = RandomSampler(dataset)
    else:
        base_sampler = SequentialSampler(dataset)

    # Leave space for padding.
    sampler = StreamPackingSampler(dataset,
                                   max_num_graphs=batch_num_graphs - 1,
                                   max_num_nodes=batch_num_nodes - 1,
                                   max_num_edges=batch_num_edges - 1,
                                   base_sampler=base_sampler,
                                   allow_skip_data=allow_skip_data)
    num_batches = 0
    for _ in sampler:
        num_batches += 1
    assert num_batches > 0

    if allow_skip_data:
        return
    for sample in sampler:
        assert len(sample) <= batch_num_graphs


@pytest.mark.parametrize('data_type', [Data, HeteroData])
@pytest.mark.parametrize('shuffle', [True, False])
@pytest.mark.parametrize('allow_skip_data', [True, False])
@pytest.mark.parametrize('torch_data_loader', [True, False])
def test_stream_packing_sampler_should_be_usable_with_torch_data_loader(
        data_type, shuffle, allow_skip_data, torch_data_loader,
        fake_hetero_dataset):
    """Stream-packed batches must come out with the configured fixed shapes,
    whether they are built through a plain torch `DataLoader` or through
    `FixedSizeDataLoader`."""
    batch_num_graphs = 10
    num_channels = 16
    edge_dim = 8
    num_graphs = 10

    homogeneous = is_data(data_type)
    if homogeneous:
        avg_num_nodes = 30
        dataset = FakeDataset(num_graphs=100,
                              avg_num_nodes=avg_num_nodes,
                              avg_degree=5,
                              num_channels=num_channels,
                              edge_dim=edge_dim)
    else:
        avg_num_nodes = 50
        dataset = fake_hetero_dataset
    avg_num_edges = math.ceil(mean(item.num_edges for item in dataset))

    if shuffle:
        base_sampler = RandomSampler(dataset)
    else:
        base_sampler = SequentialSampler(dataset)

    # Budget: the average size per graph plus one extra slot per graph.
    batch_num_nodes = (avg_num_nodes + 1) * batch_num_graphs
    batch_num_edges = (avg_num_edges + 1) * batch_num_graphs
    if not allow_skip_data:
        # Nothing may be skipped, so the largest graph has to fit as well.
        largest_nodes = max(item.num_nodes for item in dataset)
        batch_num_nodes = max(batch_num_nodes,
                              largest_nodes + batch_num_graphs)
        largest_edges = max(item.num_edges for item in dataset)
        batch_num_edges = max(batch_num_edges,
                              largest_edges + batch_num_graphs)

    fixed_size_options = FixedSizeOptions(num_nodes=batch_num_nodes,
                                          num_edges=batch_num_edges,
                                          num_graphs=num_graphs,
                                          node_pad_value=0.0,
                                          edge_pad_value=0.0,
                                          graph_pad_value=0.0)

    # Leave space for padding.
    if torch_data_loader:
        batch_sampler = StreamPackingSampler(dataset,
                                             max_num_graphs=num_graphs - 1,
                                             max_num_nodes=batch_num_nodes - 1,
                                             max_num_edges=batch_num_edges - 1,
                                             base_sampler=base_sampler,
                                             allow_skip_data=allow_skip_data)

        fixed_size_collater = FixedSizeCollater(
            fixed_size_options=fixed_size_options, add_masks_to_batch=True)
        collater = CombinedBatchingCollater(fixed_size_collater)

        dataloader = torch.utils.data.DataLoader(dataset,
                                                 batch_sampler=batch_sampler,
                                                 collate_fn=collater)
    else:
        if allow_skip_data:
            over_size_strategy = OverSizeStrategy.Skip
        else:
            over_size_strategy = OverSizeStrategy.Error
        dataloader = FixedSizeDataLoader(
            dataset,
            batch_size=num_graphs,
            fixed_size_options=fixed_size_options,
            fixed_size_strategy=FixedSizeStrategy.StreamPack,
            add_pad_masks=True,
            over_size_strategy=over_size_strategy)

    x_shape = torch.Size([batch_num_nodes, num_channels])
    batch_shape = torch.Size([batch_num_nodes])
    edge_attr_shape = torch.Size([batch_num_edges, edge_dim])
    graphs_mask_shape = torch.Size([batch_num_graphs])
    edge_index_shape = torch.Size([2, batch_num_edges])

    for batch in dataloader:
        assert batch.graphs_mask.shape == graphs_mask_shape
        if homogeneous:
            assert batch.x.shape == x_shape
            assert batch.batch.shape == batch_shape
            assert batch.edge_attr.shape == edge_attr_shape
            assert batch.edge_index.shape == edge_index_shape
        else:
            # The heterogeneous totals are checked against these type counts.
            num_node_types = 2
            num_edge_types = 5
            assert batch.num_nodes == batch_num_nodes * num_node_types
            assert batch.num_edges == batch_num_edges * num_edge_types


@pytest.mark.parametrize('shuffle', [True, False])
@pytest.mark.parametrize('allow_skip_data', [True, False])
def test_stream_packing_sampler_padding_not_needed(shuffle, allow_skip_data):
    """With equally sized graphs, every batch must have the fixed shapes and
    the batches together must contain every graph of the dataset."""
    num_graphs_in_dataset = 100
    num_nodes = 30
    batch_num_graphs = 10
    num_channels = 16
    edge_dim = 8

    dataset = FakeDatasetEqualGraphs(num_graphs=num_graphs_in_dataset,
                                     num_nodes=num_nodes,
                                     num_channels=num_channels,
                                     edge_dim=edge_dim)

    avg_num_edges = math.ceil(mean(item.num_edges for item in dataset))

    if shuffle:
        base_sampler = RandomSampler(dataset)
    else:
        base_sampler = SequentialSampler(dataset)

    # No extra slot per graph is added to the node and edge budgets here.
    batch_num_nodes = num_nodes * batch_num_graphs
    batch_num_edges = avg_num_edges * batch_num_graphs
    if not allow_skip_data:
        largest_nodes = max(item.num_nodes for item in dataset)
        batch_num_nodes = max(batch_num_nodes,
                              largest_nodes + batch_num_graphs)
        largest_edges = max(item.num_edges for item in dataset)
        batch_num_edges = max(batch_num_edges,
                              largest_edges + batch_num_graphs)

    batch_sampler = StreamPackingSampler(dataset,
                                         max_num_graphs=batch_num_graphs,
                                         max_num_nodes=batch_num_nodes,
                                         max_num_edges=batch_num_edges,
                                         base_sampler=base_sampler,
                                         allow_skip_data=allow_skip_data)

    fixed_size_options = FixedSizeOptions(num_nodes=batch_num_nodes,
                                          num_edges=batch_num_edges,
                                          num_graphs=batch_num_graphs,
                                          node_pad_value=0.0,
                                          edge_pad_value=0.0,
                                          graph_pad_value=0.0)

    fixed_size_collater = FixedSizeCollater(
        fixed_size_options=fixed_size_options, add_masks_to_batch=True)
    collator = CombinedBatchingCollater(fixed_size_collater)

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_sampler=batch_sampler,
                                             collate_fn=collator)

    x_shape = torch.Size([batch_num_nodes, num_channels])
    batch_shape = torch.Size([batch_num_nodes])
    edge_attr_shape = torch.Size([batch_num_edges, edge_dim])
    graphs_mask_shape = torch.Size([batch_num_graphs])
    edge_index_shape = torch.Size([2, batch_num_edges])

    graphs_seen = 0
    for batch in dataloader:
        assert batch.x.shape == x_shape
        assert batch.batch.shape == batch_shape
        assert batch.edge_attr.shape == edge_attr_shape
        assert batch.graphs_mask.shape == graphs_mask_shape
        assert batch.edge_index.shape == edge_index_shape
        graphs_seen += batch.num_graphs

    assert graphs_seen == num_graphs_in_dataset


================================================
FILE: tests/gnn/utils.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import functools
import random
import subprocess
from pathlib import Path
from typing import List, Optional, Union

import nbformat
import torch  # noqa F401
from nbconvert.preprocessors import ExecutePreprocessor
from torch.testing import assert_close
from torch_geometric.data import Data, HeteroData, InMemoryDataset
from torch_geometric.data.data import BaseData

assert_equal = functools.partial(assert_close, rtol=0., atol=0.)
DEFAULT_PROCESS_TIMEOUT_SECONDS = 40 * 60
REPO_ROOT = Path(__file__).resolve().parents[1]


def is_data(type_: BaseData) -> bool:
    """Tell whether `type_` is the `Data` class rather than `HeteroData`.

    The comparison is by identity against the two classes themselves, so
    instances and subclasses are rejected.

    Args:
        type_: The class to inspect; expected to be `Data` or `HeteroData`.

    Returns:
        `True` for `Data`, `False` for `HeteroData`.

    Raises:
        TypeError: If `type_` is neither of the two classes.
    """
    if type_ is Data:
        return True
    if type_ is HeteroData:
        return False
    # Raising a plain string is not allowed in Python 3: it would surface as
    # a generic TypeError and the message below would be lost.
    raise TypeError(
        f"Wrong data type: {type_}. Should be Data or HeteroData!")


class CalledProcessError(subprocess.CalledProcessError):
    """A `subprocess.CalledProcessError` whose string form also shows the
    captured stdout and stderr, each on its own line."""

    def __str__(self) -> str:
        summary = super().__str__()
        return f"{summary}\n{self.stdout}\n{self.stderr}"


def run_command_fail_explicitly(command: Union[str, List[str]], cwd: str,
                                **kwargs) -> str:
    """ Runs a command returning the output or failing with useful information
    Args:
        command: The command to execute, can also be a space separated string.
        cwd: The directory in which the command should be
            launched. If called by a pytest test function or method, this
            probably should be a `tmp_path` fixture.
        **kwargs: Additional keyword arguments are passed to
            `subprocess.check_output`; they override the defaults set here
            (`shell`, `stderr`, `timeout`, `universal_newlines`).
    Returns:
        The standard output of the command if successfully executed. With
        the default settings the standard error is captured but not
        returned.
    Raises:
        CalledProcessError: If the subprocess exits with a non-zero return
            code. The error carries the real return code together with the
            captured stdout and stderr.
    """
    default_kwargs = {
        # A string containing spaces is handed to the shell as a command line.
        "shell": isinstance(command, str) and " " in command,
        "stderr": subprocess.PIPE,
        "timeout": DEFAULT_PROCESS_TIMEOUT_SECONDS,
        "universal_newlines": True,
    }
    merged_kwargs = {**default_kwargs, **kwargs}

    try:
        out = subprocess.check_output(
            command,
            cwd=cwd,
            **merged_kwargs,
        )
    except subprocess.CalledProcessError as e:
        stdout = e.stdout
        stderr = e.stderr
        # type of the stdout stream will depend on the subprocess.
        # The python docs say decoding is to be handled at
        # application level.
        if hasattr(stdout, "decode"):
            stdout = stdout.decode("utf-8", errors="ignore")
        if hasattr(stderr, "decode"):
            stderr = stderr.decode("utf-8", errors="ignore")
        # Forward the real exit status instead of a hard-coded 1 so that the
        # message and `returncode` describe what actually happened.
        raise CalledProcessError(e.returncode,
                                 cmd=command,
                                 output=stdout,
                                 stderr=stderr) from e
    return out


class ExpectedError(Exception):
    """Marks a failure that the test suite already knows about.

    Intended for use together with `xfail`, so that only the known failure
    is tolerated and anything else still surfaces:

        @pytest.mark.xfail(raises=ExpectedError)
        def test_something_that_needs_fixing():
            try:
                broken_fun()
            except Exception as e:
                # translate only the failure we know about
                if check_cond(e):
                    raise ExpectedError("") from e
                raise  # anything else is a genuine, unexpected error
    """


def run_notebook(notebook_filename, expected_error: str = "", cwd=REPO_ROOT):
    """Helper to run notebooks which may or may not be expected to fail.

    Args:
        notebook_filename: Path of the notebook to execute.
        expected_error: Text that marks a known failure. Empty means no
            failure is expected.
        cwd: Directory passed to the executor as the notebook's path.

    Raises:
        ExpectedError: If execution fails and `expected_error` occurs in the
            failure message.
        Exception: Any other execution failure is re-raised unchanged.
    """
    # Notebook files are UTF-8 encoded JSON; do not rely on the locale's
    # default encoding when reading them.
    with open(notebook_filename, encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)
    ep = ExecutePreprocessor(timeout=600, kernel_name="python3")
    try:
        ep.preprocess(nb, {"metadata": {"path": f"{cwd}"}})
    except Exception as e:
        if expected_error and expected_error in str(e):
            raise ExpectedError(expected_error) from e
        raise


class FakeDatasetEqualGraphs(InMemoryDataset):  #pylint: disable=abstract-method
    r"""An in-memory dataset of random
    :class:`~torch_geometric.data.Data` graphs that all have the same number
    of nodes and the same number of edges.

    Args:
        num_graphs (int): How many graphs to generate.
        num_nodes (int): Node count of every graph.
        num_channels (int): Width of the node feature vectors.
        edge_dim (int): Width of the edge feature vectors.
        num_edges (int, optional): Edge count of every graph. When omitted,
            one random count is drawn and used for all graphs.
            (default: :obj:`None`)
    """

    def __init__(self,
                 num_graphs: int,
                 num_nodes: int,
                 num_channels: int,
                 edge_dim: int,
                 num_edges: Optional[int] = None) -> None:
        if num_graphs < 1:
            raise RuntimeError("Can't create dataset with less than 1 graph.")

        super().__init__('.')

        self.num_nodes = num_nodes
        # The edge count is drawn once, so it is shared by all the graphs.
        self.num_edges = (num_edges if num_edges is not None else
                          random.randint(num_nodes + 1,
                                         num_nodes * (num_nodes - 1)))
        self.num_channels = num_channels
        self.edge_dim = edge_dim

        graphs = []
        for _ in range(num_graphs):
            graphs.append(self.generate_data())
        self.data, self.slices = self.collate(graphs)

    def generate_data(self) -> Data:
        """Create one random graph with the configured sizes."""
        node_features = torch.rand(self.num_nodes, self.num_channels)
        connectivity = torch.randint(high=self.num_nodes,
                                     size=(2, self.num_edges))
        edge_features = torch.rand(self.num_edges, self.edge_dim)

        # -100 is the default value of `ignore_index` in `nn.CrossEntropyLoss`.
        target = torch.tensor([-100]).long()

        return Data(x=node_features,
                    edge_index=connectivity,
                    edge_attr=edge_features,
                    y=target)


================================================
FILE: tests/grouping_scatters_gathers_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.

# Tests for PyG torch_scatter ops integration with PopTorch
import json
from torch import gather
import torch
import pytest
import helpers
import poptorch

# `torch_scatter` is only imported when `helpers.is_running_tests` is set.
# Otherwise, placeholders with the same names are defined so that the
# module-level references to them (for instance in the parametrize lists)
# still resolve when this file is imported.
if helpers.is_running_tests:
    from torch_scatter import scatter, scatter_log_softmax, scatter_softmax, scatter_std, scatter_add, scatter_max
else:

    # Placeholder: takes no arguments and does nothing.
    def scatter():
        pass

    # Placeholder: takes no arguments and does nothing.
    def scatter_log_softmax():
        pass

    # Placeholder: takes no arguments and does nothing.
    def scatter_softmax():
        pass

    # Placeholder: takes no arguments and does nothing.
    def scatter_std():
        pass

    # Placeholder: takes no arguments and does nothing.
    def scatter_add():
        pass

    # Placeholder: takes no arguments and does nothing.
    def scatter_max():
        pass


# Number of matching ops ("ScatterReduce", or "GroupedGather"/"Gather" for
# gather) expected in the PopART IR, keyed by the name of the tested function.
# These are the float32 values; `torch_fusible_model` adjusts them for other
# dtypes.
expected_ops_after_fuse = {
    'scatter': 2,
    'scatter_add': 2,
    'scatter_max': 2,
    'scatter_softmax': 3,
    'scatter_log_softmax': 3,
    'scatter_std': 3,
    'gather': 2
}

# Expected `group_size` attribute of the first matching op, keyed by the name
# of the tested function. These are the float32 values as well.
expected_group_size_after_fuse = {
    'scatter': 3,
    'scatter_add': 3,
    'scatter_max': 3,
    'scatter_softmax': 3,
    'scatter_log_softmax': 3,
    'scatter_std': 6,
    'gather': 3
}


def check_is_fused(poptorch_model, op_type, expected_group_size,
                   expected_num_ops):
    """Assert on the scatter/gather ops found in the model's PopART IR.

    For `op_type == "gather"` the "GroupedGather" ops are collected first,
    followed by the "Gather" ops; for any other `op_type` only the
    "ScatterReduce" ops are collected. The number of collected ops must equal
    `expected_num_ops` and the `group_size` attribute of the first one must
    equal `expected_group_size`.
    """
    popart_ir = json.loads(poptorch_model._debugGetPopartIR())  # pylint: disable=protected-access
    graph_ops = popart_ir['maingraph']

    if op_type == "gather":
        wanted_types = ("GroupedGather", "Gather")
    else:
        wanted_types = ("ScatterReduce", )

    # Grouped by wanted type first, then in the order of the graph.
    matched = [
        op for wanted in wanted_types for op in graph_ops
        if op['type'] == wanted
    ]

    assert len(matched) == expected_num_ops
    assert int(matched[0]['attributes']['group_size']) == expected_group_size


def torch_fusible_model(func, src, index, dtype):
    """Run `func` three times in one model and check the ops were grouped.

    The model applies `func` to `src`, to a tensor of ones of type `dtype`
    and to a tensor of twos, then adds a `scatter_max` and a `gather` that
    depend on the first result. The same computation is done natively, the
    IPU output is compared against it, and the PopART IR is inspected
    through `check_is_fused`.

    Args:
        func: `gather` or one of the scatter functions under test.
        src: Source tensor.
        index: Index tensor, used with dimension 0.
        dtype: Type of the "ones" operand.
    """

    # We do the shape inference from scatter here because we don't support
    # dynamic shaped tensors on the ipu

    dim = 0
    dim_size = int(index.max()) + 1

    class Model(torch.nn.Module):
        def forward(self, src, index, dtype):
            ones = torch.ones_like(src, dtype=dtype)
            two = torch.ones_like(src) * 2
            # `gather` takes (input, dim, index); the scatter functions are
            # called with (src, index, dim_size=...).
            if func == gather:
                out = func(src, dim, index)
                out_ones = func(ones, dim, index)
                out_two = func(two, dim, index)
            else:
                out = func(src, index, dim_size=dim_size)
                out_ones = func(ones, index, dim_size=dim_size)
                out_two = func(two, index, dim_size=dim_size)
            # Some functions return a tuple; only the first element is used.
            if isinstance(out, tuple):
                out = out[0]
                out_ones = out_ones[0]
                out_two = out_two[0]

            # These ops consume the result of the first call to `func`.
            src_updated = src - torch.sum(out)
            # Functions which should not be fused
            out_updated_s, _ = scatter_max(src_updated,
                                           index,
                                           dim_size=dim_size)
            out_updated_g = gather(src_updated, dim, index)
            out_updated_sum = torch.sum(out_updated_g) + torch.sum(
                out_updated_s)
            return (out_ones + out_two) / out_updated_sum

    model = Model()
    options = poptorch.Options()
    poptorch_model = poptorch.inferenceModel(model, options=options)

    # Native reference: mirrors `Model.forward` step by step.
    ones = torch.ones_like(src, dtype=dtype)
    two = torch.ones_like(src) * 2

    if func == gather:
        native_out = func(src, dim, index)
        native_out_ones = func(ones, dim, index)
        native_out_two = func(two, dim, index)
    else:
        native_out = func(src, index, dim_size=dim_size)
        native_out_ones = func(ones, index, dim_size=dim_size)
        native_out_two = func(two, index, dim_size=dim_size)
    if isinstance(native_out, tuple):
        native_out = native_out[0]
        native_out_ones = native_out_ones[0]
        native_out_two = native_out_two[0]

    src_updated = src - torch.sum(native_out)
    # NOTE(review): unlike in the model, no `dim_size` is passed here;
    # presumably it is inferred from `index` natively - confirm.
    native_out_updated_s, _ = scatter_max(src_updated, index)
    native_out_updated_g = gather(src_updated, dim, index)
    native_out_updated_sum = torch.sum(native_out_updated_s) + torch.sum(
        native_out_updated_g)

    expected_nat = (native_out_ones + native_out_two) / native_out_updated_sum

    ipu_out = poptorch_model(src, index, dtype)
    # Verify that the ops have been fused
    expected_num_ops = expected_ops_after_fuse[func.__name__]
    expected_group_size = expected_group_size_after_fuse[func.__name__]
    # For other dtypes one op fewer is expected in the group and one more op
    # overall; presumably the `ones` operand, which then has a different
    # type, cannot be grouped with the others - TODO confirm.
    if dtype != torch.float32:
        expected_group_size = expected_group_size - 1
        expected_num_ops = expected_num_ops + 1
    check_is_fused(poptorch_model, func.__name__, expected_group_size,
                   expected_num_ops)

    # NaNs are replaced on both sides before the comparison.
    helpers.assert_allclose(actual=torch.nan_to_num(ipu_out),
                            expected=torch.nan_to_num(expected_nat))


@pytest.mark.parametrize("shape", [(3, ), (3, 5), (3, 5, 5)])
@pytest.mark.parametrize("func", [
    scatter, scatter_add, scatter_max, scatter_softmax, scatter_log_softmax,
    scatter_std, gather
])
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.int])
def test_fuse(shape, func, dtype):
    """Check op grouping for every tested function, shape and dtype, skipping
    the combinations that are only computed with float32."""
    fp32_only = (scatter_softmax, scatter_log_softmax, scatter_std)
    if func in fp32_only and dtype != torch.float32:
        pytest.skip("can only be computed with fp32 data types")

    torch.manual_seed(0)
    src = torch.rand(shape)
    # Indices are drawn from [0, 3).
    index = torch.randint(3, shape)

    torch_fusible_model(func, src, index, dtype)


================================================
FILE: tests/gru_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import pytest
import torch
import helpers
import poptorch


@pytest.mark.parametrize("bias", [True, False])
@pytest.mark.parametrize("batch_first", [True, False])
def test_gru(bias, batch_first):
    """Compare a single-layer GRU on the IPU with the native result and check
    that the weights change."""
    seq_len, batch, input_size, hidden_size = 1, 3, 5, 7
    num_layers, num_directions = 1, 1

    torch.manual_seed(42)
    if batch_first:
        inp_shape = (batch, seq_len, input_size)
    else:
        inp_shape = (seq_len, batch, input_size)
    inp = torch.randn(inp_shape)
    h0 = torch.randn(num_layers * num_directions, batch, hidden_size)

    gru = torch.nn.GRU(input_size,
                       hidden_size,
                       bias=bias,
                       batch_first=batch_first)

    def first_output(outputs):
        # Of the GRU's outputs, only the first is selected.
        return outputs[0]

    model = helpers.ModelWithWeights(gru, inp.shape, first_output)
    poptorch_model = poptorch.trainingModel(model)

    gru_inputs = (inp, h0)
    (native_out, native_hn), _ = model(gru_inputs)
    (poptorch_out, poptorch_hn), _ = poptorch_model(gru_inputs)

    # Inference test - check outputs
    helpers.assert_allclose(actual=poptorch_out, expected=native_out)
    helpers.assert_allclose(actual=poptorch_hn, expected=native_hn)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()


================================================
FILE: tests/half_float_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import pytest
import torch
import helpers
import poptorch


def assert_same_type(inputs, model):
    """Assert that the PopTorch inference model returns the same dtype as
    the native model for `inputs`."""
    expected_dtype = model(inputs).dtype

    ipu_model = poptorch.inferenceModel(model)
    actual_dtype = ipu_model(inputs).dtype

    assert expected_dtype == actual_dtype


def type_out_harness(inputs, forward_op):
    """Wrap `forward_op` in a module and check that the native and PopTorch
    output dtypes agree for `inputs`."""

    class Model(torch.nn.Module):
        def forward(self, x):
            return forward_op(x)

    assert_same_type(inputs, Model())


## Ones and Zeros tests ##

# Tensor factory functions used to parametrize the ones/zeros tests.
ones_zeros = [torch.ones, torch.zeros]

@pytest.mark.parametrize("op", ones_zeros)
def test_ones_zeros_default_resolved(op):
    """The constant takes the input's dtype and the input is explicitly
    converted to its own dtype before the addition."""

    def fw_op(x):
        constant = op((2, 3, 4),
                      dtype=x.dtype,
                      device=helpers.outputDevice())
        return constant + x.to(x.dtype)

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([1], dtype=dtype), fw_op)


# The dtype will resolve correctly because it matches the dtype of the input
# being added. All settings will match PyTorch.
@pytest.mark.parametrize("op", ones_zeros)
def test_ones_zeros_input_resolved_with_input_dtype(op):
    """ones/zeros created with the input's dtype and added to the input must
    give the same output dtype as native PyTorch."""

    def build_and_add(tensor):
        filled = op((2, 3, 4),
                    dtype=tensor.dtype,
                    device=helpers.outputDevice())
        return filled + tensor

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([1], dtype=dtype), build_and_add)


# The zeros/ones will resolve correctly because torch.float16 could not have
# come from a tensor which could have been half/float.
#
# Half and half to float:
# The output will always be float 16.
#
# Like pytorch:
# The output will be correct.


@pytest.mark.parametrize("op", ones_zeros)
def test_ones_zeros_input_resolved_always_float16(op):
    """ones/zeros explicitly created as float16 and added to the input must
    give the same output dtype as native PyTorch."""

    def build_and_add(tensor):
        filled = op((2, 3, 4),
                    dtype=torch.float16,
                    device=helpers.outputDevice())
        return filled + tensor

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([1], dtype=dtype), build_and_add)


# The dtype will resolve to the same as input. In the float16 case, the
# ones/zeros will be wrongly generated as a float16.
#
# The output will always match input.
@pytest.mark.parametrize("op", ones_zeros)
def test_ones_zeros_input_resolved_always_float32(op):
    """ones/zeros explicitly created as float32 and added to the input must
    give the same output dtype as native PyTorch."""

    def build_and_add(tensor):
        filled = op((2, 3, 4),
                    dtype=torch.float32,
                    device=helpers.outputDevice())
        return filled + tensor

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([1], dtype=dtype), build_and_add)


## torch.rand tests ##


def test_rand_default_resolved():
    """torch.rand created with the input's dtype must keep that dtype."""

    def make_rand(tensor):
        return torch.rand(3, 5, 100, dtype=tensor.dtype)

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([1], dtype=dtype), make_rand)


# The dtype will resolve correctly because it matches the input added
def test_rand_default_input_resolved():
    """torch.rand created with the input's dtype and added to the input must
    give the same output dtype as native PyTorch."""

    def rand_plus_input(tensor):
        noise = torch.rand(3, 5, 100, dtype=tensor.dtype)
        return noise + tensor

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([1], dtype=dtype), rand_plus_input)


# The type will resolve correctly because torch.float16 could not have been
# from a tensor which could have been half/float.
#
# Half and half to float:
# The output will always be float 16.
#
# Like pytorch:
# The output will be correct.
def test_rand_default_input_resolved_always_float16():
    """torch.rand explicitly created as float16 and added to the input must
    give the same output dtype as native PyTorch."""

    def rand_plus_input(tensor):
        noise = torch.rand(3, 5, 100, dtype=torch.float16)
        return noise + tensor

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([1], dtype=dtype), rand_plus_input)


## torch.normal tests ##


# The type will be resolved correctly as the mean and standard deviation are
# inputs to the op
def test_normal_mean_correctly_resolved():
    """torch.normal with a tensor mean and a scalar std must give the same
    output dtype as native PyTorch."""

    def sample(mean):
        return torch.normal(mean, 10.0)

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([0.0], dtype=dtype), sample)


# The type will be resolved correctly as the mean and standard deviation are
# inputs to the op
def test_normal_std_correctly_resolved():
    """torch.normal with a scalar mean and a tensor std must give the same
    output dtype as native PyTorch."""

    def sample(std):
        return torch.normal(0.0, std)

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([10.0], dtype=dtype), sample)


## torch.distributions.uniform.Uniform tests ##


# The type will always resolve to float32 as it is traced to torch.rand without
# the low and high input tensors (which become dead code)
def test_distributions_uniform():
    """Sampling from Uniform(low=input, high=float32 tensor) must give the
    same output dtype as native PyTorch."""

    def sample(low):
        torch.manual_seed(42)
        high = torch.tensor([10.0], dtype=torch.float32)
        distribution = torch.distributions.uniform.Uniform(low, high)
        return distribution.sample((10, 10, 1000))

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([1], dtype=dtype), sample)


## torch.distributions.Normal tests ##


# The type will resolve correctly because the mean is an input
def test_distributions_normal_mean_correctly_resolved():
    """Sampling from Normal(mean=input, std=10.0) must give the same output
    dtype as native PyTorch."""

    def sample(mean):
        torch.manual_seed(42)
        distribution = torch.distributions.Normal(mean, 10.0)
        return distribution.sample((10, 10, 100))

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([0.0], dtype=dtype), sample)


def test_distributions_normal_std_correctly_resolved():
    """Sampling from Normal(mean=0.0, std=input) must give the same output
    dtype as native PyTorch."""

    def sample(std):
        torch.manual_seed(42)
        distribution = torch.distributions.Normal(0.0, std)
        return distribution.sample((10, 10, 100))

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([10.0], dtype=dtype), sample)


## tensor.uniform_ test ##


# The type will resolve correctly because it is based on the input tensor
def test_uniform_correctly_resolved():
    """Tensor.uniform_ applied to a copy of the input must give the same
    output dtype as native PyTorch."""

    def fill_uniform(input_tensor):
        torch.manual_seed(42)
        # Adding zero yields a new tensor so the caller's input is untouched.
        scratch = input_tensor + 0
        return scratch.uniform_()

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.empty((3, 4, 10), dtype=dtype), fill_uniform)


## tensor.normal_ test ##


# The type will also resolve correctly because it is based on the input tensor
def test_normal_correctly_resolved():
    """Tensor.normal_ applied to a copy of the input must give the same
    output dtype as native PyTorch."""

    def fill_normal(input_tensor):
        torch.manual_seed(42)
        # Adding zero yields a new tensor so the caller's input is untouched.
        scratch = input_tensor + 0
        return scratch.normal_()

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.empty((3, 4, 10), dtype=dtype), fill_normal)


## tensor constant tests ##


# The type will resolve correctly because it is added to the input.
#
# The output dtype will always be the same as the input's.
def test_constant_correctly_resolved():
    """A constant created with the input's dtype and added to the input must
    give the same output dtype as native PyTorch."""

    def add_constant(tensor):
        constant = torch.tensor([1, 2, 3], dtype=tensor.dtype)
        return constant + tensor

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([3, 4, 8], dtype=dtype), add_constant)


# The type will resolve to float16 always because the input is cast to float16
# The output will always be float 16.
def test_constant_add_float16():
    """A constant with the input's dtype added to the input cast to float16
    must give the same output dtype as native PyTorch."""

    def add_constant(tensor):
        constant = torch.tensor([1, 2, 3], dtype=tensor.dtype)
        return constant + tensor.to(torch.float16)

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([3, 4, 8], dtype=dtype), add_constant)


def test_constant_always_float32():
    """A float32 constant added to the input must give the same output dtype
    as native PyTorch."""

    def add_constant(tensor):
        constant = torch.tensor([1, 2, 3], dtype=torch.float32)
        return constant + tensor

    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([3, 4, 8], dtype=dtype), add_constant)


@pytest.mark.parametrize("conv", [True, False])
def test_float16_activations_float32_weights(conv):
    """Check the output dtype follows the activation dtype for every
    combination of float16/float32 activations and weights."""
    torch.manual_seed(42)

    if conv:
        sample, model = torch.ones(1, 4, 4), torch.nn.Conv1d(4, 5, 2)
    else:
        sample, model = torch.ones(10), torch.nn.Linear(10, 20)

    # float32 activations, float32 weights
    pop_model = poptorch.inferenceModel(model)
    result = pop_model(sample)
    assert result.dtype == torch.float

    # float16 activations, float32 weights
    pop_model = poptorch.inferenceModel(model)
    result = pop_model(sample.half())
    assert result.dtype == torch.half

    # From here on the weights are float16.
    model.half()

    # float32 activations, float16 weights
    pop_model = poptorch.inferenceModel(model)
    result = pop_model(sample)
    assert result.dtype == torch.float

    # float16 activations, float16 weights
    pop_model = poptorch.inferenceModel(model)
    result = pop_model(sample.half())
    assert result.dtype == torch.half


def test_master_weight_training():
    """Train a single Linear layer on half inputs until it fits the target."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(10, 10)
            self.loss = torch.nn.MSELoss()

        def forward(self, data, target):
            prediction = self.linear(data)
            return prediction, self.loss(prediction, target)

    poptorch_model = poptorch.trainingModel(Model())

    target = torch.randn(10)
    sample = torch.randn(10).half()

    # The very first step must not already satisfy the final checks.
    first_out, first_loss = poptorch_model(sample, target.half())
    assert first_loss > 0.1
    assert not torch.allclose(first_out.float(),
                              target,
                              rtol=1e-02,
                              atol=1e-02)

    for _ in range(2500):
        out, loss = poptorch_model(sample, target.half())

    # After training the output must match the target.
    assert loss.float() < 0.001
    helpers.assert_allclose(actual=out.float(),
                            expected=target,
                            rtol=1e-02,
                            atol=1e-02)


def test_bigger_model_training():
    """Train a chain of five Linear layers on half inputs until it fits the
    target."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear_chain = torch.nn.Sequential(torch.nn.Linear(10, 10),
                                                    torch.nn.Linear(10, 10),
                                                    torch.nn.Linear(10, 10),
                                                    torch.nn.Linear(10, 10),
                                                    torch.nn.Linear(10, 10))
            self.loss = torch.nn.MSELoss()

        def forward(self, data, target):
            prediction = self.linear_chain(data)
            return prediction, self.loss(prediction, target)

    poptorch_model = poptorch.trainingModel(Model())

    target = torch.randn(10)
    sample = torch.randn(10).half()

    # The very first step must not already satisfy the final checks.
    first_out, first_loss = poptorch_model(sample, target.half())
    assert first_loss > 0.1
    assert not torch.allclose(first_out.float(),
                              target,
                              rtol=1e-02,
                              atol=1e-02)

    for _ in range(2500):
        out, loss = poptorch_model(sample, target.half())

    # After training the output must match the target.
    assert loss.float() < 0.001
    helpers.assert_allclose(actual=out.float(),
                            expected=target,
                            rtol=1e-02,
                            atol=1e-02)


================================================
FILE: tests/half_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import os  # pylint: disable=unused-import
import unittest.mock
import torch
import torchvision.models as models
import helpers
import poptorch


def test_half_float_default_option():
    """Adding a half tensor and a float tensor must give a float result,
    whichever operand comes first."""

    class SimpleAdder(torch.nn.Module):
        def forward(self, x, y):
            return x + y

    inference_model = poptorch.inferenceModel(SimpleAdder())

    half_operand = torch.tensor([1.]).half()
    float_operand = torch.tensor([2.]).float()

    result = inference_model(half_operand, float_operand)
    assert result.dtype == torch.float

    # Use a fresh model for the swapped operand order.
    inference_model = poptorch.inferenceModel(SimpleAdder())

    result = inference_model(float_operand, half_operand)
    assert result.dtype == torch.float


@unittest.mock.patch.dict("os.environ", helpers.disableSmallModel())
def test_resnet():
    """Run one training step of a half-precision ResNet18 and check that the
    output and the loss are both returned as half."""
    torch.manual_seed(42)

    image_input = torch.randn([1, 3, 224, 224]).half()
    label = torch.tensor([1.]).long()
    loss_fn = torch.nn.NLLLoss()

    class ModelWithLoss(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # The input is random, so untrained weights are good enough.
            self.base_model = models.resnet18(pretrained=False)

        def forward(self, data, target):
            prediction = self.base_model(data)
            return prediction, loss_fn(prediction, target)

    model = ModelWithLoss()
    model.train()
    model.half()

    training_model = poptorch.trainingModel(model)

    # Execute a single step on the IPU.
    poptorch_out, loss = training_model(image_input, label)

    assert poptorch_out.dtype == torch.half
    assert loss.dtype == torch.half


def test_model_with_weights():
    """A half Linear layer must return half on the IPU and agree with the
    float version of the same layer run natively."""
    model = torch.nn.Linear(1, 10).half()
    half_input = torch.tensor([1.]).half()

    ipu_out = poptorch.inferenceModel(model)(half_input)
    assert ipu_out.dtype == torch.half

    # The reference is computed natively in float.
    model = model.float()
    reference = model(half_input.float())

    helpers.assert_allclose(expected=reference,
                            actual=ipu_out.float(),
                            rtol=0.001,
                            atol=1e-04)


def test_simple_model():
    """A model mixing half and float inputs must keep each output's dtype and
    compute the right sums."""

    class SimpleAdder(torch.nn.Module):
        def forward(self, x, y, z, w):
            return x + y + 5, z + w + 5

    inference_model = poptorch.inferenceModel(SimpleAdder())

    half_inputs = (torch.tensor([1.]).half(), torch.tensor([2.]).half())
    float_inputs = (torch.tensor([3.]), torch.tensor([4.]))

    half_sum, float_sum = inference_model(*half_inputs, *float_inputs)

    assert half_sum.dtype == torch.half
    assert half_sum.float() == 8.0

    assert float_sum.dtype == torch.float
    assert float_sum == 12.0


def test_lstm():
    """A half-precision LSTM run on the IPU must return a half output."""
    torch.manual_seed(42)
    hidden_size = 5
    input_size = 3
    lstm = torch.nn.LSTM(input_size, hidden_size)
    lstm.half()
    ipu_lstm = poptorch.inferenceModel(lstm)

    steps = [torch.randn(1, input_size).half() for _ in range(5)]
    # Insert the extra second dimension.
    sequence = torch.cat(steps).view(len(steps), 1, -1)

    h0 = torch.randn(1, 1, hidden_size).half()
    c0 = torch.randn(1, 1, hidden_size).half()

    ipu_out = ipu_lstm(sequence, (h0, c0))
    assert isinstance(ipu_out[0], torch.HalfTensor)


def test_ipu_print_tensor():
    """poptorch.ipu_print_tensor must pass a float16 tensor through with its
    value and dtype unchanged."""

    class SimplePrinter(torch.nn.Module):
        def forward(self, x):
            return poptorch.ipu_print_tensor(x)

    half_one = torch.tensor([1.], dtype=torch.float16)
    inference_model = poptorch.inferenceModel(SimplePrinter())
    result = inference_model(half_one)

    assert result == 1.0
    assert result.dtype == torch.float16


def test_buffers():
    """BatchNorm running statistics returned from the IPU must match the
    values held on the host, within half precision."""
    torch.manual_seed(42)
    fake_data = torch.ones(1, 64, 10, 10).half()

    class M(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bn = torch.nn.BatchNorm2d(64)

            # Perturb the running statistics in place with random values.
            self.bn.running_mean += torch.randn(64)
            self.bn.running_var += torch.randn(64)

        def forward(self, i):
            return self.bn(i), self.bn.running_var, self.bn.running_mean

    model = M()

    # Keep references to the host statistics before the module is converted.
    cpu_mean, cpu_var = model.bn.running_mean, model.bn.running_var

    model.bn.half()
    model.bn.running_mean = model.bn.running_mean.to(torch.float)
    model.bn.running_var = model.bn.running_var.to(torch.float)

    poptorch_model = poptorch.inferenceModel(model)
    _, ipu_var, ipu_mean = poptorch_model(fake_data)

    # Some precision is lost in the half conversion, hence the tolerances.
    tolerances = {"rtol": 1e-02, "atol": 1e-02}
    helpers.assert_allclose(actual=ipu_mean,
                            expected=cpu_mean.half(),
                            **tolerances)

    helpers.assert_allclose(actual=ipu_var,
                            expected=cpu_var.half(),
                            **tolerances)


def test_half_casts_outplace():
    """Casting inputs to half must leave the returned originals as float32."""
    torch.manual_seed(42)
    opts = poptorch.Options()

    class Model(torch.nn.Module):
        def forward(self, x1, x2):
            return x1, x2, x1.to(torch.float16), x2.half()

    poptorch_model = poptorch.inferenceModel(Model(), opts)

    x1 = torch.tensor([0], dtype=torch.float32)
    x2 = torch.tensor([0], dtype=torch.float32)

    x1_ipu, x2_ipu, x1_cast, x2_cast = poptorch_model(x1, x2)

    # The originals keep their dtype...
    assert x1_ipu.dtype == torch.float32
    assert x2_ipu.dtype == torch.float32
    # ...and only the cast results are half.
    assert x1_cast.dtype == torch.float16
    assert x2_cast.dtype == torch.float16


def test_8bit_io_casting():
    """A uint8 input cast to half or float inside the model must give
    outputs of the corresponding dtype."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(1, 1)

        def forward(self, x):
            via_half = self.linear(x.half())
            via_to_half = self.linear(x.to(torch.half))
            via_float = self.linear(x.float())
            via_to_float = self.linear(x.to(torch.float))
            return via_half, via_to_half, via_float, via_to_float

    poptorch_model = poptorch.inferenceModel(Model())

    byte_input = torch.tensor([0], dtype=torch.uint8)
    outputs = poptorch_model(byte_input)

    assert outputs[0].dtype == torch.half
    assert outputs[1].dtype == torch.half
    assert outputs[2].dtype == torch.float
    assert outputs[3].dtype == torch.float


def test_buffers_without_parameters_can_be_traced():
    """A half model that has a buffer but no parameters must run on the IPU."""
    torch.manual_seed(0)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer("b", torch.randn(3, 3))

        def forward(self, x):
            return torch.matmul(self.b, x)

    half_model = Model()
    half_model.half()

    poptorch_model = poptorch.inferenceModel(half_model)
    half_input = torch.randn(3, 3).half()
    poptorch_model(half_input)


================================================
FILE: tests/helpers.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import functools
import os
import re
import torch
import poptorch
import poptorch.poptorch_core as poptorch_core  # type: ignore

# Will be changed by conftest.py if pytest is only collecting tests
is_running_tests = True
# Will be changed by conftest.py if pytest is running reduced testing.
# Read by selectIfReduced() and onlyFirstIfReduced() to shrink the set of
# values they return.
running_reduced_testing = False


def selectIfReduced(reduced_set, full_set):
    """Return `reduced_set` when reduced testing is enabled, `full_set`
    otherwise."""
    return reduced_set if running_reduced_testing else full_set


def onlyFirstIfReduced(full_set):
    """Return a 1-tuple holding the first element of `full_set` when reduced
    testing is enabled, `full_set` itself otherwise."""
    if not running_reduced_testing:
        return full_set
    first = full_set[0]
    return (first, )


def assert_allclose(*,
                    actual=None,
                    expected=None,
                    check_dtype=False,
                    atol=None,
                    rtol=None,
                    **kwargs):
    """Assertion function that enforces passing the 'actual' and 'expected'
    arguments to torch.testing.assert_close in the correct order by forcing
    the use of keyword arguments. This improves error reporting in case of
    assertion failures.

    When both arguments are lists (or both are tuples) the elements are
    compared pairwise, using the same tolerances and dtype checking as the
    top-level call.

    :param actual: torch.Tensor, scalar value, or array-like of either
            torch.Tensor objects or scalar values that is tested.
    :param expected: torch.Tensor, scalar value, or array-like of either
            torch.Tensor objects or scalar values that is tested.
    :param check_dtype: whether to check the types of the tensor
    :param atol: absolute tolerance. If None and 'expected' is float16,
            5e-4 is used.
    :param rtol: relative tolerance. If None and 'expected' is float16,
            5e-3 is used.
    :param kwargs: kwargs passed to torch.testing.assert_close.
    """
    assert actual is not None and expected is not None, (
        "'actual' and 'expected' keyword arguments must be present")

    in_types = (type(actual), type(expected))
    if in_types == (torch.Tensor, torch.Tensor):
        assert actual.shape == expected.shape, (
            "Shape of 'actual' (%s) should be the same as shape of"
            " 'expected' (%s)") % (actual.shape, expected.shape)
    elif in_types in ((list, list), (tuple, tuple)):
        assert len(actual) == len(expected), (
            "Length of 'actual' (%s) should be the same as length of"
            " 'expected' (%s)") % (len(actual), len(expected))
        for a, e in zip(actual, expected):
            # Forward the named options too: they are not part of **kwargs,
            # so omitting them would silently fall back to the defaults.
            assert_allclose(actual=a,
                            expected=e,
                            check_dtype=check_dtype,
                            atol=atol,
                            rtol=rtol,
                            **kwargs)
        return

    if not isinstance(actual, torch.Tensor):
        actual = torch.tensor(actual)
    if not isinstance(expected, torch.Tensor):
        expected = torch.tensor(expected)

    # Looser default tolerances for half precision comparisons.
    if atol is None and expected.dtype == torch.float16:
        atol = 5e-4
    if rtol is None and expected.dtype == torch.float16:
        rtol = 5e-3

    torch.testing.assert_close(actual,
                               expected,
                               atol=atol,
                               rtol=rtol,
                               check_dtype=check_dtype,
                               **kwargs)


def assert_allequal(*,
                    actual=None,
                    expected=None,
                    msg='',
                    check_dtype=False,
                    **kwargs):
    """Assertion function that enforces passing the 'actual' and 'expected'
    arguments to torch.testing.assert_close in the correct order by forcing
    the use of keyword arguments. This improves error reporting in case of
    assertion failures. Additionally, rtol=0 and atol=0 are passed to
    torch.testing.assert_close as this results in identity comparison for
    integer and boolean tensors.

    :param actual: torch.Tensor, scalar value, or array-like of either
            torch.Tensor objects or scalar values that is tested.
    :param expected: torch.Tensor, scalar value, or array-like of either
            torch.Tensor objects or scalar values that is tested.
    :param msg: message passed to torch.testing.assert_close. An empty
            message (the default) is passed as None so that the mismatch
            report generated by torch.testing.assert_close is kept.
    :param check_dtype: whether to check the types of the tensor
    :param kwargs: kwargs passed to torch.testing.assert_close.
    """
    assert actual is not None and expected is not None, (
        "'actual' and 'expected' keyword arguments must be present")

    if isinstance(actual, torch.Tensor) and isinstance(expected, torch.Tensor):
        assert actual.shape == expected.shape, (
            "Shape of 'actual' (%s) should be the same as shape of"
            " 'expected' (%s)") % (actual.shape, expected.shape)

    # An empty string would replace the generated mismatch description with
    # a blank error message, so it is translated to "no custom message".
    torch.testing.assert_close(actual,
                               expected,
                               rtol=0,
                               atol=0,
                               msg=msg or None,
                               check_dtype=check_dtype,
                               **kwargs)


def disableSmallModel():
    """Return the environment overrides selecting the full IPU model when no
    IPU hardware is available; an empty dict otherwise."""
    overrides = {}
    if not poptorch.ipuHardwareIsAvailable():
        # POPTORCH_IPU_MODEL takes precedence over POPTORCH_SMALL_IPU_MODEL
        overrides["POPTORCH_IPU_MODEL"] = "1"
    return overrides


def forceSmallModel():
    """Return the environment overrides that turn the IPU model off and the
    small IPU model on."""
    # POPTORCH_IPU_MODEL takes precedence over POPTORCH_SMALL_IPU_MODEL, so
    # it is explicitly set to "0".
    return dict(POPTORCH_IPU_MODEL="0", POPTORCH_SMALL_IPU_MODEL="1")


def disableAllModels():
    """Return the environment overrides that turn off both the IPU model and
    the small IPU model."""
    return dict(POPTORCH_IPU_MODEL="0", POPTORCH_SMALL_IPU_MODEL="0")


def propagateInputShapes(graph, dummyInputs):
    """Set the type of each graph input from the matching dummy input, then
    ask poptorch_core to propagate the shapes through the graph."""
    paired_inputs = zip(graph.inputs(), dummyInputs)
    for graph_input, dummy in paired_inputs:
        graph_input.inferTypeFrom(dummy)
    poptorch_core.propagateInputShapes(graph)


# Wrapper model with weights to test that gradients are generated
# and updated in a graph with a given op - Linear layer added to
# ensure some weights exist
class ModelWithWeights(torch.nn.Module):
    """Module running `op` behind a Linear layer and returning a loss.

    :param op: callable applied to the transformed first input followed by
            any remaining inputs.
    :param first_input_shape: shape of the first input; the Linear layer is
            square with that many elements.
    :param out_fn: optional function selecting, from the op's result, what
            the loss is computed on (for example only the values when the op
            returns values and indices). Defaults to the whole result.
    :param loss_fn: optional loss function. A loss needing extra parameters
            (e.g. a target) must be wrapped so it only takes the selected
            output. Defaults to a summed identity loss of the square.
    """

    def __init__(self, op, first_input_shape, out_fn=None, loss_fn=None):
        super().__init__()
        self.op = op
        self.first_input_shape = first_input_shape

        num_elements = first_input_shape.numel()
        self.lin = torch.nn.Linear(num_elements, num_elements)
        # Snapshot of the initial weights, compared against after training.
        self._weights_before = self.lin.weight.detach().clone()

        self.out_fn = out_fn

        if loss_fn is None:

            def loss_fn(x):
                return poptorch.identity_loss(x**2, reduction='sum')

        self.loss_fn = loss_fn

    def forward(self, xs):
        """Return `(op result, loss)` for the tuple of inputs `xs`.

        The first input is flattened, passed through the Linear layer and
        reshaped back before being handed to the op with the other inputs.
        """
        assert isinstance(xs, tuple)
        first, rest = xs[0], xs[1:]
        transformed = self.lin(torch.flatten(first))
        transformed = transformed.reshape(self.first_input_shape)
        x = self.op(transformed, *rest)

        loss_in = self.out_fn(x) if self.out_fn is not None else x
        # A tuple selection is unpacked into separate loss arguments.
        if isinstance(loss_in, tuple):
            loss = self.loss_fn(*loss_in)
        else:
            loss = self.loss_fn(loss_in)
        return x, loss

    def assert_weights_changed(self):
        """Assert the Linear weights differ from their initial values."""
        current_weights = self.lin.weight.detach().clone()
        assert not torch.allclose(self._weights_before, current_weights)


class PrintCapfdOnExit:
    """Context manager printing whatever capfd captured when it exits.

    Handy when a test fails before reaching its output validation step."""

    def __init__(self, capfd):
        self.capfd = capfd

    def __enter__(self):
        return None

    def __exit__(self, type, value, traceback):
        captured_out, captured_err = self.capfd.readouterr()
        captured = captured_out + captured_err
        # Capturing is switched off so the text reaches the real output.
        with self.capfd.disabled():
            if not captured:
                return
            # Characters that cannot be encoded as ASCII are dropped.
            print(captured.encode("ascii", "ignore").decode())


def printCapfdOnExit(func):
    """Decorator printing the content of capfd once the wrapped function has
    exited. The wrapped function receives capfd as a keyword argument."""

    @functools.wraps(func)
    def wrapper(capfd, *args, **kwargs):
        printer = PrintCapfdOnExit(capfd)
        with printer:
            func(*args, capfd=capfd, **kwargs)

    return wrapper


def overridePoptorchLogLevel(level=None):
    """Decorator to override the PopTorch log level for the duration of the test

    :param level: log level passed to poptorch.setLogLevel before the test
            runs. If None the level is left untouched before the test.

    After the test, whether it passed or raised, the level is reset to the
    value of the POPTORCH_LOG_LEVEL environment variable ("WARN" if unset).
    The wrapped function's return value is passed through.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if level is not None:
                poptorch.setLogLevel(level)
            try:
                return func(*args, **kwargs)
            finally:
                # Restore even when the test fails so the override cannot
                # leak into the tests that run afterwards.
                poptorch.setLogLevel(
                    os.environ.get("POPTORCH_LOG_LEVEL", "WARN"))

        return wrapper

    return decorator


def overridePopartLogLevel(level=None):
    """Decorator to override the Popart log level for the duration of the test

    :param level: log level passed to poptorch._logging.setPopartLogLevel
            before the test runs. If None the level is left untouched before
            the test.

    After the test, whether it passed or raised, the level is reset to the
    value of the POPART_LOG_LEVEL environment variable ("WARN" if unset).
    The wrapped function's return value is passed through.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if level is not None:
                poptorch._logging.setPopartLogLevel(level)  # pylint: disable=protected-access
            try:
                return func(*args, **kwargs)
            finally:
                # Restore even when the test fails so the override cannot
                # leak into the tests that run afterwards.
                poptorch._logging.setPopartLogLevel(  # pylint: disable=protected-access
                    os.environ.get("POPART_LOG_LEVEL", "WARN"))

        return wrapper

    return decorator


class LogIterator:
    """Forward-only cursor over the lines of a log.

    Each search resumes from the line following the previous match, so
    successive calls check that lines appear in a given order.
    """

    def __init__(self, lines):
        self._lines = lines
        self._current = 0
        self._num_lines = len(lines)
        # Every set of expressions searched for so far, kept for reporting.
        self._all_checks = []

    def lineNumber(self):
        """Return the index of the next line that will be examined."""
        return self._current

    def findNext(self, *exprs):
        """Find the next line in the log matching all the regular expressions provided"""
        self._all_checks.append(exprs)
        found = self._findNext(exprs)
        assert found is not None, (
            "\n".join(self._lines) +
            "\n The log above doesn't contain lines matching all "
            "these expressions:\n  " +
            "\n  ".join(str(e) for e in self._all_checks))
        return found

    def _findNext(self, exprs):
        # The cursor moves past every line examined, matching or not.
        for candidate in self._lines[self._current:self._num_lines]:
            self._current += 1
            if all(re.search(e, candidate) for e in exprs):
                return candidate
        return None

    def assert_not_contains(self, *exprs):
        """Raise ValueError if a remaining line matches all the expressions.

        The search moves the cursor just like findNext does.
        """
        offending = self._findNext(exprs)
        if offending is None:
            return
        raise ValueError(f"{offending}"
                         "\n The line above matches all of the expressions "
                         f"{exprs}")

    def findAll(self, expr):
        """Return all lines in the log matching the provided regular expression"""
        # The whole log is searched, regardless of the cursor position, and
        # the match objects (not the lines) are what is returned.
        candidates = (re.search(expr, line) for line in self._lines)
        return [match for match in candidates if match is not None]


class LogChecker:
    """Assertions over a captured log.

    The log can be given as a string, as an object with a `text` attribute,
    or as an object with a `readouterr()` method returning `(out, err)`.
    """

    def __init__(self, capfd_or_str):
        if isinstance(capfd_or_str, str):
            log = capfd_or_str
        elif hasattr(capfd_or_str, "text"):
            log = capfd_or_str.text
        elif hasattr(capfd_or_str, "readouterr"):
            captured_out, captured_err = capfd_or_str.readouterr()
            log = captured_out + captured_err
        else:
            raise TypeError("LogChecker passed unsupported capture fixture")
        self._log = log
        self._lines = log.split('\n')

    def createIterator(self):
        """Return a LogIterator over the lines of the log."""
        return LogIterator(self._lines)

    def assert_isEmpty(self):
        """Assert that the log is empty."""
        assert not self._log, f"Expected an empty log but got {self._log}"

    def assert_contains(self, *strings):
        """Assert there is a line in the log matching all the strings provided
        """
        if len(strings) == 1:
            # A single string is searched for in the log as a whole.
            needle = strings[0]
            assert needle in self._log, (f"{self._log}"
                                         "\ndoes not contain "
                                         f"'{needle}'")
            return

        def has_all(line):
            return all(s in line for s in strings)

        assert any(map(has_all, self._lines)), (
            f"{self._log}"
            "\n No line in the above log contains all of the strings "
            f"{strings}")

    def assert_contains_after(self, string, after):
        """Assert there is a line in the log matching the string provided, at
           least one after the the line containing the other provided string"""
        remaining = iter(self._lines)
        # Consume lines up to and including the first one holding 'after';
        # only the lines left over are searched for 'string'.
        marker_seen = any(after in line for line in remaining)
        if marker_seen and any(string in line for line in remaining):
            return

        raise AssertionError(f"Did not contain {string} after {after}")

    def assert_not_contains(self, *strings):
        """Assert there is no line in the log matching all the strings provided
        """
        if len(strings) == 1:
            # A single string is searched for in the log as a whole.
            needle = strings[0]
            assert needle not in self._log, (f"{self._log}"
                                             "\ncontains "
                                             f"'{needle}'")
            return

        for line in self._lines:
            if not all(s in line for s in strings):
                continue
            # This line contains every one of the strings.
            raise ValueError(f"{line}"
                             "\n The line above matches all of the strings "
                             f"{strings}")

    def _string_matches_exprs(self, s, exprs):
        # True when every regular expression is found somewhere in s.
        return all(re.search(e, s) for e in exprs)

    def assert_matches(self, *exprs, per_line=True):
        """Assert the log matches all the regular expressions provided
        """
        if per_line:
            # A single line has to match all the expressions.
            matched = any(
                self._string_matches_exprs(line, exprs)
                for line in self._lines)
        else:
            # The expressions are searched for in the entire log at once.
            matched = self._string_matches_exprs(self._log, exprs)
        if matched:
            return

        any_line_in = "any line in " if per_line else ""
        raise ValueError(
            f"{self._log}"
            f"\n All of the expressions do not match {any_line_in}"
            f"the log {exprs}")

    def assert_no_matches(self, *exprs, per_line=True):
        """Assert the log does not match all the regular expressions provided"""
        if not per_line:
            if self._string_matches_exprs(self._log, exprs):
                # The log as a whole matches all the expressions.
                raise ValueError(
                    f"{self._log}"
                    "\n The log above matches all of the expressions "
                    f"{exprs}")
            return

        for line in self._lines:
            if self._string_matches_exprs(line, exprs):
                # This line matches all the expressions.
                raise ValueError(
                    f"{line}"
                    "\n The line above matches all of the expressions "
                    f"{exprs}")

    def findall(self, pattern: str) -> list:
        """Return the result of re.findall for `pattern` over the whole log."""
        return re.findall(pattern, self._log)


# When we're running on the CPU we don't need to specify a device
# but for IPU devices we need to make sure the output buffers are
# created on the IPU.
def outputDevice():
    """Return the device on which output buffers should be created.

    Returns:
        ``"ipu"`` when running on the IPU while dispatch tracing is active,
        ``None`` otherwise (no explicit device needed).
    """
    tracing_on_ipu = (
        poptorch.isRunningOnIpu()
        and poptorch._impl.isDispatchTracing())  # pylint: disable=protected-access
    return "ipu" if tracing_on_ipu else None


================================================
FILE: tests/hooks_test.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import pytest
import torch
import torch.nn as nn

from poptorch import (inferenceModel, registerPostCompileHook,
                      registerPreCompileHook)


class Model(nn.Module):
    """Identity module used as the compilation target by the hook tests."""

    def forward(self, input):
        # No computation: the input is returned unchanged.
        return input


def test_precompile_and_postcompile_hooks():
    """Test that registered pre and post compile hooks are called."""
    model = Model()

    # Mutable record of which hooks have fired so far.
    fired = {"pre": False, "post": False}

    def precompile():
        fired["pre"] = True

    def postcompile():
        fired["post"] = True

    registerPreCompileHook(precompile)
    registerPostCompileHook(postcompile)

    # The first call triggers compilation, which should invoke both hooks.
    executor = inferenceModel(model)
    executor(torch.randn((10, 10), dtype=torch.float32))
    assert fired["pre"] and fired["post"]


def test_non_callable():
    """Test that an error is raised if a non-callable
    is attempted to be registered"""
    # (registration function, non-callable argument) pairs, checked in order.
    cases = (
        (registerPreCompileHook, 2),
        (registerPostCompileHook, False),
    )
    for register, not_callable in cases:
        with pytest.raises(RuntimeError, match="must be callable"):
            register(not_callable)


def test_called_in_order():
    """Test that hooks are called in the order they were registered in."""
    expected_calls = [1, 2, 3]
    calls = []

    def make_hook(marker):
        # Each hook records its own marker when it is invoked.
        def hook():
            calls.append(marker)

        return hook

    for marker in expected_calls:
        registerPreCompileHook(make_hook(marker))

    executor = inferenceModel(Model())
    executor(torch.randn((10, 10), dtype=torch.float32))

    assert calls == expected_calls


def test_can_remove():
    """Test that a hook is correctly removed via Torch's RemovableHandle."""
    invocations = []

    def hook():
        invocations.append(True)

    # Register then immediately unregister the hook through its handle.
    registerPostCompileHook(hook).remove()

    executor = inferenceModel(Model())
    executor(torch.randn((10, 10), dtype=torch.float32))

    assert not invocations


================================================
FILE: tests/if_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

from functools import lru_cache
import torch
import pytest
from helpers import assert_allclose
import poptorch


@lru_cache(maxsize=None)
def infer_model(model):
    """Wrap ``model`` with ``poptorch.inferenceModel``.

    The result is memoised per ``model`` argument by ``lru_cache``, so
    repeated calls with the same module return the same wrapper.
    """
    wrapped = poptorch.inferenceModel(model)
    return wrapped


def if_else_harness(model, expected_then, expected_else, *args):
    """Run ``model`` with a True and then a False condition and check both the
    native and the PopTorch results against the expected values.

    Args:
        model: module whose first forward argument is the condition tensor.
        expected_then: expected output when the condition is True.
        expected_else: expected output when the condition is False.
        *args: remaining forward arguments, passed after the condition.
    """
    inference_model = infer_model(model)

    branches = ((True, expected_then), (False, expected_else))
    for flag, expected in branches:
        condition = torch.tensor([flag])
        ipu_result = inference_model(condition, *args)
        cpu_result = model(condition, *args)
        assert_allclose(expected=expected, actual=cpu_result)
        assert_allclose(expected=expected, actual=ipu_result)


@pytest.mark.skip(
    reason="Returning constant from model does not work in poptorch (AFS-251)")
def test_constants():
    """Both branches of poptorch.cond return a constant tensor (skipped)."""

    class Model(torch.nn.Module):
        def forward(self, condition):
            def body_then():
                return torch.tensor([0])

            def body_else():
                return torch.tensor([1])

            # Neither branch takes inputs; only the first output is returned.
            return poptorch.cond(condition, body_then, [], body_else, [])[0]

    # args[0] == tensor([0]) is expected for "then", args[1] for "else".
    args = [torch.tensor([v]) for v in range(2)]
    if_else_harness(Model(), args[0], args[1])


@pytest.mark.skip(
    reason="Returning constant from model does not work in poptorch (AFS-251)")
def test_operations_on_constants():
    """Branches compute on tensors built from constants inside forward
    (skipped)."""
    constants = [[1., 2.], [3., 4.]]

    class Model(torch.nn.Module):
        def forward(self, condition):
            x = torch.tensor(constants[0])
            y = torch.tensor(constants[1])

            def body_then(a, b):
                a = a * 2
                b = a * b
                return b

            def body_else(a, b):
                a = a - 2
                b = b + a
                return b

            return poptorch.cond(condition, body_then, [x, y], body_else,
                                 [x, y])[0]

    # The model takes no inputs besides the condition.
    args = []
    # Expected values mirror body_then / body_else element by element.
    exp_then = torch.tensor(
        [a * 2 * b for a, b in zip(constants[0], constants[1])])
    exp_else = torch.tensor(
        [a - 2 + b for a, b in zip(constants[0], constants[1])])
    if_else_harness(Model(), exp_then, exp_else, *args)


@pytest.mark.skip(reason="Inplace op does not update model input (AFS-252)")
def test_inplace_op():
    """The "then" branch applies an in-place add to its argument (skipped)."""

    class Model(torch.nn.Module):
        def forward(self, condition, x, y):
            def body_then(a):
                return a.add_(a)

            def body_else(b):
                return b

            return poptorch.cond(condition, body_then, [x], body_else, [y])[0]

    or_x = 1.
    x = torch.tensor([or_x])
    y = torch.tensor([10.])
    # NOTE(review): body_then doubles its argument in place, whereas exp_then
    # is x + y - confirm the intended expectation when re-enabling this test.
    exp_then = x + y
    exp_else = y
    if_else_harness(Model(), exp_then, exp_else, x, y)
    assert torch.tensor([or_x]) == x


def test_operation_expecting_constant():
    """Use an op taking a constant argument (``torch.topk`` with k=2) inside
    a poptorch.cond branch."""
    constant = [1.1, 2.3]

    class Model(torch.nn.Module):
        def forward(self, condition, z):
            x = torch.tensor(constant)

            def body_then(a, b):
                b = a * torch.topk(b, 2)[0]
                return b

            def body_else(a, b):
                a = a - 2
                b = b[:2] + a
                return b

            return poptorch.cond(condition, body_then, [x, z], body_else,
                                 [x, z])[0]

    arg = torch.rand(4)
    # `arg` is already a tensor: use it directly instead of copy-constructing
    # it with torch.tensor(arg), which emits a UserWarning and adds nothing.
    exp_then = torch.topk(arg, 2)[0] * torch.tensor(constant)
    exp_else = torch.tensor(constant) - 2 + arg[:2]
    if_else_harness(Model(), exp_then, exp_else, arg)


def test_body_args():
    """Each branch receives a different model input as its single argument."""

    class Model(torch.nn.Module):
        def forward(self, condition, x, y):
            def body_then(a):
                out = a + a
                out = out + out
                return out

            def body_else(b):
                return b

            return poptorch.cond(condition, body_then, [x], body_else, [y])[0]

    args = [torch.rand(1) for _ in range(2)]
    # "then" computes (x + x) + (x + x) == 4 * x; "else" returns y unchanged.
    if_else_harness(Model(), args[0] * 4, args[1], *args)


def test_cond_training():
    """Running poptorch.cond in a training model must raise poptorch.Error."""

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.layer1 = torch.nn.Linear(4, 4)

        def forward(self, condition, x):
            def body(x):
                return self.layer1(x)

            # The same body is used for both branches.
            out = poptorch.cond(condition, body, [x], body, [x])[0]
            loss = poptorch.identity_loss(out, reduction='sum')
            return out, loss

    training_model = poptorch.trainingModel(Model())

    condition = torch.tensor([True])
    x = torch.ones(1, 4).to(torch.float)
    with pytest.raises(
            poptorch.Error,
            match=r"poptorch.cond\(\) is only supported in inference"):
        training_model(condition, x)


def test_multi_outs():
    """Both branches return three outputs and all of them are checked."""

    class Model(torch.nn.Module):
        def forward(self, condition, x, y):
            def body_then(a):
                # x and y are captured from the enclosing forward scope.
                out1 = x + y
                return a, out1, y

            def body_else(b):
                return b * y, y, x - y

            # All outputs are returned (no [0] indexing here).
            return poptorch.cond(condition, body_then, [x], body_else, [y])

    args = [torch.rand(1) for _ in range(2)]
    exp_then = [args[0], args[0] + args[1], args[1]]
    exp_else = [args[1] * args[1], args[1], args[0] - args[1]]
    if_else_harness(Model(), exp_then, exp_else, *args)


def test_diff_num_of_args():
    """The two branches take a different number of arguments."""

    class Model(torch.nn.Module):
        def forward(self, condition, x, y):
            def body_then(x, y):
                return x + y

            def body_else(x):
                return x

            return poptorch.cond(condition, body_then, [x, y], body_else,
                                 [x])[0]

    args = [torch.rand(1) for v in range(2)]
    exp_then = args[0] + args[1]
    # The "else" branch returns x untouched.
    if_else_harness(Model(), exp_then, args[0], *args)


def test_args_from_main_graph():
    """Branches take no explicit arguments and only use tensors captured from
    the enclosing forward scope."""

    class Model(torch.nn.Module):
        def forward(self, condition, x, y):
            def body_then():
                return x * y

            def body_else():
                return x

            return poptorch.cond(condition, body_then, [], body_else, [])[0]

    args = [torch.rand(1) for v in range(2)]
    exp_then = args[0] * args[1]
    if_else_harness(Model(), exp_then, args[0], *args)


def test_call_outer_body():
    """A branch body delegates to another function defined in forward."""

    class Model(torch.nn.Module):
        def forward(self, condition, x, y):
            def outer_body():
                return x + y

            def body_then():
                # Calls a helper that is not itself passed to poptorch.cond.
                return outer_body()

            def body_else():
                return x

            return poptorch.cond(condition, body_then, [], body_else, [])[0]

    args = [torch.rand(1) for v in range(2)]
    exp_then = args[0] + args[1]
    if_else_harness(Model(), exp_then, args[0], *args)


def test_args_internal():
    """Branch arguments are tensors created inside forward, combined with a
    model input captured from the enclosing scope."""
    internal_inps = [[10., -10.], [0., -2.]]

    class Model(torch.nn.Module):
        def forward(self, *args):
            condition = args[0]
            x = args[1]

            def body_then(a, b):
                return x + a + b

            def body_else(a):
                return a + x

            # in1 is shared by both branches; the second "then" argument is
            # built inline in the argument list.
            in1 = torch.tensor(internal_inps[0])
            return poptorch.cond(condition, body_then,
                                 [in1, torch.tensor(internal_inps[1])],
                                 body_else, [in1])[0]

    input_val = [5., -1.]
    args = [torch.tensor(input_val)]
    exp_then = args[0] + torch.tensor(internal_inps[0]) + torch.tensor(
        internal_inps[1])
    exp_else = torch.tensor(internal_inps[0]) + args[0]
    if_else_harness(Model(), exp_then, exp_else, *args)


def test_single_body():
    """The same body function is used for both branches, with different
    argument lists."""

    class Model(torch.nn.Module):
        def forward(self, condition, x, y):
            def body(a, b):
                return a + b

            return poptorch.cond(condition, body, [x, y], body, [x, x])[0]

    args = [torch.rand(1) for _ in range(2)]
    # The sums are already tensors: wrapping them in torch.tensor() would only
    # copy them and emit a UserWarning, so they are used directly.
    exp_then = args[0] + args[1]
    exp_else = args[0] + args[0]
    if_else_harness(Model(), exp_then, exp_else, *args)


def test_nested_cond():
    """poptorch.cond calls nested inside the branches of another
    poptorch.cond, with the outer cond evaluated twice and summed."""

    class Model(torch.nn.Module):
        def forward(self, condition, cond_nested, x, y):
            def body_then():
                def nested_then(x, y):
                    return x + y

                def nested_else():
                    return x - y

                return poptorch.cond(cond_nested, nested_then, [x, y],
                                     nested_else, [])[0]

            def body_else(cond_nested):
                # The nested condition is inverted in the outer "else" branch.
                cond_nested = torch.logical_not(cond_nested)

                def nested_then(y):
                    return x * y

                def nested_else():
                    return x * 2

                return poptorch.cond(cond_nested, nested_then, [y],
                                     nested_else, [])[0]

            res1 = poptorch.cond(condition, body_then, [], body_else,
                                 [cond_nested])[0]
            re2 = poptorch.cond(condition, body_then, [], body_else,
                                [cond_nested])[0]
            return res1 + re2

    model = Model()
    # args = [cond_nested, x, y]; the factor 2 in the expectations comes from
    # summing the two identical outer cond results.
    cond_nested = torch.tensor([True])
    args = [cond_nested] + [torch.rand(1) for v in range(2)]
    exp_then = 2 * (args[1] + args[2])
    exp_else = 2 * args[1] * 2
    if_else_harness(model, exp_then, exp_else, *args)

    cond_nested = torch.tensor([False])
    args = [cond_nested] + [torch.rand(1) for v in range(2)]
    exp_then = 2 * (args[1] - args[2])
    exp_else = 2 * (args[1] * args[2])
    if_else_harness(model, exp_then, exp_else, *args)


@pytest.mark.parametrize(
    ("execution_strategy"),
    [
        poptorch.ShardedExecution,
        poptorch.ParallelPhasedExecution,
        poptorch.SerialPhasedExecution,
    ],
)
def test_if_on_multiple_ipus(execution_strategy):
    """Two poptorch.cond calls placed in different poptorch.Block scopes, run
    under each parametrized execution strategy.

    Only the True outer condition is exercised here.
    """

    class Model(torch.nn.Module):
        def forward(self, condition, x, y):
            def body_then(x, y):
                return x + y, y

            def body_else(x, y):
                return x, x * y

            with poptorch.Block("0", ipu_id=0):
                x, y = poptorch.cond(condition, body_then, [x, y], body_else,
                                     [x, y])

            # The second cond uses the inverted condition, so it takes the
            # opposite branch of the first one.
            with poptorch.Block("1", ipu_id=1):
                x, y = poptorch.cond(torch.logical_not(condition), body_then,
                                     [x, y], body_else, [x, y])
            return x, y

    # One stage per block name ("0" and "1").
    stages = [poptorch.Stage(f"{k}") for k in range(0, 2)]
    strategy = execution_strategy(*stages)

    opts = poptorch.Options()
    opts.autoRoundNumIPUs(True)
    opts.setExecutionStrategy(strategy)
    ipu_model = poptorch.inferenceModel(Model(), opts)

    x = torch.tensor([1., 2.])
    y = torch.tensor([3., 4.])

    condition = torch.tensor([True])
    ipu_res = ipu_model(condition, x, y)
    # First cond (then): (x + y, y); second cond (else): (x', x' * y').
    exp_res = (x + y, (x + y) * y)
    for a, b in zip(ipu_res, exp_res):
        assert all(a == b)


================================================
FILE: tests/index_ops_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import torch
import pytest

import helpers
import poptorch


def index_op0(t, idx, v=None):
    """Index ``t[idx]``: read it when ``v`` is None, otherwise assign ``v``
    in place and return ``t``."""
    if v is not None:
        t[idx] = v
        return t
    return t[idx]


def index_op1(t, idx, v=None):
    """Index ``t[idx, idx]``: read it when ``v`` is None, otherwise assign
    ``v`` in place and return ``t``."""
    if v is not None:
        t[idx, idx] = v
        return t
    return t[idx, idx]


def index_op2(t, idx, v=None):
    """Index ``t[:, idx]``: read it when ``v`` is None, otherwise assign
    ``v`` in place and return ``t``."""
    if v is not None:
        t[:, idx] = v
        return t
    return t[:, idx]


def index_op3(t, idx, v=None):
    """Index ``t[idx, :, idx]``: read it when ``v`` is None, otherwise assign
    ``v`` in place and return ``t``."""
    if v is not None:
        t[idx, :, idx] = v
        return t
    return t[idx, :, idx]


def index_op4(t, idx, v=None):
    """Index ``t[:, :, idx]``: read it when ``v`` is None, otherwise assign
    ``v`` in place and return ``t``."""
    if v is not None:
        t[:, :, idx] = v
        return t
    return t[:, :, idx]


def index_op5(t, idx, v=None):
    """Index ``t[:, idx, idx]``: read it when ``v`` is None, otherwise assign
    ``v`` in place and return ``t``."""
    if v is not None:
        t[:, idx, idx] = v
        return t
    return t[:, idx, idx]


def index_op6(t, idx, v=None):
    """Index ``t[idx, idx, idx, idx]``: read it when ``v`` is None, otherwise
    assign ``v`` in place and return ``t``."""
    if v is not None:
        t[idx, idx, idx, idx] = v
        return t
    return t[idx, idx, idx, idx]


def index_op7(t, idx, v=None):
    """Index ``t[:, :, :, idx]``: read it when ``v`` is None, otherwise assign
    ``v`` in place and return ``t``."""
    if v is not None:
        t[:, :, :, idx] = v
        return t
    return t[:, :, :, idx]


def index_op8(t, idx, v=None):
    """Index ``t[:, idx, :, idx]``: read it when ``v`` is None, otherwise
    assign ``v`` in place and return ``t``."""
    if v is not None:
        t[:, idx, :, idx] = v
        return t
    return t[:, idx, :, idx]


def index_harness(op, idx, is_index_put, v=None, is_mask=False):
    """Run ``op`` natively and through a PopTorch training model and compare.

    Args:
        op: indexing function taking ``(t, idx)`` or ``(t, idx, v)``.
        idx: index values; converted with ``torch.tensor`` unless ``is_mask``.
        is_index_put: when True, ``op`` is called with a value to write.
        v: value to write; defaults to zeros shaped like ``op(t, idx)``.
        is_mask: when True, ``idx`` is used as is (already a tensor).
    """
    torch.manual_seed(42)
    t = torch.randn(2, 3, 4, 5)
    idx_tensor = idx if is_mask else torch.tensor(idx)

    model = helpers.ModelWithWeights(op, t.shape)
    # The LR should be large enough to guarantee weights change
    optim = torch.optim.AdamW(model.parameters(), lr=0.1)
    poptorch_model = poptorch.trainingModel(model, optimizer=optim)

    if not is_index_put:
        native_out, _ = model((t, idx_tensor))
        poptorch_out, _ = poptorch_model((t, idx_tensor))
    else:
        if v is None:
            v = torch.zeros_like(op(t, idx_tensor))
        # Clone the tensor so that the original is unchanged by the in-place op
        native_out, _ = model((t.clone(), idx_tensor, v))
        poptorch_out, _ = poptorch_model((t, idx_tensor, v))

    # Inference test - check outputs
    helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()


# Every indexing pattern exercised by the parametrized index tests.
index_ops = [
    index_op0,
    index_op1,
    index_op2,
    index_op3,
    index_op4,
    index_op5,
    index_op6,
    index_op7,
    index_op8,
]

# Index values of increasing nesting; index_harness turns each of them into
# a tensor with torch.tensor before use.
index_indices = ([0], [[1]], [0, 1], [[1, 0]], [[0, 1], [1, 0]])


@pytest.mark.parametrize("idxs", index_indices)
@pytest.mark.parametrize("op", index_ops)
def test_index(op, idxs):
    """Read through every indexing pattern with every index value."""
    index_harness(op, idxs, is_index_put=False)


def test_index_bool_mask_failure():
    """Reading with a boolean mask index is expected to raise
    DynamicOutputShapeException."""
    expected_error = torch._subclasses.fake_tensor.DynamicOutputShapeException  # pylint: disable=protected-access
    with pytest.raises(expected_error):
        index_harness(index_ops[0], [True, False], is_index_put=False)


def test_index_on_max_indices():
    """Index a tensor with the argmax indices returned by torch.max."""

    def op(x):
        # torch.max along a dim returns (values, indices); only the indices
        # are used, as the index into dim 1.
        _, argmax_tensor = torch.max(x, dim=1)
        b = x[:, argmax_tensor]
        return b, argmax_tensor

    inp_tensor = torch.rand(1, 10, 2)

    # The extra lambda picks the first of op's two outputs.
    model = helpers.ModelWithWeights(op, inp_tensor.shape, lambda x: x[0])
    poptorch_model = poptorch.trainingModel(model)

    native_out, _ = model((inp_tensor, ))
    poptorch_out, _ = poptorch_model((inp_tensor, ))

    # Inference test - check outputs
    for native, pop in zip(native_out, poptorch_out):
        helpers.assert_allclose(actual=pop, expected=native)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()


@pytest.mark.parametrize("idxs", index_indices)
@pytest.mark.parametrize("op", index_ops)
def test_index_put(op, idxs):
    """Write through every indexing pattern with every index value."""
    index_harness(op, idxs, is_index_put=True)


def test_index_put_scalar():
    """Assign a Python scalar through an index put."""

    def op(t, idx, v):
        t[idx, idx] = v
        return t

    # For each element e in t[0, 0], e = 0
    index_harness(op, [[0]], is_index_put=True, v=0)


def test_index_put_broadcastable():
    """Assign a value that has to be broadcast to the indexed region."""
    zero_row = torch.zeros(5)
    # For each row r in t[0, 0], r = [0, 0, 0, 0, 0]
    index_harness(index_op1, [[0]], is_index_put=True, v=zero_row)


@pytest.mark.parametrize("mask_size, dtype", [
    (1, torch.bool),
    (2, torch.uint8),
    (3, torch.bool),
    (4, torch.uint8),
])
def test_index_put_masked_fill(mask_size, dtype):
    """Fill the masked elements with a single-element tensor, using masks
    that cover the first ``mask_size`` dimensions."""
    torch.manual_seed(42)
    full_shape = [2, 3, 4, 5]
    random_mask = torch.rand(full_shape[:mask_size]) > 0.5
    mask = random_mask.type(dtype)
    fill_value = torch.tensor([0.])
    index_harness(index_op0, mask, True, v=fill_value, is_mask=True)


@pytest.mark.parametrize("mask_size, dtype", [
    (1, torch.bool),
    (2, torch.uint8),
    (3, torch.bool),
    (4, torch.uint8),
])
def test_index_put_masked_assign(mask_size, dtype):
    """Assign a tensor shaped like the unmasked trailing dimensions to the
    masked elements."""
    torch.manual_seed(42)
    full_shape = [2, 3, 4, 5]
    mask = (torch.rand(full_shape[:mask_size]) > 0.5).type(dtype)
    v = torch.zeros(full_shape[mask_size:], dtype=torch.float32)
    if v.dim() == 0:
        # To avoid a size 0 tensor
        v = v.unsqueeze(0)
    index_harness(index_op0, mask, True, v=v, is_mask=True)


def get_index_fill_fn(dim):
    """Return an ``(t, idx, v)`` function applying ``index_fill_`` along
    ``dim``."""

    def index_fill(t, idx, v):
        # In-place fill; the modified tensor itself is returned.
        t.index_fill_(dim, idx, v)
        return t

    return index_fill


@pytest.mark.parametrize("value", (-1, torch.tensor(-1)))
@pytest.mark.parametrize("dim", [1, 2, 3])
def test_index_fill(value, dim):
    """Run ``index_fill_`` along several dims with a scalar and a tensor
    fill value."""
    torch.manual_seed(42)
    fill_along_dim = get_index_fill_fn(dim)
    index_harness(fill_along_dim, [0, 2], is_index_put=True, v=value)


@pytest.mark.parametrize("dim", range(-3, 3))
def test_index_select(dim):
    """Compare ``index_select`` along ``dim`` (negative dims included)
    between native PyTorch and PopTorch."""

    def op(src, index):
        return src.index_select(dim, index)

    torch.manual_seed(0)
    x = torch.randn(2, 4, 8)
    dim_size = x.shape[dim]
    # As many random indices as there are entries along `dim`.
    indices = torch.randint(dim_size, (dim_size, ))

    model = helpers.ModelWithWeights(op, x.shape)
    native_out, _ = model((x, indices))

    poptorch_model = poptorch.trainingModel(model)
    poptorch_out, _ = poptorch_model((x, indices))

    # Inference test - check outputs
    helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("TRACE")
@pytest.mark.parametrize("dim", [0, 1])
def test_vectorized_scatter(capfd, dim):
    """Index-assign whole rows or columns and check that the log reports the
    vectorized ScatterReduce path."""

    def op(out, index, src):
        if dim == 0:
            out[index, :] = src
        else:
            out[:, index] = src

        return out

    torch.manual_seed(0)
    N = 20
    out = torch.randn(N, 30)
    # Use fewer indices than there are entries along `dim`.
    sz = out.shape[dim] - N // 10
    indices = torch.randint(sz, (sz, ))
    # src matches the shape of the region selected by `indices`.
    src_sz = (sz, out.shape[1]) if dim == 0 else (out.shape[0], sz)
    src = torch.randn(src_sz)

    model = helpers.ModelWithWeights(op, out.shape)
    # Clone the tensor so that the original is unchanged by the in-place op
    native_out, _ = model((out.clone(), indices, src))

    poptorch_model = poptorch.trainingModel(model)
    poptorch_out, _ = poptorch_model((out.clone(), indices, src))

    # Inference test - check outputs
    helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()

    # The captured log must contain the vectorized scatter message.
    it = helpers.LogChecker(capfd).createIterator()
    it.findNext("Using vectorized ScatterReduce with none reduction")


================================================
FILE: tests/inplace_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import re
import torch
import torch.nn as nn
import pytest
import helpers
import poptorch


def test_inplace_add():
    """In-place add on a tensor input and on a list input must be reflected
    in the caller's tensors."""

    class Model(nn.Module):
        def forward(self, x):
            # A different increment per input type; nothing is returned.
            if isinstance(x, (tuple, list)):
                x[0] += 4
            elif isinstance(x, (dict)):
                x['input'] += 3
            else:
                x += 1

    poptorch_model = poptorch.inferenceModel(Model())
    tensor_in = torch.Tensor([1.0])
    assert poptorch_model(tensor_in) is None
    assert tensor_in == 2.0
    assert poptorch_model(tensor_in) is None
    assert tensor_in == 3.0
    # Running on a different tensor must leave tensor_in untouched.
    assert poptorch_model(torch.Tensor([1.0])) is None
    assert tensor_in == 3.0

    # We're changing the input type: must recompile
    poptorch_model.destroy()
    list_in = [torch.Tensor([1.0])]
    cpu_in = [torch.Tensor([1.0])]
    model = Model()
    for i in range(2):
        print(f"Run {i}")
        cpu_out = model(cpu_in)
        poptorch_out = poptorch_model(list_in)
        assert cpu_out == poptorch_out
        assert list_in == cpu_in


def test_inplace_add_multi_elements():
    """In-place add on the second input when the first input is a nested
    structure."""

    class Model(nn.Module):
        def forward(self, _x, y):
            y += 1

    poptorch_model = poptorch.inferenceModel(Model())
    # NOTE(review): the second element, (torch.Tensor([1.0])), is a
    # parenthesised tensor rather than a one-element tuple - confirm intent.
    nested_tuple_in = ((torch.Tensor([1.0]), torch.Tensor([1.0])),
                       (torch.Tensor([1.0])))
    tensor_in = torch.Tensor([1.0])

    assert poptorch_model(nested_tuple_in, tensor_in) is None
    assert tensor_in == 2.0


def test_inplace_sub():
    """In-place subtract on a tensor input and on a list input must be
    reflected in the caller's tensors."""

    class Model(nn.Module):
        def forward(self, x):
            # A different decrement per input type; nothing is returned.
            if isinstance(x, (tuple, list)):
                x[0] -= 3
            elif isinstance(x, (dict)):
                x['input'] -= 2
            else:
                x -= 1

    poptorch_model = poptorch.inferenceModel(Model())
    tensor_in = torch.Tensor([1.0])
    assert poptorch_model(tensor_in) is None
    assert tensor_in == 0.0
    assert poptorch_model(tensor_in) is None
    assert tensor_in == -1.0
    # Running on a different tensor must leave tensor_in untouched.
    assert poptorch_model(torch.Tensor([1.0])) is None
    assert tensor_in == -1.0

    # We're changing the input type: must recompile
    poptorch_model.destroy()
    list_in = [torch.Tensor([1.0])]
    cpu_in = [torch.Tensor([1.0])]
    model = Model()
    for i in range(2):
        print(f"Run {i}")
        cpu_out = model(cpu_in)
        poptorch_out = poptorch_model(list_in)
        assert cpu_out == poptorch_out
        assert list_in == cpu_in


def test_inplace_div():
    """In-place divide on a tensor input and on a list input must be
    reflected in the caller's tensors."""

    class Model(nn.Module):
        def forward(self, x):
            # A different divisor per input type; nothing is returned.
            if isinstance(x, (tuple, list)):
                x[0] /= 4
            elif isinstance(x, (dict)):
                x['input'] /= 3
            else:
                x /= 2

    poptorch_model = poptorch.inferenceModel(Model())
    tensor_in = torch.Tensor([1.0])
    assert poptorch_model(tensor_in) is None
    assert tensor_in == 0.5
    assert poptorch_model(tensor_in) is None
    assert tensor_in == 0.25
    # Running on a different tensor must leave tensor_in untouched.
    assert poptorch_model(torch.Tensor([1.0])) is None
    assert tensor_in == 0.25

    # We're changing the input type: must recompile
    poptorch_model.destroy()
    list_in = [torch.Tensor([1.0])]
    cpu_in = [torch.Tensor([1.0])]
    model = Model()
    for i in range(2):
        print(f"Run {i}")
        cpu_out = model(cpu_in)
        poptorch_out = poptorch_model(list_in)
        assert cpu_out == poptorch_out
        assert list_in == cpu_in


def test_inplace_mul():
    """In-place multiply on a tensor input and on a list input must be
    reflected in the caller's tensors."""

    class Model(nn.Module):
        def forward(self, x):
            # A different factor per input type; nothing is returned.
            if isinstance(x, (tuple, list)):
                x[0] *= 4
            elif isinstance(x, (dict)):
                x['input'] *= 3
            else:
                x *= 2

    poptorch_model = poptorch.inferenceModel(Model())
    tensor_in = torch.Tensor([1.0])
    assert poptorch_model(tensor_in) is None
    assert tensor_in == 2.0
    assert poptorch_model(tensor_in) is None
    assert tensor_in == 4.0
    # Running on a different tensor must leave tensor_in untouched.
    assert poptorch_model(torch.Tensor([1.0])) is None
    assert tensor_in == 4.0

    # We're changing the input type: must recompile
    poptorch_model.destroy()
    list_in = [torch.Tensor([1.0])]
    cpu_in = [torch.Tensor([1.0])]
    model = Model()
    for i in range(2):
        print(f"Run {i}")
        cpu_out = model(cpu_in)
        poptorch_out = poptorch_model(list_in)
        assert cpu_out == poptorch_out
        assert list_in == cpu_in


def test_inplace_masked_fill():
    """In-place ``masked_fill_`` on the input must update the caller's
    tensor."""

    class Model(nn.Module):
        def forward(self, x):
            x.masked_fill_(x > 0.5, 1.0)

    poptorch_model = poptorch.inferenceModel(Model())
    x = torch.tensor([[0, 0.7], [0.2, 3.5]])
    poptorch_model(x)

    # Elements above 0.5 become 1.0; the others keep their value.
    expected = ((0, 1.0), (0.2, 1.0))
    for row, expected_row in enumerate(expected):
        for col, expected_value in enumerate(expected_row):
            assert x[row][col] == expected_value


def test_chained_inplace():
    """Several consecutive in-place adds on the same input."""

    class Model(nn.Module):
        def forward(self, x, y):
            x += y
            x += 2.0
            x += y

    model = Model()
    t1 = torch.tensor([1.])
    cpu_t1 = torch.tensor([1.])
    t2 = torch.tensor([2.])
    poptorch_model = poptorch.inferenceModel(model)
    out = model(cpu_t1, t2)
    assert out is None
    out = poptorch_model(t1, t2)
    assert out is None
    # 1 + 2 + 2 + 2 == 7 for both the native and the PopTorch run.
    assert cpu_t1 == 7.0
    assert t1 == 7.0


def test_inplace_zero():
    """In-place ``zero_`` followed by an in-place add on the input."""

    class Model(nn.Module):
        def forward(self, x):
            # (Simply setting it to zero gets pruned by PopART)
            a = torch.sum(x)
            x.zero_()
            x += a

    poptorch_model = poptorch.inferenceModel(Model())
    x = torch.tensor([[0, 0.5], [0.25, 2.0]])
    poptorch_model(x)

    # Every element ends up as 0 + sum(x) == 2.75.
    for row in range(2):
        for col in range(2):
            assert x[row][col] == 2.75


def test_inplace_fill():
    """In-place ``fill_`` followed by an in-place add on the input."""

    class Model(nn.Module):
        def forward(self, x):
            a = torch.sum(x)
            x.fill_(1.0)
            x += a

    poptorch_model = poptorch.inferenceModel(Model())
    x = torch.tensor([[0, 0.5], [0.25, 2.0]])
    poptorch_model(x)

    # Every element ends up as 1 + sum(x) == 3.75.
    for row in range(2):
        for col in range(2):
            assert x[row][col] == 3.75


def test_inplace_non_input():
    """An in-place op on an intermediate tensor must not modify the input."""

    class Model(nn.Module):
        def forward(self, x):
            a = x + 1
            a += 1
            return a

    poptorch_model = poptorch.inferenceModel(Model())
    x = torch.tensor([[0, 0.5], [0.25, 2.0]])

    y = poptorch_model(x)

    # The input keeps its original values ...
    expected_x = ((0, 0.5), (0.25, 2.0))
    for row, expected_row in enumerate(expected_x):
        for col, expected_value in enumerate(expected_row):
            assert x[row][col] == expected_value

    # ... and the output is the input plus two.
    expected_y = ((2, 2.5), (2.25, 4.0))
    for row, expected_row in enumerate(expected_y):
        for col, expected_value in enumerate(expected_row):
            assert y[row][col] == expected_value


def test_double_underscore():
    """Bitwise and on int tensors must give the same result natively and in
    PopTorch."""

    # This tests aten::__and__ is not treated as inplace

    class Model(nn.Module):
        def forward(self, x, l):

            return x[0].int() & l.int()

    model = Model()
    poptorch_model = poptorch.inferenceModel(model)
    inp, l = torch.rand(10, 10), torch.LongTensor([10])

    out = model(inp, l)
    popout = poptorch_model(inp, l)

    helpers.assert_allclose(actual=popout, expected=out)


def test_half_buffer_inplace():
    """In-place update of a float16 buffer inside an inference model."""

    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer('buff', torch.ones(5, dtype=torch.float16))

        def forward(self, x):
            # pylint: disable=no-member
            # The output uses the buffer value from before the increment.
            out = x + self.buff
            self.buff += 1
            return out

    model = Model()
    poptorch_model = poptorch.inferenceModel(model)

    x = torch.tensor([0.1, 0.2, 0.3, 0.4, 0.5], dtype=torch.float16)
    out = poptorch_model(x)

    helpers.assert_allclose(actual=out,
                            expected=torch.tensor([1.1, 1.2, 1.3, 1.4, 1.5],
                                                  dtype=torch.float16))
    # After copying the weights back, the buffer must hold the incremented
    # value.
    poptorch_model.copyWeightsToHost()
    helpers.assert_allclose(actual=poptorch_model.buff,
                            expected=torch.tensor([2.0, 2.0, 2.0, 2.0, 2.0],
                                                  dtype=torch.float16))


def test_float_to_half_buffer_inplace_with_training():
    """In-place buffer update in a training model converted with ``half()``,
    compared against the native run."""
    torch.manual_seed(42)

    # pylint: disable=attribute-defined-outside-init
    class Model(nn.Module):
        def __init__(self):
            super().__init__()

            # need at least one parameter for a training model
            self.param = nn.Parameter(torch.ones(5, 5))

            self.register_buffer("buff", torch.ones(5))
            self.loss = nn.MSELoss()

        def forward(self, x):
            # pylint: disable=no-member
            out = self.buff + self.param
            self.buff += 1
            return out, self.loss(out, x)

    model = Model().train().half()
    poptorch_model = poptorch.trainingModel(model)

    x = torch.rand(5, 5).half()
    native_out, native_loss = model(x)

    # Reset buff
    # NOTE(review): buff is registered as ones(5) and converted by half(),
    # but is reset here to a float32 ones(5, 5) - confirm this is intended.
    model.buff = torch.ones(5, 5)

    poptorch_out, poptorch_loss = poptorch_model(x)

    helpers.assert_allclose(actual=native_out, expected=poptorch_out)
    helpers.assert_allclose(actual=native_loss, expected=poptorch_loss)


def test_inplace_on_buffer_and_input():
    """In-place ops applied to both the model input and a registered
    buffer in the same forward pass."""
    fill_value = 3
    shape = (1, 2)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer("buffer", torch.ones(shape))

        def forward(self, x):
            # Perform inplace ops on both the input and our buffer.
            x.fill_(fill_value)

            buffer_update = self.buffer + x
            self.buffer.copy_(buffer_update)

            return self.buffer, x

    model = poptorch.inferenceModel(Model())

    buf, out = model(torch.ones(shape))

    # Input: filled with fill_value; buffer: initial ones + filled input.
    expected_out = torch.full(shape, fill_value)
    expected_buf = expected_out + 1

    helpers.assert_allequal(actual=out, expected=expected_out)
    helpers.assert_allequal(actual=buf, expected=expected_buf)


def test_two_inplace_copies():
    """Two successive ``copy_`` calls into the model input; the second one
    must win."""
    fill_value = 3
    shape = (1, 2)

    class Model(torch.nn.Module):
        def forward(self, x):
            res = torch.full(shape, fill_value)
            x.copy_(res)

            # Do a second `copy_` to our input.
            res += 3
            x.copy_(res)

            return x

    model = poptorch.inferenceModel(Model())

    out = model(torch.ones(shape))

    # Result of the second copy: fill_value + 3 everywhere.
    expected_out = torch.full(shape, fill_value) + 3

    helpers.assert_allequal(actual=out, expected=expected_out)


def test_two_inplace_copies_buffer():
    """Two successive ``copy_`` calls into a registered buffer; the second
    one must win."""
    fill_value = 3
    shape = (1, 2)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer("buffer", torch.ones(shape))

        def forward(self, x):
            x.fill_(fill_value)

            buffer_update = self.buffer + x
            self.buffer.copy_(buffer_update)

            # Do a second `copy_` to our buffer.
            buffer_update += 5
            self.buffer.copy_(buffer_update)

            return self.buffer, x

    model = poptorch.inferenceModel(Model())

    buf, out = model(torch.ones(shape))

    # Buffer: initial ones + filled input + 5 == fill_value + 6.
    expected_out = torch.full(shape, fill_value)
    expected_buf = expected_out + 6

    helpers.assert_allequal(actual=out, expected=expected_out)
    helpers.assert_allequal(actual=buf, expected=expected_buf)


def direct_assign(x, step):
    """Assign a zeroed copy of rows ``0:2:step`` back into ``x`` and return ``x``.

    ``step`` is a tensor holding the slice step; ``x`` is modified in place.
    """
    rows = slice(0, 2, step.item())
    x[rows] = x[rows] * 0
    return x


def direct_assign_inplace(x, step):
    """Multiply rows ``0:2:step`` of ``x`` by zero in place and return ``x``.

    ``step`` is a tensor holding the slice step.
    """
    rows = slice(0, 2, step.item())
    x[rows] *= 0
    return x


def direct_fill(x, step):
    """Fill rows ``0:2:step`` of ``x`` with the scalar zero and return ``x``.

    ``step`` is a tensor holding the slice step; ``x`` is modified in place.
    """
    rows = slice(0, 2, step.item())
    x[rows] = 0
    return x


# Slicing entire dimensions lowers to slice(slice(x))
def chained_slice(x, step):
    """Zero columns ``:2:step`` of every row of ``x`` in place and return ``x``.

    The first dimension is sliced in full and the second one partially, then
    the resulting view is multiplied by zero with ``mul_``.
    """
    column_view = x[:, :2:step.item()]
    column_view.mul_(0)
    return x


def modify_before_assign(x, step):
    """Double ``x`` in place, then zero rows ``0:2:step`` and return ``x``.

    ``step`` is a tensor holding the slice step.
    """
    x *= 2
    rows = slice(0, 2, step.item())
    x[rows] = x[rows] * 0
    return x


def modify_region(x, step):
    """Add one in place to rows ``1:x.shape[0]:step`` of ``x`` and return ``x``.

    ``step`` is a tensor holding the slice step; row 0 is never touched.
    """
    rows = slice(1, x.shape[0], step.item())
    x[rows, :] += 1
    return x


@pytest.mark.parametrize("step_size", [1, 2])
@pytest.mark.parametrize("op", [
    direct_assign, direct_assign_inplace, direct_fill, chained_slice,
    modify_before_assign, modify_region
])
def test_inplace_modify_slice(op, step_size):
    """Run each slice-modifying ``op`` through a poptorch inference model.

    With a step of 1 the poptorch output and the in-place update of the input
    must match the native model. With any other step the model is only
    compiled, and a ``poptorch.Error`` (if raised) must carry the
    "not supported" message.
    """
    t = torch.rand(4, 4)
    step = torch.tensor(step_size)

    class Model(torch.nn.Module):
        pass

    # The forward is attached afterwards so it can close over `op` and `step`.
    Model.forward = lambda _, x: op(x, step)

    cpu_model = Model()
    ipu_model = poptorch.inferenceModel(cpu_model)

    if step_size != 1:
        unsupported_step = (r"In\-place modification of slices with step "
                            r"size other than 1 is not supported\.")
        # NOTE(review): if compile() raises nothing this branch passes
        # silently - confirm whether that leniency is intended.
        try:
            ipu_model.compile(t)
        except poptorch.Error as err:
            assert re.match(unsupported_step, err.message)
        return

    ipu_input = t.clone()
    cpu_input = t.clone()
    ipu_result = ipu_model(ipu_input)
    cpu_result = cpu_model(cpu_input)
    # Ensure outputs match
    helpers.assert_allclose(actual=ipu_result, expected=cpu_result)
    # Ensure that any inplace modification of graph inputs is
    # correctly reflected
    helpers.assert_allclose(actual=ipu_input, expected=cpu_input)


def test_inplace_modify_select():
    """In-place updates through slice and integer indexing match native PyTorch."""
    dims = (3, 4, 2)

    first = torch.randint(55, dims)
    second = torch.randint(66, dims)
    third = torch.randint(77, dims)

    class ModelWrapper(torch.nn.Module):
        def forward(self, tensorA, tensorB, tensorC):
            # Rebind to a fresh tensor so the caller's input is not modified.
            tensorA = tensorA - tensorB

            # In-place adds through progressively deeper indexing.
            tensorA[0:1] += tensorC[1]
            tensorA[0] += tensorC[0]
            tensorA[1][2] += tensorC[2][1]
            tensorA[1][3][1] += tensorC[2][3][0]

            return tensorA

    native_model = ModelWrapper()
    cpu_out = native_model(first, second, third)

    ipu_out = poptorch.inferenceModel(native_model)(first, second, third)

    helpers.assert_allclose(actual=ipu_out, expected=cpu_out)


def test_index_put_on_buffer():
    """Assigning into a registered buffer via a tensor index matches native."""

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer(
                "p",
                torch.arange(6, dtype=torch.float).reshape(2, 3))

        def forward(self, x, idx):
            self.p[(idx, )] = x
            return self.p

    # Separate instances so each side starts from an untouched buffer.
    cpu_model = Model()
    ipu_model = poptorch.inferenceModel(Model())

    values = torch.empty(3).fill_(-1)
    row = torch.tensor([0])
    cpu_out = cpu_model(values, row)
    ipu_out = ipu_model(values, row)
    helpers.assert_allclose(actual=ipu_out, expected=cpu_out)


================================================
FILE: tests/inputs_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import collections
import re
import torch
import torch.nn as nn
import pytest
import helpers
import poptorch


@pytest.mark.parametrize("use_half", [True, False])
def test_simple_tuple(use_half):
    """A tuple of two tensors is accepted as a single model input."""

    class SimpleAdder(nn.Module):
        def forward(self, t):
            assert isinstance(t, tuple)
            lhs, rhs = t
            assert isinstance(lhs, torch.Tensor)
            assert isinstance(rhs, torch.Tensor)
            return lhs + rhs

    model = SimpleAdder()
    inference_model = poptorch.inferenceModel(model)

    operands = [torch.tensor([1.]), torch.tensor([2.])]

    if use_half:
        model.half()
        operands = [operand.half() for operand in operands]

    pair = tuple(operands)
    # Run more than once
    for _ in range(2):
        assert inference_model(pair).float() == 3.0


def test_type_change():
    """Changing the input dtype is rejected until the model is destroyed."""

    class SimpleAdder(nn.Module):
        def forward(self, t):
            assert isinstance(t, tuple)
            lhs, rhs = t
            assert isinstance(lhs, torch.Tensor)
            assert isinstance(rhs, torch.Tensor)
            return lhs + rhs

    inference_model = poptorch.inferenceModel(SimpleAdder())

    float_pair = (torch.tensor([1.]), torch.tensor([2.]))
    # Run more than once
    for _ in range(2):
        assert inference_model(float_pair).float() == 3.0

    # Same structure, but integer tensors instead of floats.
    int_pair = (torch.tensor([1]), torch.tensor([2]))
    expected_error = ".*expected torch.float32 but got torch.int64.*"
    with pytest.raises(poptorch.Error, match=expected_error):
        assert inference_model(int_pair).float() == 3

    # After destroy() the model accepts the new dtype.
    inference_model.destroy()
    assert inference_model(int_pair).float() == 3


def test_shape_change():
    """Changing the input shape is rejected until the model is destroyed."""

    class SimpleAdder(nn.Module):
        def forward(self, t):
            assert isinstance(t, tuple)
            lhs, rhs = t
            assert isinstance(lhs, torch.Tensor)
            assert isinstance(rhs, torch.Tensor)
            return lhs + rhs

    model = SimpleAdder()
    inference_model = poptorch.inferenceModel(model)

    short_pair = (torch.tensor([1.]), torch.tensor([2.]))
    # Run more than once
    for _ in range(2):
        assert inference_model(short_pair).float() == 3.0

    # Same dtype, but two elements per tensor instead of one.
    long_pair = (torch.tensor([1., 1.]), torch.tensor([2., 2.]))
    expected_error = re.escape(
        "expected torch.Size([1]) but got torch.Size([2])")
    with pytest.raises(poptorch.Error, match=expected_error):
        assert inference_model(long_pair).float() == 3

    # After destroy() the model accepts the new shape.
    inference_model.destroy()
    native_out = model(long_pair)
    for i in range(2):
        print(f"Run {i}")
        poptorch_out = inference_model(long_pair)
        helpers.assert_allclose(expected=native_out, actual=poptorch_out)


@pytest.mark.parametrize("use_half", [True, False])
@pytest.mark.parametrize("thing_to_test", ['List', 'Tuple', 'Mixed'])
def test_nested_tuples_and_lists(use_half, thing_to_test):
    """Nested lists, tuples and mixtures of both are accepted as inputs."""

    class SimpleAdder(nn.Module):
        def forward(self, tpl1, t2, tpl34567):
            (t1, ) = tpl1
            (t3, (t4, t5), _) = tpl34567
            (t6, _) = tpl34567[2]
            t7 = tpl34567[2][1]

            for tensor in (t1, t2, t3, t4, t5, t6, t7):
                assert isinstance(tensor, torch.Tensor)

            return t1 + t2 + t3 + t4 + t5 + t6 + t7

    model = SimpleAdder()
    inference_model = poptorch.inferenceModel(model)

    t1 = torch.tensor([1.])
    t2 = torch.tensor([2.])
    t3 = torch.tensor([3.])
    t4 = torch.tensor([4.], dtype=torch.float64)
    t5 = torch.tensor([5.])
    t6 = torch.tensor([6.])
    t7 = torch.tensor([7.], dtype=torch.float64)

    if use_half:
        model.half()
        t1, t2, t3, t4, t5, t6, t7 = (tensor.half()
                                      for tensor in (t1, t2, t3, t4, t5, t6,
                                                     t7))

    # Run more than once
    for i in range(2):
        print(f"Run {i}")
        # Containers are rebuilt on every run, one flavour per parameter.
        if thing_to_test == "List":
            head = [t1]
            tail = [t3, [t4, t5], [t6, t7]]
        elif thing_to_test == "Tuple":
            head = (t1, )
            tail = (t3, (t4, t5), (t6, t7))
        else:
            head = [t1]
            tail = [t3, (t4, t5), [t6, t7]]
        assert inference_model(head, t2, tail).float() == 28.0


@pytest.mark.parametrize("use_half", [True, False])
def test_optional_inputs(use_half):
    """Tensor-valued default arguments can be omitted or overridden by keyword."""
    dtype = torch.float32
    if use_half:
        dtype = torch.float16

    class SimpleAdder(nn.Module):
        def forward(self,
                    t1,
                    t2,
                    t3=torch.ones(1, dtype=dtype),
                    t4=torch.zeros(1, dtype=dtype)):
            return t1 * t3 + t2 * t4

    model = SimpleAdder()
    inference_model = poptorch.inferenceModel(model)

    first = torch.tensor([1.])
    second = torch.tensor([2.])
    fourth = torch.tensor([4.])

    if use_half:
        model.half()
        first, second, fourth = first.half(), second.half(), fourth.half()

    # Defaults only: 1 * 1 + 2 * 0.
    assert inference_model(first, second).float() == 1.0
    # Overriding t4: 1 * 1 + 2 * 4, positionally and fully by keyword.
    assert inference_model(first, second, t4=fourth).float() == 9.0
    assert inference_model(t4=fourth, t1=first, t2=second).float() == 9.0


def test_non_tensor_inputs_dispatch():
    """Scalar and optional-tensor arguments give the same result as native."""

    class Model(nn.Module):
        def forward(
                self,
                t1,
                scalar=2,
                t2_opt=None,
        ):
            if t2_opt is None:
                return t1 * scalar
            return t2_opt * scalar + t1 * scalar

    model = Model()

    t1 = torch.tensor([3.])
    scalar = 4
    t2 = torch.tensor([5.])

    # (positional args, keyword args) for each call pattern under test.
    call_patterns = [
        ((t1, ), {}),
        ((t1, scalar), {}),
        ((t1, scalar, t2), {}),
        ((t1, ), {"t2_opt": t2}),
    ]
    for args, kwargs in call_patterns:
        # A fresh poptorch wrapper is created for every call pattern.
        ipu = poptorch.inferenceModel(model)(*args, **kwargs)
        cpu = model(*args, **kwargs)
        helpers.assert_allclose(expected=cpu, actual=ipu)


@pytest.mark.parametrize("use_half", [True, False])
def test_list_inputs(use_half):
    """A list built and modified inside ``forward`` is returned correctly."""

    class SimpleAdder(nn.Module):
        def forward(self, t1, t2, x):
            items = [t1, t2]
            x = items[0] + x
            items[1] = x
            return items

    model = SimpleAdder()
    inference_model = poptorch.inferenceModel(model)

    inputs = [torch.tensor([1.]), torch.tensor([2.]), torch.tensor([4.])]

    if use_half:
        model.half()
        inputs = [tensor.half() for tensor in inputs]

    expected = [torch.tensor([1.0]), torch.tensor([5.0])]

    # The first call is followed by three more to check the fast path works.
    for _ in range(4):
        outputs = inference_model(*inputs)
        assert [out.float() for out in outputs] == expected


def test_unused_tuple():
    """A tuple argument that ``forward`` never reads can still be passed."""

    class SimpleAdder(nn.Module):
        def forward(self, x, y, z):  # pylint: disable=unused-argument
            return x + y

    inference_model = poptorch.inferenceModel(SimpleAdder())
    lhs = torch.tensor([1.])
    rhs = torch.tensor([2.])
    ignored = (torch.tensor([1.]), torch.tensor([1.]))
    # Run more than once
    for i in range(2):
        print(f"Run {i}")
        inference_model(lhs, rhs, ignored)


def test_dict_input():
    """Dict inputs: key order is irrelevant, missing or extra keys raise."""

    class DictDivider(nn.Module):
        def forward(self, d):  # pylint: disable=unused-argument
            return d['x'] / d['y']

    model = DictDivider()
    reference_input = {'x': torch.tensor([1.]), 'y': torch.tensor([2.])}
    native_out = model(reference_input)
    inference_model = poptorch.inferenceModel(model)

    # Same content with the keys inserted in the opposite order.
    reordered_input = {'y': torch.tensor([2.]), 'x': torch.tensor([1.])}
    for valid_input in (reference_input, reordered_input):
        poptorch_out = inference_model(valid_input)
        helpers.assert_allclose(expected=native_out, actual=poptorch_out)

    # Missing argument
    with pytest.raises(poptorch.Error, match="Missing arguments: x."):
        inference_model({'y': torch.tensor([2.])})

    # Extra argument
    extra_key_input = {
        'x': torch.tensor([1.]),
        'y': torch.tensor([2.]),
        'z': torch.tensor([3.])
    }
    with pytest.raises(poptorch.Error, match="Unexpected arguments: z."):
        inference_model(extra_key_input)


def test_nested_dict_input():
    """A list holding a nested dict and a tensor is accepted as an input."""

    class DictAdder(nn.Module):
        def forward(self, d):  # pylint: disable=unused-argument
            inner = d[0]['d']
            return inner['x'] + inner['y'] + d[1]

    model = DictAdder()
    nested = {'d': {'x': torch.tensor([1.]), 'y': torch.tensor([2.])}}
    structured_input = [nested, torch.tensor([3.])]
    native_out = model(structured_input)
    inference_model = poptorch.inferenceModel(model)

    # Run more than once
    for i in range(2):
        print(f"Run {i}")
        poptorch_out = inference_model(structured_input)
        helpers.assert_allclose(expected=native_out, actual=poptorch_out)


@pytest.mark.parametrize("fwd_args", [True, False])
def test_ordered_dict_inputs(fwd_args):
    """Types derived from OrderedDict must work without a custom parser."""

    class MyArg(collections.OrderedDict):
        def print(self):
            return str(self)

    class Model(torch.nn.Module):
        def forward(self, args):
            assert isinstance(args, MyArg)
            return args["a"] * 2 + poptorch.ipu_print_tensor(args["b"])

    class ModelWrapper(Model):
        # Forwards everything untouched through *args / **kwargs.
        def forward(self, *args, **kwargs):
            print(len(args))
            return super().forward(*args, **kwargs)

    model = ModelWrapper() if fwd_args else Model()
    poptorch_model = poptorch.inferenceModel(model)

    for i in range(2):
        print(f"Run {i}")
        # Fresh random contents each run; "b" is inserted before "a".
        container = MyArg()
        container["b"] = torch.randn(2, 2)
        container["a"] = torch.randn(2, 2)
        native_out = model(container)
        poptorch_out = poptorch_model(container)
        helpers.assert_allclose(expected=native_out, actual=poptorch_out)


@pytest.mark.parametrize("device_iterations", [1, 3])
def test_custom_input(device_iterations):
    """A user-defined input type works once a custom arg parser is registered."""
    batch_size = 2
    combined_batch_size = device_iterations * batch_size

    class MyArg:
        """Plain attribute bag: every keyword becomes an attribute."""

        def __init__(self, **kwargs):
            for name, value in kwargs.items():
                setattr(self, name, value)

    class MyArgParser(poptorch.ICustomArgParser):
        def yieldTensors(self, struct):
            # Attributes are visited in sorted name order; only tensors are
            # yielded.
            attributes = vars(struct)
            for name in sorted(attributes):
                candidate = attributes[name]
                if isinstance(candidate, torch.Tensor):
                    yield candidate

        def reconstruct(self, structure, tensor_iterator):
            # Same sorted order as yieldTensors, one tensor per attribute.
            rebuilt = {
                name: next(tensor_iterator)
                for name in sorted(vars(structure))
            }
            return MyArg(**rebuilt)

    poptorch.registerCustomArgParser(MyArg, MyArgParser())

    class SimpleAdder(torch.nn.Module):
        def forward(self, custom_input):
            assert custom_input.tensor.shape[0] == batch_size
            custom_input.result = custom_input.tensor + custom_input.tensor
            return custom_input

    adder_model = SimpleAdder()
    adder_model.eval()
    opts = poptorch.Options()
    opts.deviceIterations(device_iterations=device_iterations)

    ipu_adder_model = poptorch.inferenceModel(adder_model, opts)

    full_shape = (combined_batch_size, 1)
    for i in range(4):
        batch = torch.full(full_shape, i)
        result = ipu_adder_model(MyArg(tensor=batch))
        assert torch.equal(result.tensor, batch)
        assert torch.equal(result.result, torch.full(full_shape, i + i))


# Seed the global RNG so the random module-level tensors below are
# reproducible. The order of the randn calls determines each tensor's values.
torch.manual_seed(42)
# NOTE(review): `ones` does not appear to be referenced in the visible part of
# this file - confirm whether it is still needed.
ones = torch.ones(5, 5)
# Shared 5x5 inputs used by the `test_none_input_*` tests below.
x = torch.randn(5, 5)
y = torch.randn(5, 5)
z = torch.randn(5, 5)
t = torch.randn(5, 5)


class Model(torch.nn.Module):
    """Fold optional tensors into ``x`` and return ``tanh`` of the result.

    Each of ``y``, ``z`` and ``t`` that is not ``None`` is added to the running
    value, which is then scaled by 3, 4 and 5 respectively.
    """

    def forward(self, x, y=None, z=None, t=None):
        acc = x
        for extra, scale in ((y, 3), (z, 4), (t, 5)):
            if extra is not None:
                acc = torch.add(acc, extra) * scale
        return torch.tanh(acc)


def test_none_input_pass_one_kwarg():
    """``None`` passed as the single keyword argument ``t``."""
    native_model = Model()
    poptorch_model = poptorch.inferenceModel(native_model)

    native_out = native_model(x, y, z, t=None)
    # Run more than once
    for i in range(2):
        print(f"Run {i}")
        helpers.assert_allclose(expected=native_out,
                                actual=poptorch_model(x, y, z, t=None))


def test_none_input_pass_two_kwarg():
    """``None`` passed for both keyword arguments ``z`` and ``t``."""
    native_model = Model()
    poptorch_model = poptorch.inferenceModel(native_model)

    native_out = native_model(x, y, z=None, t=None)
    # Run more than once
    for i in range(2):
        print(f"Run {i}")
        helpers.assert_allclose(expected=native_out,
                                actual=poptorch_model(x, y, z=None, t=None))


def test_none_input_pass_skip_one_kwarg():
    """``None`` passed for keyword ``z`` while ``t`` is left at its default."""
    native_model = Model()
    poptorch_model = poptorch.inferenceModel(native_model)

    native_out = native_model(x, y, z=None)
    # Run more than once
    for i in range(2):
        print(f"Run {i}")
        helpers.assert_allclose(expected=native_out,
                                actual=poptorch_model(x, y, z=None))


def test_none_input_trace_dispatch_non_default_kwarg():
    """``None`` passed for keyword ``y`` with ``z`` and ``t`` left at defaults."""
    native_model = Model()
    poptorch_model = poptorch.inferenceModel(native_model)

    native_out = native_model(x, y=None)
    # Run more than once
    for i in range(2):
        print(f"Run {i}")
        helpers.assert_allclose(expected=native_out,
                                actual=poptorch_model(x, y=None))


def test_none_input_pass_last_arg():
    """``None`` passed positionally as the last argument."""
    native_model = Model()
    poptorch_model = poptorch.inferenceModel(native_model)

    native_out = native_model(x, y, z, None)
    # Run more than once
    for i in range(2):
        print(f"Run {i}")
        helpers.assert_allclose(expected=native_out,
                                actual=poptorch_model(x, y, z, None))


def test_none_input_pass_two_arg():
    """``None`` passed positionally for the last two arguments."""
    native_model = Model()
    poptorch_model = poptorch.inferenceModel(native_model)

    native_out = native_model(x, y, None, None)
    # Run more than once
    for i in range(2):
        print(f"Run {i}")
        helpers.assert_allclose(expected=native_out,
                                actual=poptorch_model(x, y, None, None))


@pytest.mark.parametrize("args", [(x, None, None, None), (x, ), (x, None)])
@pytest.mark.parametrize("fwd_args", [True, False])
def test_none_input_dispatch_non_default_arg_tuples(args, fwd_args):
    """Positional ``None`` arguments, with and without an ``*args`` wrapper."""

    class ModelWrapper(Model):
        def forward(self, *args, **kwargs):  # pylint: disable=signature-differs
            return super().forward(*args, **kwargs)

    native_model = ModelWrapper() if fwd_args else Model()
    poptorch_model = poptorch.inferenceModel(native_model)

    native_out = native_model(*args)
    # Run more than once
    for i in range(2):
        print(f"Run {i}")
        helpers.assert_allclose(expected=native_out,
                                actual=poptorch_model(*args))


@pytest.mark.parametrize("args", [{
    "x": x,
    "t": t
}, {
    "z": z,
    "t": None,
    "x": x
}])
@pytest.mark.parametrize("fwd_args", [True, False])
def test_none_input_dispatch_non_default_arg_dict(args, fwd_args):
    """Keyword-only calls, with and without a ``**kwargs`` wrapper."""

    class ModelWrapper(Model):
        def forward(self, *args, **kwargs):  # pylint: disable=signature-differs
            return super().forward(*args, **kwargs)

    native_model = ModelWrapper() if fwd_args else Model()
    poptorch_model = poptorch.inferenceModel(native_model)

    native_out = native_model(**args)
    # Run more than once
    for i in range(2):
        print(f"Run {i}")
        helpers.assert_allclose(expected=native_out,
                                actual=poptorch_model(**args))


@pytest.mark.parametrize("fwd_args", [True, False])
def test_custom_arg_parser(fwd_args):
    """Custom input type in, OrderedDict-derived output type out."""

    class MyArg:
        def __init__(self, a, b):
            self.a = a
            self.b = b

    class MyParser(poptorch.ICustomArgParser):
        def yieldTensors(self, struct) -> None:
            yield struct.a
            yield struct.b

        def reconstruct(self, _original_structure, tensor_iterator):
            # Tensors come back in the order yieldTensors produced them.
            first = next(tensor_iterator)
            second = next(tensor_iterator)
            return MyArg(first, second)

    class OutputContainer(collections.OrderedDict):
        def print(self):
            return str(self)

    poptorch.registerCustomArgParser(MyArg, MyParser())

    class Model(torch.nn.Module):
        def forward(self, args):
            # Make sure to use a poptorch specific op
            # to check the graph is not empty or running on the CPU
            out = OutputContainer()
            out["sum"] = args.a + poptorch.ipu_print_tensor(args.b)
            out["a"] = args.a
            return out

    class ModelWrapper(Model):
        def forward(self, *args, **kwargs):
            print(len(args))
            return super().forward(*args, **kwargs)

    model = ModelWrapper() if fwd_args else Model()
    poptorch_model = poptorch.inferenceModel(model)

    # This instance is replaced inside the loop; it is kept so the random
    # number stream seen by the loop is unchanged.
    args = MyArg(torch.randn(2, 2), torch.randn(2, 2))
    for i in range(2):
        print(f"Run {i}")
        args = MyArg(torch.randn(2, 2), torch.randn(2, 2))
        native_out = model(args)
        poptorch_out = poptorch_model(args)
        # Make sure we get an OutputContainer and the elements are in the same order
        for container in (native_out, poptorch_out):
            assert isinstance(container, OutputContainer)
        print(native_out.print())
        print(poptorch_out.print())
        for native_key, poptorch_key in zip(native_out, poptorch_out):
            assert native_key == poptorch_key
            helpers.assert_allclose(expected=native_out[native_key],
                                    actual=poptorch_out[poptorch_key])


@pytest.mark.parametrize("fwd_args", [True, False])
def test_none_input_dispatch_args_kwargs(fwd_args):
    """Exercise a forward() mixing positional, *args, keyword and **kwargs.

    The poptorch model is run three times, separated by ``destroy()`` calls,
    each time with a different set of arguments. In between, calls whose
    arguments differ from the ones first used are expected to raise.
    ``fwd_args`` selects whether the model is wrapped in a subclass that
    forwards everything through ``*args, **kwargs``; some expected errors
    differ between the two variants.
    """

    class Model(torch.nn.Module):
        def forward(self, a, b, *c, y=None, z=None, t=None, u=3, v="op", **w):
            # The result depends on the length of the string `v` and on the
            # number of extra keyword arguments collected in `w`.
            r = len(v) * b + a * len(w)
            # Each extra positional tensor is weighted by its 1-based position.
            for i, x in enumerate(c):
                r += (i + 1) * x
            if y is not None:
                r = torch.add(r, y) * 3
            if z is not None:
                r = torch.add(r, z) * 4
            if t is not None:
                r = torch.add(r, t) * 5
            return u * r

    class ModelWrapper(Model):
        def forward(self, *args, **kwargs):
            print(len(args))
            return super().forward(*args, **kwargs)

    if fwd_args:
        model = ModelWrapper()
    else:
        model = Model()
    poptorch_model = poptorch.inferenceModel(model)

    # `a` is a plain Python int; every other input is a 2x2 random tensor.
    a = 2
    b = torch.randn(2, 2)
    c = torch.randn(2, 2)
    d = torch.randn(2, 2)
    e = torch.randn(2, 2)
    t = torch.randn(2, 2)
    x = torch.randn(2, 2)
    m = torch.randn(2, 2)
    z = torch.randn(2, 2)

    # First configuration: c, d, e land in *c; x and m land in **w.
    native_out = model(a, b, c, d, e, t=t, x=x, m=m, z=z)
    for i in range(2):
        print(f"Run {i}")
        poptorch_out = poptorch_model(a, b, c, d, e, t=t, x=x, m=m, z=z)
        helpers.assert_allclose(expected=native_out, actual=poptorch_out)

    # Omitting `z` is reported differently depending on whether the forward
    # signature names `z` explicitly or only receives it through **kwargs.
    if fwd_args:
        expected = "Missing arguments: z."
    else:
        expected = "Type mismatch for z: expected .*Tensor.* but got .*None"
    with pytest.raises(poptorch.Error, match=expected):
        poptorch_out = poptorch_model(a, b, c, d, e, t=t, x=x, m=m)

    # `m` only ever reaches the model through **w, in both variants.
    with pytest.raises(poptorch.Error, match="Missing arguments: m."):
        poptorch_out = poptorch_model(a, b, c, d, e, t=t, x=x, z=z)

    # Second configuration: destroy the model, then run with non-default
    # values for the non-tensor arguments `u` and `v`.
    poptorch_model.destroy()
    native_out = model(a, b, c, d, e, t=t, x=x, m=m, z=z, u=5, v="foobar")
    for i in range(2):
        print(f"Run {i}")
        poptorch_out = poptorch_model(a,
                                      b,
                                      c,
                                      d,
                                      e,
                                      t=t,
                                      x=x,
                                      m=m,
                                      z=z,
                                      u=5,
                                      v="foobar")
        helpers.assert_allclose(expected=native_out, actual=poptorch_out)

    # A different value for the non-tensor argument `u` must be rejected.
    with pytest.raises(poptorch.Error,
                       match="mismatch for u: expected 5 but got 3"):
        poptorch_out = poptorch_model(a,
                                      b,
                                      c,
                                      d,
                                      e,
                                      t=t,
                                      x=x,
                                      m=m,
                                      z=z,
                                      u=3,
                                      v="foobar")

    # A different number of positional arguments must be rejected too.
    with pytest.raises(
            poptorch.Error,
            match=("Number of positional arguments mismatch: expected"
                   " 5 arguments but got 4")):
        poptorch_model(a, b, c, e, t=t, x=x, m=m, z=z, u=5, v="foobar")

    with pytest.raises(
            poptorch.Error,
            match=("Number of positional arguments mismatch: expected "
                   "5 arguments but got 2")):
        poptorch_model(a, b, t=t, x=x, m=m, z=z, u=5, v="foobar")

    # Third configuration: destroy again and call with a mandatory argument
    # missing. The wrapper variant is expected to raise a plain TypeError,
    # the unwrapped variant a poptorch.Error.
    poptorch_model.destroy()
    if fwd_args:
        error_type = TypeError
        error = "missing 1 required positional argument: 'b'"
    else:
        error_type = poptorch.Error
        error = "Mandatory parameter b missing"

    with pytest.raises(error_type, match=error):
        poptorch_model(a)

    # Finally the minimal valid call: only the two mandatory arguments.
    native_out = model(a, b)
    for i in range(2):
        print(f"Run {i}")
        poptorch_out = poptorch_model(a, b)
        helpers.assert_allclose(expected=native_out, actual=poptorch_out)


def test_no_inputs_no_output():
    """A model whose forward takes no inputs and returns nothing can be run."""

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.x = torch.tensor([1.], dtype=torch.float)

        def forward(self):
            self.x += self.x

    poptorch_model = poptorch.inferenceModel(Model())
    # Call twice; only the absence of an exception is checked.
    for _ in range(2):
        poptorch_model()


def test_return_and_use_input():
    """A constant is returned both on its own and added to the input."""

    class Model(torch.nn.Module):
        def forward(self, input):
            c = torch.tensor([1.])
            return c, input + c

    poptorch_model = poptorch.inferenceModel(Model())
    # (input value, expected input + 1) pairs, run on the same model.
    for start, total in ((0., 1.), (1., 2.)):
        expected = (torch.tensor([1.]), torch.tensor([total]))
        assert poptorch_model(torch.tensor([start])) == expected


def test_return_and_use_nested_input():
    """Same as the flat variant, but the outputs are returned in a nested tuple."""

    class Model(torch.nn.Module):
        def forward(self, input):
            c = torch.tensor([1.])

            c = poptorch.set_available_memory(c, 0.1)

            return c, (c, input + c)

    poptorch_model = poptorch.inferenceModel(Model())
    # (input value, expected input + 1) pairs, run on the same model.
    for start, total in ((0., 1.), (1., 2.)):
        expected = (torch.tensor([1.]), (torch.tensor([1.]),
                                         torch.tensor([total])))
        assert poptorch_model(torch.tensor([start])) == expected


def test_scalar_tensor_input():
    """A zero-dimensional tensor is accepted as a model input."""

    class Square(torch.nn.Module):
        def forward(self, x):
            return x * x

    native_model = Square()
    ipu_model = poptorch.inferenceModel(native_model)
    scalar = torch.tensor(3.)  # shape = torch.Size([])
    ipu_out = ipu_model(scalar)
    helpers.assert_allclose(actual=ipu_out, expected=native_model(scalar))


def test_returned_only_inputs():
    """Inputs returned unchanged alongside a computed output match native."""

    class Model(torch.nn.Module):
        def forward(self, x, y, z):
            # x and y will become Identity ops inputs and will get passed out
            # as the model outputs
            return x, y, z + 0.0

    native_model = Model()
    ipu_model = poptorch.inferenceModel(native_model)

    # Two different input sets are run through the same poptorch model.
    input_sets = (
        (torch.tensor([1, 2]), torch.tensor([3, 4]), torch.tensor([1.2,
                                                                   3.4])),
        (torch.tensor([11, 12]), torch.tensor([13, 14]),
         torch.tensor([11.2, 13.4])),
    )
    for x, y, z in input_sets:
        cpu_outputs = native_model(x, y, z)
        ipu_outputs = ipu_model(x, y, z)
        for cpu_out, ipu_out in zip(cpu_outputs, ipu_outputs):
            helpers.assert_allclose(actual=ipu_out, expected=cpu_out)


def test_returned_only_inputs_with_params():
    """Pass-through inputs still work when the model also owns parameters."""

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # Add parameter to ensure they're handled correctly
            self.lin = torch.nn.Linear(2, 1)

        def forward(self, z, x, y):
            # x and y will be erased as inputs and converted to
            # host-side-only constants
            return x, y, self.lin(z)

    native_model = Model()
    ipu_model = poptorch.inferenceModel(native_model)
    x = torch.tensor([1, 2])
    y = torch.tensor([3, 4])
    z = torch.tensor([1.2, 3.4])

    cpu_outputs = native_model(z, x, y)
    ipu_outputs = ipu_model(z, x, y)
    for cpu_out, ipu_out in zip(cpu_outputs, ipu_outputs):
        helpers.assert_allclose(actual=ipu_out, expected=cpu_out)


================================================
FILE: tests/io_performance_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
import os  # pylint: disable=unused-import
import unittest.mock
import numpy as np
import pytest
import torch
import helpers
import poptorch

# Shape of every tensor produced by ImageDataset.__getitem__.
IMAGE_SIZE = (3, 512, 512)
# Number of items ImageDataset reports through __len__.
DATASET_SIZE = 1000
# Batch size handed to poptorch.DataLoader in get_mean_cycle_count.
BATCH_SIZE = 16


class ImageDataset(torch.utils.data.Dataset):
    """Dataset of random tensors of shape ``IMAGE_SIZE`` cast to ``io_dtype``.

    The index passed to ``__getitem__`` is ignored: every access draws a new
    random tensor with integer values in ``[0, 256)``.
    """

    def __init__(self, io_dtype):
        super().__init__()
        self.io_dtype = io_dtype

    def __len__(self):
        return DATASET_SIZE

    def __getitem__(self, _):
        raw_image = torch.randint(0, 256, IMAGE_SIZE)
        return raw_image.to(self.io_dtype)


def get_mean_cycle_count(io_dtype, capfd):
    """Return the mean logged cycle count over one pass of the dataset.

    A model that casts its input to float32, doubles it and casts back to
    ``io_dtype`` is run on every batch of an ``ImageDataset``. The cycle
    counts are parsed from the output captured by ``capfd``; exactly one
    count per iteration is expected.
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            doubled = x.to(torch.float32) * 2
            return doubled.to(io_dtype)

    opts = poptorch.Options()
    opts.logCycleCount(True)
    data_loader = poptorch.DataLoader(
        opts,
        ImageDataset(io_dtype),
        BATCH_SIZE,
        shuffle=False,
        drop_last=True,
    )
    model = poptorch.inferenceModel(Model(), opts)

    # Stays at zero if the loader yields nothing.
    num_iterations = 0
    for num_iterations, batch in enumerate(data_loader, start=1):
        _ = model(batch)
    data_loader.terminate()

    log_iterator = helpers.LogChecker(capfd).createIterator()
    log_matches = log_iterator.findAll(r'Total number of IPU cycles: (\d+)')
    assert len(log_matches) == num_iterations

    cycle_counts = [int(match.group(1)) for match in log_matches]
    return np.array(cycle_counts).mean()


@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("io_dtype1,io_dtype2",
                         [(torch.float32, torch.int8),
                          (torch.float32, torch.uint8),
                          (torch.float32, torch.float16)])
@helpers.printCapfdOnExit
@unittest.mock.patch.dict("os.environ", helpers.disableAllModels())
@helpers.overridePoptorchLogLevel("DEBUG")
def test_compare_io_performance(capfd, io_dtype1, io_dtype2):
    """Log the mean cycle counts for two IO dtypes and their ratio.

    Nothing is asserted on the values themselves.
    """
    cycle_count_1 = get_mean_cycle_count(io_dtype1, capfd)
    cycle_count_2 = get_mean_cycle_count(io_dtype2, capfd)
    ratio = cycle_count_1 / cycle_count_2
    # We only log the resulting cycle counts and ratios due to high variance
    # between the runs.
    report = ("test_compare_io_performance[{},{}],"
              "cycle_count1={}, cycle_count2={}, ratio={:.4f}")
    print(
        report.format(io_dtype1, io_dtype2, cycle_count_1, cycle_count_2,
                      ratio))


================================================
FILE: tests/ipu_print_tensor_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.

import torch
import pytest
import poptorch

# Expected output fragments, one per parametrized case of
# test_print_ipu_tensor (selected through `match_str_idx`). Each fragment is
# searched for verbatim in the captured stderr, so whitespace matters.
match_str = [
    """title: {
 {1.4962566 1.7682219}
 {1.0884774 1.1320305}
}""", """title: [
 [1.4962566e+00,
   1.7682219e+00]
 [1.0884774e+00,
   1.1320305e+00]
]""", """title: (
 (1.4962566;1.7682219)
 (1.0884774;1.1320305)
)"""
]

# Maps the `brackets_type` test parameter to an (opening, closing) pair that
# is unpacked into the last two arguments of poptorch.ipu_print_tensor.
brackets = {
    "parentheses": ("(", ")"),
    "square": ("[", "]"),
    "curly": ("{", "}")
}


@pytest.mark.parametrize(
    "title,print_gradient,summarise_threshold,edge_items,"
    "max_line_width,digits,float_format,separator,brackets_type,"
    "match_str_idx",
    [("title", True, 1000, 3, 75, 8, "auto", None, "curly", 0),
     ("title", True, 500, 2, 15, 8, "scientific", ",", "square", 1),
     ("title", True, 1500, 1, 125, 8, "fixed", ";", "parentheses", 2)])
def test_print_ipu_tensor(capfd, title, print_gradient, summarise_threshold,
                          edge_items, max_line_width, digits, float_format,
                          separator, brackets_type, match_str_idx):
    """Check the text emitted by poptorch.ipu_print_tensor.

    A small 2x2 tensor is printed with the parametrised formatting options
    and the captured stderr must contain the matching entry of 'match_str'.
    """
    # A separator of None in the parameters stands for a single space.
    if separator is None:
        separator = " "
    open_bracket, close_bracket = brackets[brackets_type]

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bias = torch.nn.Parameter(torch.zeros(()))

        def forward(self, x):
            shifted = x + 1
            printed = poptorch.ipu_print_tensor(
                shifted, title, print_gradient, summarise_threshold,
                edge_items, max_line_width, digits, float_format, separator,
                open_bracket, close_bracket)
            return printed + self.bias

    poptorch_model = poptorch.inferenceModel(Model())

    torch.manual_seed(0)
    _ = poptorch_model(torch.rand((2, 2)))

    # Comparing against 'dynamically' generated output would be very
    # awkward, so the stderr text is matched against the known outputs
    # listed in 'match_str'. The tensors are kept small on purpose.
    stderr_text = capfd.readouterr().err
    assert match_str[match_str_idx] in stderr_text


================================================
FILE: tests/loop_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import torch
import pytest
import helpers
import poptorch


def test_loop_constant():
    """Doubling the input ten times in a for_loop yields 2**10."""

    class Model(torch.nn.Module):
        def forward(self, x):
            def double(value):
                return value * 2

            looped = poptorch.for_loop(10, double, [x])
            return looped[0]

    inference_model = poptorch.inferenceModel(Model())

    result = inference_model(torch.tensor([1.]))
    assert result == 2**10


def test_loop_simple():
    """The loop body multiplies by a second model input it closes over."""

    class Model(torch.nn.Module):
        def forward(self, x, y):
            def scale_by_y(acc):
                return acc * y

            looped = poptorch.for_loop(10, scale_by_y, [x])
            return looped[0]

    inference_model = poptorch.inferenceModel(Model())

    start = torch.tensor([1.])
    factor = torch.tensor([2.])
    assert inference_model(start, factor) == 2**10


def test_loop_multiple_inputs():
    """A for_loop carrying four tensors matches the same loop run on host."""

    class Model(torch.nn.Module):
        def forward(self, x, y, z, w):
            def step(a, b, c, d):
                return a * b, b + c, a * d, d + 1

            return poptorch.for_loop(10, step, [x, y, z, w])

    inference_model = poptorch.inferenceModel(Model())

    initial = (0.1, 0.2, 0.3, 0.4)
    out = inference_model(*[torch.tensor([v]) for v in initial])

    # Check by running the equivalent computation on the host, starting from
    # fresh tensors holding the same initial values.
    x, y, z, w = [torch.tensor([v]) for v in initial]
    for _ in range(10):
        # All four right-hand sides are evaluated from the previous
        # iteration's values before any of the names is rebound.
        x, y, z, w = x * y, y + z, x * w, w + 1

    for host, ipu in zip([x, y, z, w], out):
        assert host == ipu


def test_loop_non_tensor_in():
    """Passing a non-tensor in the for_loop input list raises ValueError."""

    class Model(torch.nn.Module):
        def forward(self, x, _):
            def step(a, b):
                return a * b, b + 1

            # The second list element is a plain Python int, not a tensor.
            return poptorch.for_loop(10, step, [x, 5])

    inference_model = poptorch.inferenceModel(Model())

    first = torch.tensor([1.])
    second = torch.tensor([2.])

    expected_error = "(Object contained in list at index 1 is not torch.tensor)"
    with pytest.raises(ValueError, match=expected_error):
        inference_model(first, second)


def test_loop_non_list_in():
    """Passing the for_loop inputs as a bare tensor raises ValueError."""

    class Model(torch.nn.Module):
        def forward(self, x, y):
            def scale_by_y(acc):
                return acc * y

            # 'x' is deliberately not wrapped in a list.
            return poptorch.for_loop(10, scale_by_y, x)

    inference_model = poptorch.inferenceModel(Model())

    first = torch.tensor([1.])
    second = torch.tensor([2.])

    expected_error = "(Object is not list)"
    with pytest.raises(ValueError, match=expected_error):
        inference_model(first, second)


def test_loop_weights():
    """Run a for_loop whose body uses the module's Linear and Conv2d layers.

    Only checks that the model runs; no output value is asserted.
    """

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()

            self.layer1 = torch.nn.Linear(1, 256)
            self.layer2 = torch.nn.Conv2d(4, 1, [8, 8])

        def forward(self, x):
            def step(value):
                hidden = self.layer1(value)
                hidden = hidden.reshape([1, 4, 8, 8])
                return self.layer2(hidden).flatten()

            looped = poptorch.for_loop(2, step, [x])
            return looped[0]

    inference_model = poptorch.inferenceModel(Model())

    inference_model(torch.tensor([1.]))


def test_loop_weights_use_twice():
    """Run a for_loop whose body applies the same Linear layer twice.

    Only checks that the model runs; no output value is asserted.
    """

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.layer1 = torch.nn.Linear(4, 4)

        def forward(self, x):
            def step(value):
                once = self.layer1(value)
                return self.layer1(once)

            return poptorch.for_loop(2, step, [x])

    inference_model = poptorch.inferenceModel(Model())

    ones = torch.ones(1, 4).to(torch.float)
    inference_model(ones)


def test_loop_use_output():
    """Feed the for_loop result into identity_loss in an inference model.

    Only checks that the model runs; no output value is asserted.
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            def step(value):
                return value + value

            looped = poptorch.for_loop(2, step, [x])
            out = looped[0]
            return out, poptorch.identity_loss(out, reduction='sum')

    inference_model = poptorch.inferenceModel(Model())

    ones = torch.ones(1, 4).to(torch.float)
    inference_model(ones)


def test_loop_training():
    """Using for_loop in a training model must raise a poptorch.Error."""

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.layer1 = torch.nn.Linear(4, 4)

        def forward(self, x):
            def step(value):
                return self.layer1(value)

            looped = poptorch.for_loop(2, step, [x])
            out = looped[0]
            return out, poptorch.identity_loss(out, reduction='sum')

    training_model = poptorch.trainingModel(Model())

    ones = torch.ones(1, 4).to(torch.float)
    expected_error = r"poptorch.for_loop\(\) is only supported in inference"
    with pytest.raises(poptorch.Error, match=expected_error):
        training_model(ones)


def test_loop_body_inplace_ops_1():
    """One in-place add per iteration: the input stays 1, the output is 4."""

    class Model(torch.nn.Module):
        def forward(self, x):
            # Body inputs are passed by value so 'x' remains unchanged.
            def increment(acc):
                acc += 1
                return acc

            looped = poptorch.for_loop(3, increment, [x])
            return looped[0]

    poptorch_model = poptorch.inferenceModel(Model())
    x = torch.ones(1, 5).to(torch.int32)
    reference = torch.ones(1, 5).to(torch.int32)

    out = poptorch_model(x)
    helpers.assert_allequal(actual=x, expected=reference)
    helpers.assert_allequal(actual=out, expected=reference * 4)


def test_loop_body_inplace_ops_2():
    """Two in-place adds per iteration: the input stays 1, the output is 7."""

    class Model(torch.nn.Module):
        def forward(self, x):
            # Body inputs are passed by value so 'x' remains unchanged.
            def increment_twice(acc):
                acc += 1
                acc += 1
                return acc

            looped = poptorch.for_loop(3, increment_twice, [x])
            return looped[0]

    poptorch_model = poptorch.inferenceModel(Model())
    x = torch.ones(1, 5).to(torch.int32)
    reference = torch.ones(1, 5).to(torch.int32)

    out = poptorch_model(x)
    helpers.assert_allequal(actual=x, expected=reference)
    helpers.assert_allequal(actual=out, expected=reference * 7)


def test_loop_body_inplace_ops_3():
    """An in-place add before the loop is visible on the input afterwards.

    The input ends up at 2 (only the pre-loop add) and the output at 5.
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            x += 1

            # Body inputs are passed by value so 'x' remains unchanged.
            def increment(acc):
                acc += 1
                return acc

            looped = poptorch.for_loop(3, increment, [x])
            return looped[0]

    poptorch_model = poptorch.inferenceModel(Model())
    x = torch.ones(1, 5).to(torch.int32)
    reference = torch.ones(1, 5).to(torch.int32)

    out = poptorch_model(x)
    helpers.assert_allequal(actual=x, expected=reference * 2)
    helpers.assert_allequal(actual=out, expected=reference * 5)


def test_loop_body_inplace_ops_4():
    """In-place adds before and after the loop both reach the input.

    The input ends up at 3 (the two adds outside the loop) while the loop
    output, computed between them, is 5.
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            x += 1

            # Body inputs are passed by value so 'x' remains unchanged.
            def increment(acc):
                acc += 1
                return acc

            looped = poptorch.for_loop(3, increment, [x])[0]
            x += 1
            return looped

    poptorch_model = poptorch.inferenceModel(Model())
    x = torch.ones(1, 5).to(torch.int32)
    reference = torch.ones(1, 5).to(torch.int32)

    out = poptorch_model(x)
    helpers.assert_allequal(actual=x, expected=reference * 3)
    helpers.assert_allequal(actual=out, expected=reference * 5)


def test_loop_with_constant_inputs_only():
    """A model with no inputs runs a for_loop over constant tensors only."""

    class Model(torch.nn.Module):
        def forward(self):
            # 'folded' will be evaluated as part of constexpr folding.
            folded = torch.tensor([0., 0.])
            folded = folded + 8
            # 'a' and 'b' must not be evaluated as part of constexpr folding.
            a = torch.tensor([1., 2.])
            b = torch.tensor([3., 4.])

            def step(x, y):
                x = x * 2
                y = y * x
                return x, y

            a, b = poptorch.for_loop(5, step, [a, b])
            return a, folded

    poptorch_model = poptorch.inferenceModel(Model())
    expected = (torch.tensor([32., 64.]), torch.tensor([8., 8.]))
    helpers.assert_allequal(actual=poptorch_model(), expected=expected)


def test_loop_with_same_trip_count_on_multiple_ipus():
    """Two for_loops with equal trip counts placed on two different IPUs.

    Each loop adds 'y' to 'x' five times, so the first output is expected to
    be close to x + 10 * y.
    """

    class Model(torch.nn.Module):
        def forward(self, x, y):
            def accumulate(acc, inc):
                acc = acc + inc
                return acc, inc

            # Note: both trip_count equal to 5
            with poptorch.Block("0", ipu_id=0):
                x, y = poptorch.for_loop(5, accumulate, [x, y])

            with poptorch.Block("1", ipu_id=1):
                x, y = poptorch.for_loop(5, accumulate, [x, y])

            return x, y

    # One stage per block, executed with a sharded strategy.
    opts = poptorch.Options()
    opts.setExecutionStrategy(
        poptorch.ShardedExecution(poptorch.Stage("0"), poptorch.Stage("1")))
    ipu_model = poptorch.inferenceModel(Model(), opts)

    x = torch.tensor([1., 2.])
    y = torch.tensor([1., 2.])

    ipu_x, _ = ipu_model(x, y)

    helpers.assert_allclose(actual=ipu_x, expected=x + 10 * y)


================================================
FILE: tests/losses_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import random

import os  # pylint: disable=unused-import
import unittest.mock
import torch
import torch.optim as optim
import torch.nn.functional as F
import pytest
import helpers
import poptorch


def loss_harness(loss,
                 inputs,
                 target,
                 reduction,
                 op=None,
                 training=True,
                 **kwargs):
    """Compare a loss-driven model on the host and through PopTorch.

    A helpers.ModelWithWeights is built around 'op' (pass-through when None)
    and a loss function wrapping 'loss'. It is run once natively and once
    through PopTorch, and the model outputs are compared. In training mode
    the weights are additionally checked to have changed.

    Args:
        loss: Loss callable, called with the model output(s), the target
            (except for the three input case), 'reduction' and 'kwargs'.
        inputs: Sequence of one, two or three input tensors.
        target: Target passed to 'loss'; unused with three inputs.
        reduction: Value forwarded as the 'reduction' keyword of 'loss'.
        op: Optional op applied to the inputs before the loss.
        training: Wrap with trainingModel when True, inferenceModel otherwise.
        **kwargs: Extra keyword arguments forwarded to 'loss'.

    Returns:
        Tuple (poptorch_model, poptorch_out, poptorch_loss).
    """

    def moveTo(structure, device):
        # Tensors (possibly nested in dicts) are moved to 'device'; anything
        # else is returned untouched.
        if torch.is_tensor(structure):
            return structure.to(device)
        if isinstance(structure, dict):
            return {k: moveTo(v, device) for k, v in structure.items()}
        return structure

    num_inputs = len(inputs)
    assert num_inputs in (1, 2, 3)

    if num_inputs == 1:

        def loss_fn(x):
            return loss(x,
                        moveTo(target, x.device),
                        reduction=reduction,
                        **moveTo(kwargs, x.device))

        def passthrough(x):
            return x
    elif num_inputs == 2:

        def loss_fn(x, y):
            return loss(x,
                        y,
                        moveTo(target, x.device),
                        reduction=reduction,
                        **moveTo(kwargs, x.device))

        def passthrough(x, y):
            return (x, y)
    else:
        # The only supported loss fn with 3 inputs is TripletMarginLoss
        # which has no "target" per se
        def loss_fn(x, y, z):
            return loss(x,
                        y,
                        z,
                        reduction=reduction,
                        **moveTo(kwargs, x.device))

        def passthrough(x, y, z):
            return (x, y, z)

    if op is None:
        op = passthrough

    model = helpers.ModelWithWeights(op, inputs[0].shape, loss_fn=loss_fn)

    if training:
        poptorch_model = poptorch.trainingModel(model)
    else:
        poptorch_model = poptorch.inferenceModel(model)

    model_args = tuple(inputs)
    native_out, _ = model(model_args)
    poptorch_out, poptorch_loss = poptorch_model(tuple(inputs))

    # Inference test - check outputs
    helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    # Training test - check weights have changed
    if training:
        poptorch_model.assert_weights_changed()

    # The PopTorch model and its first outputs are handed back so callers
    # can carry out further checks.
    return poptorch_model, poptorch_out, poptorch_loss


@pytest.mark.parametrize("reduction", ["mean", "sum"])
def test_L1Loss(reduction):
    """Train with L1 loss until the model output matches the target."""
    torch.manual_seed(42)

    target = torch.randn(10)
    sample = torch.randn(10)

    poptorch_model, first_out, first_loss = loss_harness(
        F.l1_loss, [sample], target, reduction)

    # Make sure the first run doesn't already pass the test.
    assert first_loss > 0.1
    assert not torch.allclose(first_out, target, rtol=1e-02, atol=1e-02)

    for step in range(1000):
        out, loss = poptorch_model((sample, ))

        # Model needs to adjust the LR in the middle to converge
        if step == 500:
            reduced_lr = optim.SGD(poptorch_model.model.parameters(),
                                   lr=0.001)
            poptorch_model.setOptimizer(reduced_lr)

    # Check we have trained the "model"
    assert loss < first_loss

    # "sum" L1 losses tend to be very large compared to "mean"
    loss_bound = 0.1 if reduction == "sum" else 0.001
    assert loss < loss_bound

    helpers.assert_allclose(actual=out,
                            expected=target,
                            rtol=1e-02,
                            atol=1e-02)


@pytest.mark.parametrize("reduction", ["mean", "sum"])
def test_MSELoss(reduction):
    """Train with MSE loss until the model output matches the target."""
    torch.manual_seed(42)

    target = torch.randn(10)
    sample = torch.randn(10)

    poptorch_model, first_out, first_loss = loss_harness(
        F.mse_loss, [sample], target, reduction)

    # Make sure the first run doesn't already pass the test
    assert first_loss > 0.1
    assert not torch.allclose(first_out, target, rtol=1e-02, atol=1e-02)

    for _ in range(1000):
        out, loss = poptorch_model((sample, ))

    # Check we have trained the "model"
    assert loss < 0.001
    tolerances = {"rtol": 1e-02, "atol": 1e-02}
    helpers.assert_allclose(actual=out, expected=target, **tolerances)


# Each entry is an (input shape, reduction) pair for test_CrossEntropy.
cross_entropy_params = [((1, 10), "mean"), ((1, 10, 2), "sum"),
                        ((1, 10, 2, 3), "mean")]


@pytest.mark.parametrize("input_shape, reduction", cross_entropy_params)
def test_CrossEntropy(input_shape, reduction):
    """Train with cross entropy until the argmax over dim 1 is the label."""
    torch.manual_seed(42)

    logits = torch.randn(input_shape)
    # The label has the input's shape with the second dimension removed.
    label_shape = [input_shape[0], *input_shape[2:]]
    label = torch.randint(0, 10, label_shape)

    poptorch_model, _, first_loss = loss_harness(F.cross_entropy, [logits],
                                                 label, reduction)

    for _ in range(100):
        out, loss = poptorch_model((logits, ))

    # Check we have trained the "model"
    assert loss < first_loss
    predicted = torch.argmax(out, dim=1)
    helpers.assert_allequal(actual=predicted, expected=label)


# Test softmax and logsoftmax for dimensions more than 2
def op_withdim(op, input):
    """Run 'op' on the host and through PopTorch and compare the results."""
    # Reference result computed on CPU.
    expected = op(input)

    # Same op wrapped as an inference model and run on IPU.
    actual = poptorch.inferenceModel(op)(input)

    helpers.assert_allclose(expected=expected, actual=actual)


# Module classes exercised by the test_op_withdim_* tests.
ops_float = [torch.nn.Softmax, torch.nn.LogSoftmax]


@unittest.mock.patch.dict("os.environ", helpers.disableSmallModel())
@pytest.mark.parametrize("op", ops_float)
@pytest.mark.parametrize("dim", range(-4, 3))
def test_op_withdim_4d(op, dim):
    """Compare 'op' along 'dim' on a 4D tensor between host and IPU."""
    shape = (11, 22, 33, 44)
    torch.manual_seed(42)
    sample = torch.randn(*shape)

    op_withdim(op(dim=dim), sample)


@pytest.mark.parametrize("op", ops_float)
@pytest.mark.parametrize("dim", range(-2, 1))
def test_op_withdim_2d(op, dim):
    """Compare 'op' along 'dim' on a 2D tensor between host and IPU."""
    shape = (17, 13)
    torch.manual_seed(42)
    sample = torch.randn(*shape)

    op_withdim(op(dim=dim), sample)


# Test NLL loss by using it to match a target label.
@pytest.mark.parametrize("reduction", ["mean", "sum"])
def test_NLLLoss(reduction):
    """Train log_softmax + NLL loss until the argmax equals the label."""
    torch.manual_seed(42)

    def log_softmax_dim1(x):
        return F.log_softmax(x, dim=1)

    label = torch.randint(0, 10, [1])
    sample = torch.randn(1, 10)

    poptorch_model, _, first_loss = loss_harness(F.nll_loss, [sample], label,
                                                 reduction, log_softmax_dim1)

    for _ in range(100):
        out, loss = poptorch_model((sample, ))

    # Check we have trained the "model"
    assert loss < first_loss
    assert torch.argmax(out, dim=1) == label


# Test NLL loss 2d by using it to match a target label.
@pytest.mark.parametrize("reduction", ["mean", "sum"])
def test_NLLLoss2d(reduction):
    """Train log_softmax + NLL loss on a 4D input with a 3D label map."""
    torch.manual_seed(42)
    batch, classes, side = 3, 2, 5

    def log_softmax_dim1(x):
        return F.log_softmax(x, dim=1)

    labels = torch.empty(batch, side, side,
                         dtype=torch.long).random_(0, classes)
    sample = torch.randn(batch, classes, side, side)

    poptorch_model, _, first_loss = loss_harness(F.nll_loss, [sample], labels,
                                                 reduction, log_softmax_dim1)

    for _ in range(100):
        out, loss = poptorch_model((sample, ))

    # Check we have trained the "model"
    assert loss < first_loss
    predicted = torch.argmax(out, dim=1)
    helpers.assert_allclose(actual=predicted, expected=labels)


# This also serves as the NLL loss test as it uses NLL under the hood.
@pytest.mark.parametrize("reduction", ["mean", "sum"])
def test_BCE(reduction):
    """Train sigmoid + binary cross entropy until the output hits the target."""
    torch.manual_seed(42)

    target = torch.empty(10).uniform_()
    sample = torch.randn(10)

    poptorch_model, _, _ = loss_harness(F.binary_cross_entropy, [sample],
                                        target,
                                        reduction,
                                        op=torch.sigmoid)

    # The baseline is the loss of one more step taken after the harness run,
    # not the loss returned by the harness itself.
    _, baseline_loss = poptorch_model((sample, ))

    for _ in range(2500):
        out, loss = poptorch_model((sample, ))

    # Check we have trained the "model"
    assert loss < baseline_loss
    tolerances = {"rtol": 1e-02, "atol": 1e-02}
    helpers.assert_allclose(actual=out, expected=target, **tolerances)


# TODO(T22975)
# This also serves as the NLL loss test as it uses NLL under the hood.
# Re-enable once pytorch fixes https://github.com/pytorch/pytorch/issues/40679
# def test_BCE_direct_with_weight():
#     reductions = ["mean", "sum"]
#     torch.manual_seed(42)

#     for reduction in reductions:

#         weight = torch.randn(10)
#         model = torch.nn.BCELoss(weight=weight, reduction=reduction)

#         poptorch_model = poptorch.inferenceModel(model)

#         for i in range(0, 10):
#             target = torch.empty(10, 10).random_(2)
#             input = torch.empty(10, 10).uniform_()

#             groundTruth = model(input, target)
#             poptorch_out = poptorch_model(input, target)
#             helpers.assert_allclose(expected=groundTruth, actual=poptorch_out)


# Parameters are given as lists rather than sets so that the collection
# order (and therefore the generated test ids) is deterministic.
@pytest.mark.parametrize("reduction", ["mean", "sum", "batchmean"])
@pytest.mark.parametrize("log_target", [True, False])
def test_KLDiv(reduction, log_target):
    """Run the loss harness with F.kl_div for each reduction / log_target."""
    torch.manual_seed(42)

    # 2D Tensors to test batchmean
    target = torch.empty(3, 10).uniform_(-1, 1)
    input = torch.randn(3, 10)

    loss_harness(F.kl_div, [input], target, reduction, log_target=log_target)


# Parameters are given as lists rather than sets so that the collection
# order (and therefore the generated test ids) is deterministic.
@pytest.mark.parametrize("reduction", ["mean", "sum"])
@pytest.mark.parametrize("log_input", [True, False])
@pytest.mark.parametrize("full", [True, False])
def test_PoissonNLLLoss(reduction, log_input, full):
    """Run the loss harness with F.poisson_nll_loss for each option combo."""
    torch.manual_seed(42)

    target = torch.poisson(torch.rand(10) * 5)
    input = torch.empty(10).uniform_()

    loss_harness(F.poisson_nll_loss, [input],
                 target,
                 reduction,
                 full=full,
                 log_input=log_input)


# A list (not a set) keeps the parametrisation order deterministic.
@pytest.mark.parametrize("reduction", ["mean", "sum"])
def test_HingeEmbeddingLoss(reduction):
    """Run the loss harness with F.hinge_embedding_loss and a random margin."""
    torch.manual_seed(42)

    # Margin drawn from [0.5, 1.5)
    delta = torch.rand(1) + 0.5

    # Generate random set of 1s and -1s for labels
    target = torch.randint(2, [10]) * 2 - 1
    input = torch.empty(10).uniform_()

    loss_harness(F.hinge_embedding_loss, [input],
                 target,
                 reduction,
                 margin=delta.item())


# Parameter sets for test_BCEWithLogitsLoss. Each entry is a tuple of
# (input, target, weight, pos_weight). The tensors are created at import
# time, right after seeding the generator, so the order of the random calls
# below determines their values.
torch.manual_seed(42)
params_bcewithlogits = [
    (
        torch.rand(10, 3),  # Inputs
        torch.empty(10, 3).uniform_(),  # Targets
        torch.rand(10, 3),  # Weights
        torch.rand(3)  # Pos Weights
    ),
    # Numerical stability test: a single large logit, no weights.
    (torch.tensor([88.0]), torch.tensor([0.5]), None, None)
]


# A list (not a set) keeps the parametrisation order deterministic.
@pytest.mark.parametrize("reduction", ["mean", "sum"])
@pytest.mark.parametrize("input, target, weight, pos_weight",
                         params_bcewithlogits)
def test_BCEWithLogitsLoss(reduction, input, target, weight, pos_weight):
    """Run the loss harness with F.binary_cross_entropy_with_logits."""
    loss_harness(F.binary_cross_entropy_with_logits, [input],
                 target,
                 reduction,
                 weight=weight,
                 pos_weight=pos_weight)


# A list (not a set) keeps the parametrisation order deterministic.
@pytest.mark.parametrize("reduction", ["mean", "sum"])
def test_SmoothL1Loss(reduction):
    """Run the loss harness with F.smooth_l1_loss."""
    torch.manual_seed(42)

    input = torch.randn(10)
    target = torch.empty(10).uniform_()

    loss_harness(F.smooth_l1_loss, [input], target, reduction)


# A list (not a set) keeps the parametrisation order deterministic.
@pytest.mark.parametrize("reduction", ["mean", "sum"])
def test_SoftMarginLoss(reduction):
    """Run the loss harness with F.soft_margin_loss."""
    torch.manual_seed(42)

    input = torch.empty(10).uniform_()
    # Generate random set of 1s and -1s for labels
    target = torch.randint(2, [10]) * 2 - 1

    loss_harness(F.soft_margin_loss, [input], target, reduction)


# TODO(T30688): Support MultiLabelSoftMarginLoss
@pytest.mark.skip(
    reason="TODO(T30688): MultiLabelSoftMarginLoss is not supported yet")
# Lists (not sets) keep the parametrisation order deterministic.
@pytest.mark.parametrize("reduction", ["mean", "sum"])
@pytest.mark.parametrize("specify_weight", [True, False])
def test_MultiLabelSoftMarginLoss(reduction, specify_weight):
    """Run the loss harness with F.multilabel_soft_margin_loss.

    Currently skipped; kept so it can be re-enabled once the loss is
    supported.
    """
    torch.manual_seed(42)

    weight = torch.randn(3, 10) if specify_weight else None

    input = torch.empty(3, 10).uniform_()
    # Generate random set of 0s and 1s for labels
    target = torch.randint(2, [3, 10])

    loss_harness(F.multilabel_soft_margin_loss, [input],
                 target,
                 reduction,
                 weight=weight)


# A list (not a set) keeps the parametrisation order deterministic.
@pytest.mark.parametrize("reduction", ["mean", "sum"])
def test_CosineEmbeddingLoss(reduction):
    """Run the loss harness with F.cosine_embedding_loss.

    Currently skipped at runtime; the body below the skip is kept so the
    test can be re-enabled by removing the skip call.
    """
    pytest.skip("TODO(T66165): Fails due to detach op pass")

    torch.manual_seed(42)

    # Margin should be between -1 and 1
    margin = torch.rand(1) * 2 - 1

    input1 = torch.empty(10, 3).uniform_()
    input2 = torch.empty(10, 3).uniform_()

    # Generate random set of 1s and -1s for labels
    target = torch.randint(2, [10]) * 2 - 1

    loss_harness(F.cosine_embedding_loss, [input1, input2],
                 target,
                 reduction,
                 margin=margin.item())


# A list (not a set) keeps the parametrisation order deterministic.
@pytest.mark.parametrize("reduction", ["mean", "sum"])
def test_MarginRankingLoss(reduction):
    """Run the loss harness with F.margin_ranking_loss and a random margin."""
    torch.manual_seed(42)

    # Margin should be between -1 and 1
    margin = torch.rand(1) * 2 - 1

    # As per the current PyTorch implementation, both dims must be equal
    input1 = torch.empty(10, 10).uniform_()
    input2 = torch.empty(10, 10).uniform_()

    # Generate random set of 1s and -1s for labels
    target = torch.randint(2, [10, 10]) * 2 - 1

    loss_harness(F.margin_ranking_loss, [input1, input2],
                 target,
                 reduction,
                 margin=margin.item())


# Lists (not sets) keep the parametrisation order deterministic.
@pytest.mark.parametrize("p", [2., 3.])
@pytest.mark.parametrize("swap", [True, False])
@pytest.mark.parametrize("reduction", ["mean", "sum"])
def test_TripletMarginLoss(p, swap, reduction):
    """Run the loss harness with F.triplet_margin_loss.

    The loss takes three inputs (anchor, positive, negative) and no target,
    hence the None passed as target.
    """
    torch.manual_seed(42)

    # Between 0 and 2
    margin = torch.rand(1) * 2

    anchor = torch.randn(10, 5)
    positive = torch.randn(10, 5)
    negative = torch.randn(10, 5)

    loss_harness(F.triplet_margin_loss, [anchor, positive, negative],
                 None,
                 reduction,
                 margin=margin.item(),
                 p=p,
                 swap=swap)


# Lists (not sets) keep the parametrisation order deterministic.
@pytest.mark.parametrize("blank", [0, 3])
@pytest.mark.parametrize("reduction", ["mean", "sum"])
@pytest.mark.parametrize("zero_infinity", [True, False])
@pytest.mark.parametrize("lengths_are_tensors", [True, False])
def test_CTCLoss(blank, reduction, zero_infinity, lengths_are_tensors):
    """Run the loss harness with F.ctc_loss.

    The input / target lengths are passed either as tensors or as Python
    lists depending on 'lengths_are_tensors'.
    """
    T = 10  # Input sequence length
    N = 4  # Batch size
    C = 5  # Number of classes
    S = 6 if not zero_infinity else 10  # Target sequence length
    S_min = 3  # Minimum target length

    torch.manual_seed(42)

    # Initialize random batch of input vectors, for *size = (T,N,C)
    input = torch.randn(T, N, C).log_softmax(-1).detach()

    if lengths_are_tensors:
        input_lengths = torch.full(size=(N, ), fill_value=T, dtype=torch.long)
        target_lengths = torch.randint(low=S_min,
                                       high=S,
                                       size=(N, ),
                                       dtype=torch.long)
    else:
        # torch.manual_seed() does not seed Python's 'random' module, so a
        # dedicated seeded generator is used to keep the test reproducible
        # without touching the global random state.
        rng = random.Random(42)
        input_lengths = [T] * N
        target_lengths = [rng.randint(S_min, S - 1) for _ in range(N)]

    # Initialize random batch of targets (0..C excluding the blank class):
    # draw from C - 1 values, then shift every value at or above 'blank' up
    # by one so that 'blank' itself never appears in the targets.
    target = torch.randint(low=0, high=C - 1, size=(N, S), dtype=torch.long)
    target[target >= blank] += 1

    loss_harness(F.ctc_loss, [input],
                 target,
                 reduction,
                 input_lengths=input_lengths,
                 target_lengths=target_lengths,
                 blank=blank,
                 zero_infinity=zero_infinity)


@pytest.mark.parametrize("reduction", ("mean", "sum"))
def test_identity_with_linear_out_returned(reduction):
    """Return both the identity_loss and the Linear output it was built from.

    Both values must match the host results, and the loss must have a
    different shape from the layer output.
    """
    torch.manual_seed(42)

    features = 2

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.lin = torch.nn.Linear(features, features)

        def forward(self, x):
            projected = self.lin(x)
            return poptorch.identity_loss(projected,
                                          reduction=reduction), projected

    sample = torch.rand(1, 1, features)

    model = Model()
    native_loss, native_out = model(sample)

    poptorch_loss, poptorch_out = poptorch.trainingModel(model)(sample)

    for actual, expected in ((poptorch_loss, native_loss), (poptorch_out,
                                                            native_out)):
        helpers.assert_allclose(actual=actual, expected=expected)

    assert native_loss.shape != native_out.shape


================================================
FILE: tests/lstm_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import torch
import torch.nn as nn
import helpers
import poptorch


def test_lstm():
    """Step an LSTM one element at a time on host and IPU and compare."""
    torch.manual_seed(42)
    lstm = nn.LSTM(3, 3)
    ipuLstm = poptorch.inferenceModel(lstm)
    sequence = [torch.randn(1, 3) for _ in range(5)]
    # Initial (h, c) state; replaced after every step by the host's new state.
    state = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))
    for element in sequence:
        # Feed a single sequence element, shaped (1, 1, features).
        step = element.view(1, 1, -1)
        out, new_state = lstm(step, state)
        ipu_out, ipu_state = ipuLstm(step, state)
        for expected, actual in zip(new_state, ipu_state):
            helpers.assert_allclose(expected=expected, actual=actual)
        helpers.assert_allclose(expected=out, actual=ipu_out)
        state = new_state


def test_lstm2():
    """Run a whole 5 element sequence through an LSTM in a single call."""
    torch.manual_seed(42)
    num_hidden, input_size = 5, 3
    lstm = nn.LSTM(input_size, num_hidden)
    ipuLstm = poptorch.inferenceModel(lstm)
    elements = [torch.randn(1, input_size) for _ in range(5)]
    # Stack the elements and insert a batch dimension of size 1 in the middle.
    sequence = torch.cat(elements).view(len(elements), 1, -1)
    state = (torch.randn(1, 1, num_hidden), torch.randn(1, 1, num_hidden))
    out, new_state = lstm(sequence, state)
    ipu_out, ipu_state = ipuLstm(sequence, state)
    for expected, actual in zip(new_state, ipu_state):
        helpers.assert_allclose(expected=expected, actual=actual)
    helpers.assert_allclose(expected=out, actual=ipu_out)


def test_lstm_twice():
    """Run the same sequence twice and check both IPU runs agree."""
    torch.manual_seed(42)
    num_hidden, input_size = 5, 3
    lstm = nn.LSTM(input_size, num_hidden)
    ipuLstm = poptorch.inferenceModel(lstm)
    elements = [torch.randn(1, input_size) for _ in range(5)]
    # Stack the elements and insert a batch dimension of size 1 in the middle.
    sequence = torch.cat(elements).view(len(elements), 1, -1)
    state = (torch.randn(1, 1, num_hidden), torch.randn(1, 1, num_hidden))

    # First run.
    out, new_state = lstm(sequence, state)
    first_ipu_out, first_ipu_state = ipuLstm(sequence, state)
    for expected, actual in zip(new_state, first_ipu_state):
        helpers.assert_allclose(expected=expected, actual=actual)
    helpers.assert_allclose(expected=out, actual=first_ipu_out)

    # Second run with the very same inputs.
    out, new_state = lstm(sequence, state)
    second_ipu_out, second_ipu_state = ipuLstm(sequence, state)
    for expected, actual in zip(new_state, second_ipu_state):
        helpers.assert_allclose(expected=expected, actual=actual)
    helpers.assert_allclose(expected=out, actual=second_ipu_out)
    helpers.assert_allclose(expected=first_ipu_out, actual=second_ipu_out)


def test_lstm_batch_first():
    """Run a batch_first LSTM on a single sequence of 5 elements."""
    torch.manual_seed(42)
    num_hidden, input_size = 5, 3
    lstm = nn.LSTM(input_size, num_hidden, batch_first=True)
    ipuLstm = poptorch.inferenceModel(lstm)
    elements = [torch.randn(1, input_size) for _ in range(5)]
    # Stack the elements and add a leading batch dimension of size 1.
    sequence = torch.cat(elements).view(1, len(elements), -1)
    state = (torch.randn(1, 1, num_hidden), torch.randn(1, 1, num_hidden))
    out, new_state = lstm(sequence, state)
    ipu_out, ipu_state = ipuLstm(sequence, state)
    for expected, actual in zip(new_state, ipu_state):
        helpers.assert_allclose(expected=expected, actual=actual)
    helpers.assert_allclose(expected=out, actual=ipu_out)


def test_lstm_batched():
    """Run an LSTM on a batch of 4 sequences, compared between host and IPU."""
    torch.manual_seed(42)
    numHidden = 5
    inputSize = 3
    batch = 4
    # Use 'inputSize' rather than repeating the literal so the layer and the
    # generated inputs cannot drift apart.
    lstm = nn.LSTM(inputSize, numHidden)
    ipuLstm = poptorch.inferenceModel(lstm)
    inputs = [torch.randn(batch, inputSize) for _ in range(5)]
    # Reshape the concatenated elements to (sequence, batch, features)
    inputs = torch.cat(inputs).view(len(inputs), batch, -1)
    hidden = (torch.randn(1, batch,
                          numHidden), torch.randn(1, batch, numHidden))
    out, newHidden = lstm(inputs, hidden)
    ipuOut, ipuHidden = ipuLstm(inputs, hidden)
    helpers.assert_allclose(expected=newHidden[0], actual=ipuHidden[0])
    helpers.assert_allclose(expected=newHidden[1], actual=ipuHidden[1])
    helpers.assert_allclose(expected=out, actual=ipuOut)


def test_lstm_batched_batch_first():
    """Run a batch_first LSTM on a batch of 4 sequences."""
    torch.manual_seed(42)
    num_hidden, input_size, batch = 5, 3, 4
    lstm = nn.LSTM(input_size, num_hidden, batch_first=True)
    ipuLstm = poptorch.inferenceModel(lstm)
    elements = [torch.randn(batch, input_size) for _ in range(5)]
    # Reshape the concatenated elements to (batch, sequence, features).
    sequence = torch.cat(elements).view(batch, len(elements), -1)
    state = (torch.randn(1, batch, num_hidden),
             torch.randn(1, batch, num_hidden))
    out, new_state = lstm(sequence, state)
    ipu_out, ipu_state = ipuLstm(sequence, state)
    for expected, actual in zip(new_state, ipu_state):
        helpers.assert_allclose(expected=expected, actual=actual)
    helpers.assert_allclose(expected=out, actual=ipu_out)


def test_lstm_fc():
    """Run an LSTM wrapped in ModelWithWeights natively and via trainingModel.

    The outputs and final states must match, and the PopTorch model's
    weights must have changed afterwards.
    """
    torch.manual_seed(42)

    batch_size, input_size = 2, 5
    lstm = nn.LSTM(input_size, hidden_size=3, num_layers=1, bias=True)
    sample = torch.randn(1, batch_size, input_size)

    # Only the first element of the LSTM result is handed to the wrapper as
    # the output function.
    take_first = lambda result: result[0]
    model = helpers.ModelWithWeights(lstm, sample.shape, take_first)
    poptorch_model = poptorch.trainingModel(model)

    (native_out, (native_hn, native_cn)), _ = model((sample, ))
    (poptorch_out, (poptorch_hn, poptorch_cn)), _ = poptorch_model(
        (sample, ))

    # Inference test - check outputs
    comparisons = (
        (poptorch_out, native_out),
        (poptorch_hn, native_hn),
        (poptorch_cn, native_cn),
    )
    for actual, expected in comparisons:
        helpers.assert_allclose(actual=actual, expected=expected)

    # Training test - check weights have changed
    poptorch_model.assert_weights_changed()


================================================
FILE: tests/math_ops_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import unittest

import torch
import pytest
import helpers
import poptorch

# Ops for which op_harness never takes the training path, even when
# test_training=True is requested.
non_differentiable_ops = [
    torch.ceil, torch.floor, torch.round, torch.sign, torch.trunc,
    torch.argmax, torch.argmin, torch.remainder, torch.floor_divide
]


def op_harness(op, inputs, assert_func, test_training=False, out_fn=None):
    """Run `op` natively and through PopTorch, then compare the results.

    `inputs` must contain one or two entries. When `test_training` is set
    and `op` is not in `non_differentiable_ops`, the op is wrapped in
    `helpers.ModelWithWeights` (with `out_fn`) and run through
    `poptorch.trainingModel`, after which the weights are asserted to have
    changed. Otherwise a small module forwarding to `op` is run through
    `poptorch.inferenceModel`. In both cases
    `assert_func(native_out, poptorch_out)` is called last.
    """
    num_inputs = len(inputs)
    if num_inputs != 1:
        assert num_inputs == 2

    run_training = test_training and op not in non_differentiable_ops

    if not run_training:

        class Model(torch.nn.Module):
            def __init__(self, op):
                super().__init__()
                self.op = op

        # Attach a forward whose arity matches the number of inputs.
        if num_inputs == 1:
            Model.forward = lambda self, x: self.op(x)
        else:
            Model.forward = lambda self, x, y: self.op(x, y)

        model = Model(op)

        # Native run first, then the IPU run.
        native_out = model(*inputs)
        poptorch_model = poptorch.inferenceModel(model)
        poptorch_out = poptorch_model(*inputs)
    else:
        model = helpers.ModelWithWeights(op, inputs[0].shape, out_fn)
        packed_inputs = tuple(inputs)

        # Native run.
        native_out, _ = model(packed_inputs)

        # The LR should be large enough that a single training step will
        # definitely cause weights to change
        optim = torch.optim.AdamW(model.parameters(), lr=0.1)

        # IPU run.
        poptorch_model = poptorch.trainingModel(model, optimizer=optim)
        poptorch_out, _ = poptorch_model(tuple(inputs))

        # Training test - check weights have changed
        poptorch_model.assert_weights_changed()

    assert_func(native_out, poptorch_out)


# Unary ops used to parametrize test_unary_ops_float. Entries that are
# commented out are not exercised by that test.
unary_ops_float = [
    torch.abs,
    torch.acos,
    torch.acosh,
    torch.asin,
    torch.asinh,
    torch.atan,
    torch.atanh,
    # torch.angle,
    torch.ceil,
    torch.cos,
    torch.cosh,
    # torch.conj, torch.digamma
    torch.erf,
    torch.erfc,
    #torch.erfinv,
    torch.exp,
    torch.expm1,
    torch.floor,
    torch.frac,
    # torch.imag, torch.lgamma,
    torch.log,
    torch.log10,
    torch.log1p,
    torch.log2,
    # torch.logical_not, torch.mvlgamma,
    torch.neg,
    # torch.real,
    torch.reciprocal,
    torch.round,
    torch.rsqrt,
    torch.sigmoid,
    torch.sign,
    torch.sin,
    torch.sinh,
    torch.sqrt,
    torch.square,
    torch.tan,
    torch.tanh,
    torch.trunc,
]


@pytest.mark.parametrize("op", unary_ops_float)
def test_unary_ops_float(op):
    """Compare each float unary op natively and on the IPU (NaNs equal)."""
    torch.manual_seed(42)

    data = torch.randn([1, 2, 10, 10])
    tolerances = dict(atol=1e-03, rtol=1e-03, equal_nan=True)

    def check(native_out, poptorch_out):
        helpers.assert_allclose(expected=native_out,
                                actual=poptorch_out,
                                **tolerances)

    op_harness(op, [data], check, test_training=True)


@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize("exponent", [4.0, 3, 2.5])
def test_binary_pow(inplace, exponent):
    """Check pow with a scalar exponent, in both in-place and functional form."""
    torch.manual_seed(42)
    data = torch.randn([1, 2, 10, 200])

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out,
                                expected=native_out,
                                equal_nan=True)

    def op(x):
        if not inplace:
            return torch.pow(x, exponent)
        # Although inplace would work, the native and poptorch output will
        # naturally not match as the input is changed, so operate on a
        # fresh tensor instead.
        x = x + 0
        return x.pow_(exponent)

    op_harness(op, [data], check)


# Unary ops exercised on integer inputs.
unary_ops_int = [
    torch.bitwise_not,
]


@pytest.mark.parametrize("op", unary_ops_int)
def test_unary_ops_int(op):
    """Require exact equality between native and IPU results on integers."""
    torch.manual_seed(42)

    data = torch.randint(-1000, 1000, [1, 2, 10, 200])

    def check(native_out, poptorch_out):
        helpers.assert_allequal(actual=poptorch_out, expected=native_out)

    op_harness(op, [data], check)


# Unary ops exercised on boolean inputs.
unary_ops_bool = [
    torch.bitwise_not,
]


@pytest.mark.parametrize("op", unary_ops_bool)
def test_unary_ops_bool(op):
    """Require exact equality between native and IPU results on booleans."""
    torch.manual_seed(42)

    # Random 0/1 integers turned into a boolean mask.
    random_bits = torch.randint(2, [1, 2, 10, 200])
    mask = random_bits > 0

    def check(native_out, poptorch_out):
        helpers.assert_allequal(actual=poptorch_out, expected=native_out)

    op_harness(op, [mask], check)


# Parameterize torch.clamp unit tests for different supported overloads:
# both bounds, min only, max only, and bounds given with min > max.
clamp_inputs = [{
    "min": 0.2,
    "max": 0.8
}, {
    "min": 0.2
}, {
    "max": 0.8
}, {
    "min": 0.8,
    "max": 0.2
}]


@pytest.mark.parametrize("args", clamp_inputs)
def test_clamp(args):
    """Check Tensor.clamp for every bound combination in clamp_inputs."""
    torch.manual_seed(42)

    data = torch.randn([1, 2, 10, 10])

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    def clamp_op(t):
        return t.clamp(**args)

    op_harness(clamp_op, [data], check, test_training=True)


@pytest.mark.parametrize("args", clamp_inputs)
def test_clamp_(args):
    """Check the in-place Tensor.clamp_ for every entry of clamp_inputs."""
    torch.manual_seed(42)

    data = torch.randn([1, 2, 10, 10])

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    def inplace_clamp_op(t):
        return t.clamp_(**args)

    op_harness(inplace_clamp_op, [data], check, test_training=True)


@pytest.mark.parametrize("args", clamp_inputs)
def test_clamp_mul_exp(args):
    """Check clamp followed by a scaled exp on a float16 input.

    The IPU model receives the float16 tensor; the reference result is
    computed natively from a float32 copy of the same tensor.
    """
    torch.manual_seed(42)

    half_input = torch.randn([1, 2, 10, 10], dtype=torch.float16)

    class Model(torch.nn.Module):
        def forward(self, x):
            clamped = x.clamp(**args)
            return torch.exp(0.5 * clamped)

    model = Model()
    ipu_model = poptorch.inferenceModel(model)

    actual_out = ipu_model(half_input)
    expected_out = model(half_input.to(torch.float32))
    helpers.assert_allclose(actual=actual_out, expected=expected_out)


@pytest.mark.parametrize(
    "op",
    [torch.clamp_min, torch.clamp_min_, torch.clamp_max, torch.clamp_max_])
def test_clamp_min_max(op):
    """Check the one-sided clamp functions with a Python scalar bound."""
    torch.manual_seed(42)

    magnitude = 1
    data = torch.randn(1, 2, 10, 10) * magnitude
    bound = magnitude * 0.75

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    def bounded_op(t):
        return op(t, bound)

    op_harness(bounded_op, [data], check, test_training=True)


@pytest.mark.parametrize(
    "op",
    [torch.clamp_min, torch.clamp_min_, torch.clamp_max, torch.clamp_max_])
def test_clamp_min_max_tensor(op):
    """Check the one-sided clamp functions with a tensor bound."""
    torch.manual_seed(42)

    magnitude = 1
    data = torch.randn(1, 2, 10, 10) * magnitude

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    def bounded_op(t):
        # The bound tensor is created inside the op on every call.
        bound = torch.tensor(magnitude * 0.75)
        return op(t, bound)

    op_harness(bounded_op, [data], check, test_training=True)


# Bounds used to parametrize test_clamp_int: fractional and whole-number
# limits, each as both bounds, min only and max only.
clamp_int_inputs = [
    {
        "min": -4.5,
        "max": 5.5
    },
    {
        "min": -4.5
    },
    {
        "max": 5.5
    },
    {
        "min": -5,
        "max": 5
    },
    {
        "min": -5
    },
    {
        "max": 5
    },
]


@pytest.mark.parametrize("args", clamp_int_inputs)
def test_clamp_int(args):
    """Check torch.clamp on an integer tensor for each clamp_int_inputs entry."""
    torch.manual_seed(42)

    values = torch.randint(-100, 100, (100, ))

    class Model(torch.nn.Module):
        def forward(self, x):
            return torch.clamp(x, **args)

    model = Model()
    ipu_model = poptorch.inferenceModel(model)

    # IPU result first, then the native reference.
    actual = ipu_model(values)
    expected = model(values)
    helpers.assert_allequal(actual=actual, expected=expected)


# Binary ops exercised on a pair of float tensors.
binary_ops_float = [
    torch.add, torch.atan2, torch.div, torch.sub, torch.fmod,
    torch.floor_divide, torch.mul, torch.remainder, torch.true_divide
]


@pytest.mark.parametrize("op", binary_ops_float)
def test_binary_ops_float(op):
    """Compare each float binary op natively and on the IPU."""
    torch.manual_seed(42)

    lhs = torch.randn([1, 2, 5, 1]) * 100.0
    rhs = torch.randn([1, 2, 5, 1]) * 10.0
    tolerances = dict(atol=1e-05, rtol=1e-05, equal_nan=True)

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out,
                                expected=native_out,
                                **tolerances)

    op_harness(op, [lhs, rhs], check, test_training=True)


# Basic element-wise ops used to parametrize the scalar-constant edge cases.
binary_ops_basic_element_wise_float = [
    torch.add,
    torch.div,
    torch.sub,
    torch.mul,
]


@pytest.mark.parametrize("op", binary_ops_basic_element_wise_float)
def test_binary_ops_elementwise_edgecases(op):
    """Check element-wise ops where one operand is a Python scalar constant.

    Model.forward is reassigned on the class before each op_harness call, so
    each call must complete before the next reassignment. The second model
    input is accepted but ignored by every forward.
    """
    torch.manual_seed(42)
    input1 = torch.randn([1, 2, 10, 10])
    input2 = torch.randn([1])

    def assert_(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out,
                                expected=native_out,
                                atol=1e-04,
                                rtol=1e-04,
                                equal_nan=True)

    class Model(torch.nn.Module):
        def __init__(self, op):
            super().__init__()
            self.op = op

    # Float constant on RHS
    Model.forward = lambda self, x, _y: self.op(x, 4.0)
    op_harness(Model(op), [input1, input2], assert_, test_training=True)

    # Float constant on LHS
    Model.forward = lambda self, x, _y: self.op(2.5, x)
    op_harness(Model(op), [input1, input2], assert_, test_training=True)

    # Constant on RHS with a different type (int) to the float tensor.
    Model.forward = lambda self, x, _y: self.op(x, 4)
    op_harness(Model(op), [input1, input2], assert_, test_training=True)

    # Constant on LHS with a different type (int) to the float tensor.
    Model.forward = lambda self, x, _y: self.op(134, x)
    op_harness(Model(op), [input1, input2], assert_, test_training=True)


# Element-wise ops exercised with boolean operands.
binary_ops_basic_element_wise_bool = [
    torch.add,
    torch.mul,
]


@pytest.mark.parametrize("op", binary_ops_basic_element_wise_bool)
def test_binary_ops_elementwise_bools(op):
    """Check element-wise ops on bool tensors, alone and mixed with float/int.

    Model.forward is reassigned on the class before each op_harness call, so
    each call must complete before the next reassignment.
    """
    # The two inputs together cover all four combinations of False/True.
    input1 = torch.tensor([False, True, False, True])
    input2 = torch.tensor([False, False, True, True])

    def assert_(native_out, poptorch_out):
        helpers.assert_allequal(actual=poptorch_out, expected=native_out)

    class Model(torch.nn.Module):
        def __init__(self, op):
            super().__init__()
            self.op = op

    # Both bools
    Model.forward = lambda self, x, y: self.op(x, y)
    op_harness(Model(op), [input1, input2], assert_)

    # Float on LHS
    Model.forward = lambda self, x, y: self.op(x.to(torch.float) + 1.0, y)
    op_harness(Model(op), [input1, input2], assert_)

    # Float on RHS
    Model.forward = lambda self, x, y: self.op(x, y.to(torch.float) + 1.0)
    op_harness(Model(op), [input1, input2], assert_)

    # Int on LHS
    Model.forward = lambda self, x, y: self.op(x.to(torch.int) + 1, y)
    op_harness(Model(op), [input1, input2], assert_)

    # Int on RHS
    Model.forward = lambda self, x, y: self.op(x, y.to(torch.int) + 1)
    op_harness(Model(op), [input1, input2], assert_)


@pytest.mark.parametrize("op", [torch.fmod, torch.remainder])
def test_modulo_mixed_sign(op):
    """Check the modulo ops on operands with every sign combination."""
    dividend = torch.tensor([-4.3, 7.2, 5.0, 4.3, -7.2, 8.0])
    divisor = torch.tensor([2.1, -3.4, 8.0, -2.1, 3.4, 5.0])
    tolerances = dict(atol=1e-05, rtol=1e-05, equal_nan=True)

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out,
                                expected=native_out,
                                **tolerances)

    op_harness(op, [dividend, divisor], check)


def __and__(x, y):
    """Apply the `&` operator so it can be parametrized like a torch op."""
    result = x & y
    return result


def __or__(x, y):
    """Apply the `|` operator so it can be parametrized like a torch op."""
    result = x | y
    return result


def __xor__(x, y):
    """Apply the `^` operator so it can be parametrized like a torch op."""
    result = x ^ y
    return result


# Bitwise ops on integers: the torch functions plus the operator wrappers.
binary_op_int = [
    torch.bitwise_and, torch.bitwise_or, torch.bitwise_xor, __and__, __or__,
    __xor__
]


@pytest.mark.parametrize("op", binary_op_int)
def test_binary_int_ops(op):
    """Compare each bitwise op on int tensors natively and on the IPU."""
    lhs = torch.tensor([-4, 7, 5, 4, -7, 8], dtype=torch.int)
    rhs = torch.tensor([2, -3, 8, -2, 3, 5], dtype=torch.int)
    tolerances = dict(atol=1e-05, rtol=1e-05, equal_nan=True)

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out,
                                expected=native_out,
                                **tolerances)

    op_harness(op, [lhs, rhs], check)


# Poplar doesn't support binary ops on 8-bit integral types, but test we can
# pass the rest of them.
@pytest.mark.parametrize("dtype", [torch.int16, torch.int32, torch.int64])
def test_binary_int_op_types(dtype):
    """Check torch.bitwise_and for each parametrized integer dtype."""
    lhs = torch.tensor([-4, 7, 5, 4, -7, 8], dtype=dtype)
    rhs = torch.tensor([2, -3, 8, -2, 3, 5], dtype=dtype)
    tolerances = dict(atol=1e-05, rtol=1e-05, equal_nan=True)

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out,
                                expected=native_out,
                                **tolerances)

    op_harness(torch.bitwise_and, [lhs, rhs], check)


# Binary ops exercised on boolean tensors; the xor variants are disabled
# pending the referenced task.
binary_op_bool = [
    torch.bitwise_and,
    torch.bitwise_or,
    # torch.bitwise_xor, TODO(T43716)
    torch.logical_and,
    torch.logical_or,
    #torch.logical_xor TODO(T43716)
]


@pytest.mark.parametrize("op", binary_op_bool)
def test_binary_bool_ops(op):
    """Compare each boolean binary op natively and on the IPU."""
    # Boolean masks derived from the sign of fixed integer data.
    lhs = torch.tensor([-4, 7, 5, 4, -7, 8]) > 0
    rhs = torch.tensor([2, -3, 8, -2, 3, 5]) > 0
    tolerances = dict(atol=1e-05, rtol=1e-05, equal_nan=True)

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out,
                                expected=native_out,
                                **tolerances)

    op_harness(op, [lhs, rhs], check)


# These functions support API 1 - op(input)
# Used to parametrize test_reduction_ops_float; commented-out entries are
# not exercised.
reduction_ops_api1 = [
    torch.max,
    torch.min,
    torch.amax,
    torch.amin,
    torch.argmax,
    torch.argmin,
    # torch.dist,
    torch.mean,
    torch.median,
    # torch.mode,
    torch.linalg.norm,
    torch.prod,
    #torch.std, torch.std_mean,
    torch.sum,
    #torch.unique, torch.unique_consecutive,torch.var, torch.var_mean,
]

# These functions support API 2 - op(input,dim,keep_dim)
# Used to parametrize test_reduction_ops_float_api2; commented-out entries
# are not exercised.
reduction_ops_api2 = [
    torch.max,
    torch.min,
    torch.amax,
    torch.amin,
    torch.argmax,
    torch.argmin,
    # torch.dist,
    torch.mean,
    torch.median,
    # torch.mode,
    torch.linalg.norm,
    torch.prod,
    torch.logsumexp,  # logsumexp doesn't support API 1.
    #torch.std, torch.std_mean,
    torch.sum,
    #torch.unique, torch.unique_consecutive,torch.var, torch.var_mean,
]


@pytest.mark.parametrize("op", reduction_ops_api1)
def test_reduction_ops_float(op):
    """Check whole-tensor reductions (the op(input) form) on a float input."""
    torch.manual_seed(42)

    data = torch.randn([1, 2, 10, 10])

    def check(native_out, poptorch_out):
        # Bring the IPU result to the native result's shape before comparing.
        reshaped = poptorch_out.reshape(native_out.shape)
        if native_out.dtype != torch.float32:
            helpers.assert_allequal(actual=reshaped, expected=native_out)
            return
        helpers.assert_allclose(actual=reshaped,
                                expected=native_out,
                                atol=1e-05,
                                rtol=1e-05,
                                equal_nan=True)

    op_harness(op, [data], check, test_training=True)


@pytest.mark.parametrize("op", reduction_ops_api2)
@pytest.mark.parametrize("dim", range(4))
@pytest.mark.parametrize("keepdim", [False, True])
def test_reduction_ops_float_api2(op, dim, keepdim):
    """Check reductions along one dimension (the op(input, dim, keepdim) form)."""
    torch.manual_seed(42)

    data = torch.randn([1, 2, 10, 10])

    def operation(t):
        return op(t, dim=dim, keepdim=keepdim)

    # Whether op returns both values and indices with API 2.
    returns_tuple = op in (torch.max, torch.min, torch.median)

    def check(native_out, poptorch_out):
        if returns_tuple:
            helpers.assert_allclose(actual=poptorch_out[0],
                                    expected=native_out.values)
            helpers.assert_allequal(actual=poptorch_out[1].to(torch.int64),
                                    expected=native_out.indices)
            return
        if native_out.dtype == torch.float32:
            helpers.assert_allclose(actual=poptorch_out, expected=native_out)
            return
        if torch.numel(native_out) > 1:
            # Work around not returning longs from popart.
            poptorch_out = poptorch_out.to(torch.int64)
        helpers.assert_allequal(actual=poptorch_out, expected=native_out)

    # This check must be repeated here because we need to check the op before we
    # wrap the function otherwise it won't match in the test harness
    test_training = op not in non_differentiable_ops
    if returns_tuple:
        out_fn = lambda result: result.values
    else:
        out_fn = None
    op_harness(operation, [data],
               check,
               test_training=test_training,
               out_fn=out_fn)


@pytest.mark.parametrize("op", [torch.min, torch.max])
@pytest.mark.parametrize("dim", range(3))
@pytest.mark.parametrize("keepdim", [False, True])
def test_minmax_tuple_out(op, dim, keepdim):
    """Check min/max along a dimension return matching tuples on both sides."""
    torch.manual_seed(42)

    data = torch.randn([1, 2, 10, 10])

    def operation(t):
        return op(t, dim=dim, keepdim=keepdim)

    def check(native_out, poptorch_out):
        both_tuples = isinstance(native_out, tuple) and isinstance(
            poptorch_out, tuple)
        assert both_tuples
        assert len(native_out) == len(poptorch_out)
        # Compare element by element.
        for position, native in enumerate(native_out):
            helpers.assert_allclose(actual=poptorch_out[position],
                                    expected=native)

    op_harness(operation, [data],
               check,
               test_training=True,
               out_fn=lambda result: result.values)


# Orders passed as `ord` to torch.linalg.norm in test_norm_p_values.
norm_pvals = [
    'fro',
    float('inf'),
    float('-inf'),
    1,
    1.0,
    -1,
    # 2, 2.0, -2, 'nuc' Unsupported
]


@pytest.mark.parametrize("p", norm_pvals)
def test_norm_p_values(p):
    """Check torch.linalg.norm on a 2D input for each order in norm_pvals."""
    torch.manual_seed(42)
    matrix = torch.randn([2, 10])

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    def norm_op(t):
        return torch.linalg.norm(t, ord=p)

    op_harness(norm_op, [matrix], check, test_training=True)


def test_norm_dtype():
    """Check torch.linalg.norm when an explicit dtype argument is passed."""
    torch.manual_seed(42)
    matrix = torch.randn([2, 10])

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    def norm_op(t):
        return torch.linalg.norm(t, dtype=torch.float, ord="fro")

    op_harness(norm_op, [matrix], check, test_training=True)


# Comparison ops used to parametrize test_compare_operations. Commented-out
# entries are not exercised; the trailing notes give the recorded reason.
comparison_ops = [
    # torch.allclose,     # Not supported in trace, seems to get optimized out.
    # torch.argsort,     # Not in Onnx. TODO(T23319)
    torch.eq,
    # torch.equal,       # Not supported as the return of trace in JIT.
    torch.ge,
    torch.gt,
    # torch.kthvalue,     # Not in Onnx.
    torch.le,
    torch.lt,
    torch.max,
    torch.min,
    torch.ne,
]


@pytest.mark.parametrize("op", comparison_ops)
def test_compare_operations(op):
    """Check comparison ops tensor-vs-tensor and, where valid, tensor-vs-scalar."""
    torch.manual_seed(42)

    lhs = torch.randn([1, 2, 10, 200])
    rhs = torch.randn([1, 2, 10, 200])

    equal_positions = torch.randint(0, 200, [30])

    # Make a few of the indices equal.
    for position in equal_positions:
        lhs[0][0][0][position] = rhs[0][0][0][position]

    def check(native_out, poptorch_out):
        helpers.assert_allequal(actual=poptorch_out, expected=native_out)

    op_harness(op, [lhs, rhs], check)

    # torch.min / torch.max are excluded from the scalar-constant variant.
    if op in (torch.min, torch.max):
        return

    def constant_rhs(x):
        return op(x, 0.34)

    op_harness(constant_rhs, [lhs], check)


# Unary predicates for special float values.
comparison_unity_nan_inf_ops = [
    # torch.isfinite, torch.isinf,  # Not in Onnx
    torch.isnan,
]


@pytest.mark.parametrize("op", comparison_unity_nan_inf_ops)
def test_compare_unity_nan_inf_ops(op):
    """Check the predicate on a mix of finite, infinite and NaN values."""
    torch.manual_seed(42)

    special_values = [
        1.0,
        float('inf'),
        2.0,
        float('-inf'),
        float('nan'),
        float('-nan'),
        13.0,
    ]
    data = torch.tensor(special_values)

    def check(native_out, poptorch_out):
        helpers.assert_allequal(actual=poptorch_out, expected=native_out)

    op_harness(op, [data], check)


# Reductions called with the tensor as their only argument.
comparison_unity = [torch.max, torch.min]


@pytest.mark.parametrize("op", comparison_unity)
def test_compare_unity_operations(op):
    """Check torch.max / torch.min called with only the input tensor."""
    torch.manual_seed(42)
    data = torch.randn([1, 2, 10, 10])

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    def single_arg_op(t):
        return op(t)

    op_harness(single_arg_op, [data], check, test_training=True)


@pytest.mark.parametrize("largest", [True, False])
def test_topk(largest):
    """Check torch.topk values and indices along the last dimension."""
    torch.manual_seed(42)
    data = torch.randn([1, 2, 10, 10])

    def topk_op(t):
        return torch.topk(t, k=10, dim=-1, largest=largest)

    def check(native_out, poptorch_out):
        # Values are compared with tolerance, indices exactly.
        helpers.assert_allclose(actual=poptorch_out[0],
                                expected=native_out.values)
        helpers.assert_allequal(actual=poptorch_out[1],
                                expected=native_out.indices)

    op_harness(topk_op, [data],
               check,
               test_training=True,
               out_fn=lambda result: result.values)


@pytest.mark.parametrize("shape", [(17, 4), (18, 23, 5)])
@pytest.mark.parametrize("descending", [True, False])
@unittest.mock.patch.dict("os.environ", helpers.disableSmallModel())
def test_sort(shape, descending):
    """Check torch.sort values and indices for 2D and 3D inputs."""
    torch.manual_seed(42)
    data = torch.randn(*shape)

    def sort_op(t):
        return torch.sort(t, descending=descending)

    def check(native_out, poptorch_out):
        # Values are compared with tolerance, indices exactly.
        helpers.assert_allclose(actual=poptorch_out[0],
                                expected=native_out.values)
        helpers.assert_allequal(actual=poptorch_out[1],
                                expected=native_out.indices)

    op_harness(sort_op, [data],
               check,
               test_training=True,
               out_fn=lambda result: result.values)


@pytest.mark.parametrize("descending", [True, False])
@unittest.mock.patch.dict("os.environ", helpers.disableSmallModel())
def test_sort_stable(descending):
    """Check torch.sort with stable=True on rows containing repeated values."""
    torch.manual_seed(42)
    rows = [[2.0, 2.0, 1.0, 10.0, 11.0], [2.0, 15.0, 15.0, 10.0, 11.0]]
    data = torch.tensor(rows)

    def stable_sort_op(t):
        return torch.sort(t, descending=descending, stable=True)

    def check(native_out, poptorch_out):
        # Values are compared with tolerance, indices exactly.
        helpers.assert_allclose(actual=poptorch_out[0],
                                expected=native_out.values)
        helpers.assert_allequal(actual=poptorch_out[1],
                                expected=native_out.indices)

    op_harness(stable_sort_op, [data],
               check,
               test_training=True,
               out_fn=lambda result: result.values)


def test_bincount():
    """Check torch.bincount with an explicit constant minlength."""
    torch.manual_seed(42)
    input_size = 7
    values = torch.randint(0, 8, (input_size, ), dtype=torch.int64)

    def check(native_out, poptorch_out):
        helpers.assert_allequal(actual=poptorch_out, expected=native_out)

    def bincount_op(t):
        return torch.bincount(t, minlength=input_size + 1)

    op_harness(bincount_op, [values], check, test_training=False)


def test_bincount_error():
    """Check torch.bincount without minlength raises the expected error."""
    torch.manual_seed(42)
    input_size = 7
    values = torch.randint(0, 8, (input_size, ), dtype=torch.int64)

    def check(native_out, poptorch_out):
        helpers.assert_allequal(actual=poptorch_out, expected=native_out)

    def bincount_op(t):
        return torch.bincount(t)

    expected_message = (
        "Bincount `minlength` must be specified and must be a constant. "
        "On the IPU MK2 platform the minimum length is also the "
        "maximum length")

    with pytest.raises(poptorch.poptorch_core.Error,
                       match=expected_message):
        op_harness(bincount_op, [values], check, test_training=False)


def test_bincount_weights():
    """Check torch.bincount when a weights tensor is supplied."""
    torch.manual_seed(42)
    input_size = 7
    values = torch.randint(0, 8, (input_size, ), dtype=torch.int64)

    def check(native_out, poptorch_out):
        helpers.assert_allequal(actual=poptorch_out, expected=native_out)

    def weighted_bincount_op(t):
        # One evenly spaced weight in [0, 1] per input element.
        weights = torch.linspace(0, 1, steps=input_size)
        return torch.bincount(t, weights, minlength=input_size + 1)

    op_harness(weighted_bincount_op, [values], check, test_training=False)


# Dtypes used to parametrize the constant-array tests.
types = [torch.float32, torch.int32]


@pytest.mark.parametrize("ty", types)
def test_constant_arrays(ty):
    """Check subtracting a constant tensor that is created inside the op."""
    torch.manual_seed(42)

    data = torch.randn([10]).to(ty)

    def check(native_out, poptorch_out):
        helpers.assert_allequal(actual=poptorch_out, expected=native_out)

    def subtract_constant(t):
        constant_values = [1, -2, -3, 4, 5, 6, 7, -8, 9, -10]
        constant_tensor = torch.tensor(constant_values, dtype=ty)
        return torch.sub(t, constant_tensor)

    op_harness(subtract_constant, [data], check)


@pytest.mark.parametrize("ty", types)
def test_big_constant_arrays_sliced(ty):
    """Check multiplying by one row sliced out of a large constant tensor.

    The constant has three rows of values, but only its first row is used in
    the multiplication.
    """
    torch.manual_seed(42)

    # Single-element input, multiplied against the whole first row.
    input = torch.randn([1]).to(ty)

    def operation(x):
        big_array = torch.tensor(
            [[
                155, 229, 322, 453, 655, 888, 1128, 1694, 2036, 2502, 3089,
                3858, 4636, 5883, 7375, 9172, 10149, 12462, 15113, 17660,
                21157, 24747, 27980, 31506, 35713, 41035, 47021, 43, 59138,
                63927, 69176, 74386, 80589, 86498, 92472, 97689, 45, -424, 5,
                6, 435, 124632, 128948, 132547, 135586, 42, 5, 147577, 5
            ],
             [
                 2, 1, 1, 3, 45, 46, 46, 83, 149, 160, 276, 414, 523, 589, 622,
                 724, 724, 1045, 1045, 1439, 24, 2335, 2749, 2941, 4025, 4440,
                 4440, 24, 7024, 7024, 8326, 9362, 10361, 10950, 12384, 13030,
                 -8, 324, 425, 67, -245, -2425, 21815, 22837, 24392, 324, 234,
                 2435, 4325
             ],
             [
                 3, 7, 10, 12, 17, 21, 29, 34, 52, 79, 107, 148, 197, 233, 366,
                 463, 631, 827, -2344, -2, 1441, 1809, 2158, 2503, 2978, 3405,
                 4032, -324, 5664, 45, 53, -25, 8215, 9134, 10023, 10779,
                 -2345, 4, 13155, 5, 98754, 143535, 245232, 16523, 17127, 2,
                 42, 5, 19468
             ]],
            dtype=ty)
        # Only the first row of the constant takes part in the result.
        return x * big_array[0]

    def assert_(native_out, poptorch_out):
        helpers.assert_allequal(actual=poptorch_out, expected=native_out)

    op_harness(operation, [input], assert_)


# Shape triples (tensor0, tensor1, tensor2) for addcdiv, chosen to make sure
# broadcasting works.
broadcastable_shapes = [
    ((3, 1), (3, 1), (3, 1)),
    ((1, 3), (3, 1), (1, 3)),
    ((5, 3), (5, 1), (1, 3)),
    ((1, ), (3, 1), (2, )),
]


@pytest.mark.parametrize("shapes", broadcastable_shapes)
@pytest.mark.parametrize("scale", [0.35, 4.91, 12.0, -0.53, -3.45, -9.0, 0.0])
def test_addcdiv(shapes, scale):
    """Check torch.addcdiv for each shape triple and each `value` scale."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def forward(self, tensor0, tensor1, tensor2):
            return torch.addcdiv(tensor0, tensor1, tensor2, value=scale)

    # Created in order so the random stream is consumed deterministically.
    operands = [torch.randn(shapes[i]) for i in range(3)]

    model = Model()
    native_out = model(*operands)

    poptorch_model = poptorch.inferenceModel(model)
    poptorch_out = poptorch_model(*operands)

    helpers.assert_allclose(actual=poptorch_out, expected=native_out)


# Input shapes for torch.cross; each contains at least one dimension of 3.
cross_shapes = [(3, 4, 5, 6), (4, 3, 5, 6), (4, 5, 3, 6), (4, 5, 6, 3),
                (6, 3, 3, 5)]


@pytest.mark.parametrize("shape", cross_shapes)
def test_cross_shape(shape):
    """Check torch.cross without an explicit dim for each input shape."""
    torch.manual_seed(42)

    lhs = torch.randn(shape)
    rhs = torch.randn(shape)

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    op_harness(torch.cross, [lhs, rhs], check, test_training=True)


@pytest.mark.parametrize("axis", range(0, 4))
def test_cross_axis(axis):
    """Check torch.cross along each axis of a 3x3x3x3 input."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self, axis):
            super().__init__()
            self.axis = axis

        def forward(self, x, y):
            return torch.cross(x, y, self.axis)

    cube = (3, 3, 3, 3)
    lhs = torch.randn(*cube)
    rhs = torch.randn(*cube)

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    op_harness(Model(axis), [lhs, rhs], check, test_training=True)


@pytest.mark.parametrize(
    "params",
    [
        # dims?, unbiased
        (
            False, ),
        ([0, 1, -1], True)
    ])
@pytest.mark.parametrize(
    "op", [torch.var, torch.var_mean, torch.std, torch.std_mean])
def test_var_std(op, params):
    """Check the var/std family with the positional arguments in `params`."""
    torch.manual_seed(42)

    data = torch.randn(3, 4, 5)

    def model(t):
        # `params` is unpacked as the positional arguments after the input.
        return op(t, *params)

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    op_harness(model, [data], check)


@pytest.mark.parametrize("axis", range(0, 4))
@pytest.mark.parametrize("descending", [True, False])
def test_argsort(axis, descending):
    """Check torch.argsort along each axis in both sort directions."""
    torch.manual_seed(42)
    data = torch.randn([3, 4, 5, 5])

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    def argsort_op(t):
        return torch.argsort(t, dim=axis, descending=descending)

    op_harness(argsort_op, [data], check)


def test_reciprocal_intergral_input():
    """Check dividing a Python number by a value derived from an integer tensor.

    The op returns both the ratio and the ratio multiplied by the integer
    input.
    """
    torch.manual_seed(42)
    sizes = torch.randint(256, size=(640, 480))

    def operation(original_sizes):
        image_size = 896

        largest = torch.amax(original_sizes).unsqueeze(axis=-1)
        ratio_image_size = image_size / largest
        multiplication = ratio_image_size * original_sizes

        return ratio_image_size, multiplication

    def check(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    op_harness(operation, [sizes], check)


================================================
FILE: tests/misc_nn_layers_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import torch
import torch.nn.functional as F
import pytest
import helpers
import poptorch

# Linears
# torch.nn.Identity, torch.nn.Linear, torch.nn.Bilinear,

# Dropouts
# torch.nn.Dropout, torch.nn.Dropout2d, torch.nn.Dropout3d, torch.nn.AlphaDropout,

# Sparse
# torch.nn.Embedding, torch.nn.Embedding.from_pretrained, torch.nn.EmbeddingBag, torch.nn.EmbeddingBag.from_pretrained,

include_bias = [True, False]


def op_harness(op, inputs, inference_test_fn=None):
    """Run `op` natively and via poptorch.trainingModel, then compare.

    `op` is wrapped in `helpers.ModelWithWeights` using the shape of the
    first input. `inference_test_fn(native_out, poptorch_out)` checks the
    outputs and defaults to `helpers.assert_allclose`. Finally the PopTorch
    model's weights are asserted to have changed.
    """
    if inference_test_fn is None:

        def inference_test_fn(native_out, poptorch_out):
            return helpers.assert_allclose(expected=native_out,
                                           actual=poptorch_out)

    model = helpers.ModelWithWeights(op, inputs[0].shape)
    packed_inputs = tuple(inputs)

    # Run on CPU.
    native_out, _ = model(packed_inputs)

    # Run on IPU, seeding it from torch's initial seed.
    opts = poptorch.Options()
    opts.randomSeed(torch.initial_seed())
    poptorch_model = poptorch.trainingModel(model, options=opts)
    poptorch_out, _ = poptorch_model(tuple(inputs))

    # Inference test - check outputs
    inference_test_fn(native_out, poptorch_out)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()


@pytest.mark.parametrize("scale_factor", [2, 3.5, 5.00001, 5.12498])
@pytest.mark.parametrize("input_shape", [(1, 2, 8), (2, 2, 2, 8),
                                         (2, 3, 4, 2, 8)])
def test_upsample_nearest(scale_factor, input_shape):
    """Check nearest-mode Upsample for 3D, 4D and 5D inputs."""
    torch.manual_seed(42)
    upsample = torch.nn.Upsample(scale_factor=scale_factor, mode="nearest")
    data = torch.randn(*input_shape)
    op_harness(upsample, [data])


def test_downsample_nearest():
    """Resample in 'nearest' mode with a scale factor below one."""
    torch.manual_seed(42)
    sample = torch.randn(1, 2, 14, 14)
    # test case from T44610
    downsample = torch.nn.Upsample(scale_factor=0.435714, mode="nearest")
    op_harness(downsample, [sample])


# TODO(T43375): replace scale factor 5 with 3.5
@pytest.mark.parametrize("scale_factor", [2, 5])
@pytest.mark.parametrize("input_shape", [(1, 2, 3, 4), (2, 2, 2, 8)])
@pytest.mark.parametrize("align_corners", [True, False])
def test_upsample_bilinear_factor(scale_factor, input_shape, align_corners):
    """Bilinear upsampling configured through a scale factor."""
    torch.manual_seed(42)
    upsample_kwargs = {
        "scale_factor": scale_factor,
        "mode": "bilinear",
        "align_corners": align_corners,
    }
    upsample = torch.nn.Upsample(**upsample_kwargs)
    sample = torch.randn(*input_shape)
    op_harness(upsample, [sample])


@pytest.mark.parametrize("shapes", [[(1, 2, 3, 4),
                                     (6, 8)], [(2, 2, 2, 8), (7, 28)]])
@pytest.mark.parametrize("align_corners", [True, False])
def test_upsample_bilinear_factor_shapes(shapes, align_corners):
    """Bilinear upsampling configured through an explicit output size."""
    input_shape, output_size = shapes
    torch.manual_seed(42)
    upsample = torch.nn.Upsample(size=output_size,
                                 mode="bilinear",
                                 align_corners=align_corners)
    sample = torch.randn(*input_shape)
    op_harness(upsample, [sample])


@pytest.mark.parametrize("shape", [(2, 2, 14, 14)])
def test_upsample_bicubic(shape):
    """Bicubic resampling in an inference model must match the native run."""
    torch.manual_seed(42)
    resampler = torch.nn.Upsample(scale_factor=0.4357, mode='bicubic')
    sample = torch.randn(*shape)

    # Native reference.
    expected = resampler(sample)

    # PopTorch inference run.
    ipu_resampler = poptorch.inferenceModel(resampler)
    actual = ipu_resampler(sample)

    helpers.assert_allclose(expected=expected, actual=actual)


@pytest.mark.parametrize("mode, input_shape", [("linear", (1, 2, 3)),
                                               ("trilinear", (1, 2, 3, 4, 5))])
def test_unsupported_upsample(mode, input_shape):
    """Upsample modes rejected by PopTorch must raise ``poptorch.Error``."""
    torch.manual_seed(42)
    upsample = torch.nn.Upsample(scale_factor=2, mode=mode)
    sample = torch.randn(*input_shape)

    # The error is expected when the model is run through PopTorch.
    ipu_model = poptorch.inferenceModel(upsample)
    with pytest.raises(poptorch.Error, match="only 'nearest' is supported"):
        ipu_model(sample)


def test_linear():
    """A Linear layer must give matching native and PopTorch inference."""
    torch.manual_seed(42)
    layer = torch.nn.Linear(20, 30)
    batch = torch.randn(128, 20)

    # Native reference.
    expected = layer(batch)

    # PopTorch inference run.
    ipu_layer = poptorch.inferenceModel(layer)
    actual = ipu_layer(batch)

    assert expected.size() == actual.size()
    helpers.assert_allclose(expected=expected, actual=actual)


@pytest.mark.parametrize("include_bias", include_bias)
@pytest.mark.parametrize("input_feature_shape", [(), (3, 4)])
def test_bilinear(include_bias, input_feature_shape):
    """Bilinear layer with and without bias, with optional extra dims."""
    torch.manual_seed(42)
    bilinear = torch.nn.Bilinear(10, 20, 30, bias=include_bias)
    leading_dims = (8, *input_feature_shape)
    first = torch.randn(*leading_dims, 10)
    second = torch.randn(*leading_dims, 20)
    op_harness(bilinear, [first, second])


def test_identity():
    """Run torch.nn.Identity, built with extra arguments, via the harness."""
    torch.manual_seed(42)
    identity = torch.nn.Identity(20, 30, 40)
    sample = torch.randn(128, 20)
    op_harness(identity, [sample])


# Dropout module classes used to parametrize the dropout tests below.
dropout_ops = [torch.nn.Dropout, torch.nn.Dropout2d, torch.nn.Dropout3d]


@pytest.mark.parametrize("dropout_op", dropout_ops)
def test_dropout_inference(dropout_op):
    """In eval mode every dropout variant must match the native output."""
    torch.manual_seed(42)
    layer = dropout_op()
    layer.eval()

    sample = torch.randn(128, 20)

    # Native reference.
    expected = layer(sample)

    # PopTorch inference run.
    ipu_layer = poptorch.inferenceModel(layer)
    actual = ipu_layer(sample)

    helpers.assert_allequal(
        expected=expected,
        actual=actual,
        msg=f"{dropout_op.__name__} in inference session should equal identity."
    )


@pytest.mark.parametrize("dropout_op", dropout_ops)
def test_dropout_eval_during_training(dropout_op):
    """A dropout layer in eval mode must act as identity in a training model."""
    torch.manual_seed(42)
    eval_dropout = dropout_op()
    eval_dropout.eval()

    x = torch.randn(128, 20)

    # The model is a single dropout operation; the dummy parameter is there
    # for the optimizer.
    eval_dropout.register_parameter('param',
                                    torch.nn.Parameter(torch.empty(10)))
    native_out = eval_dropout(x)

    # Wrap the dropout with a loss so it can be used as a training model.
    # The loss value itself is irrelevant and ignored.
    class ModelWithLoss(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.dropout = eval_dropout
            self.loss = torch.nn.L1Loss()

        def forward(self, data, target):
            out = self.dropout(data)
            return out, self.loss(out, target)

    poptorch_model = poptorch.trainingModel(ModelWithLoss())
    poptorch_out, _ = poptorch_model(x, torch.zeros_like(x))

    assert native_out.size() == poptorch_out.size()
    helpers.assert_allequal(expected=x,
                            actual=poptorch_out,
                            msg=f"{dropout_op.__name__} should equal identity.")


@pytest.mark.ipuHardwareRequired
def test_dropout_training():
    """The fraction of elements dropped must be close to the dropout ratio."""
    torch.manual_seed(42)
    drop_ratio = 0.8
    dropout_op = torch.nn.Dropout(drop_ratio)

    # Input size needs to be large enough for convergence to expected dropout ratio
    x = torch.ones([100, 4, 3], dtype=torch.float)

    def check_ratio(_, poptorch_out):
        # The native output is ignored: only the fraction of zeroed elements
        # in the PopTorch output is compared with the requested ratio.
        dropped = poptorch_out == 0
        actual_ratio = x[dropped].sum() / x.numel()
        helpers.assert_allclose(actual=actual_ratio,
                                expected=drop_ratio,
                                rtol=0.01,
                                atol=0.01)

    op_harness(dropout_op, [x], check_ratio)


@pytest.mark.ipuHardwareRequired
def test_dropout2d_training():
    """The fraction of fully zeroed channels must be close to the ratio."""
    torch.manual_seed(42)
    drop_ratio = 0.8
    dropout_op = torch.nn.Dropout2d(drop_ratio)

    # Input size needs to be large enough for convergence to expected dropout ratio
    batch, channels = 30, 30
    num_channels = torch.as_tensor(batch * channels, dtype=torch.float)
    x = torch.ones([batch, channels, 2, 2], dtype=torch.float)

    def check_ratio(_, poptorch_out):
        # A channel counts as dropped when all of its elements are zero.
        zeroed = poptorch_out == 0
        channel_mask = zeroed.all(-1).all(-1)
        actual_ratio = channel_mask.sum() / num_channels
        helpers.assert_allclose(actual=actual_ratio,
                                expected=drop_ratio,
                                rtol=0.01,
                                atol=0.01)

    op_harness(dropout_op, [x], check_ratio)


@pytest.mark.ipuHardwareRequired
def test_dropout3d_training():
    """The fraction of fully zeroed channels must be close to the ratio."""
    torch.manual_seed(42)
    drop_ratio = 0.6
    dropout_op = torch.nn.Dropout3d(drop_ratio)

    # Input size needs to be large enough for convergence to expected dropout ratio
    batch, channels = 30, 30
    num_channels = torch.as_tensor(batch * channels, dtype=torch.float)
    x = torch.ones([batch, channels, 2, 2, 1], dtype=torch.float)

    def check_ratio(_, poptorch_out):
        # A channel counts as dropped when all of its elements are zero.
        zeroed = poptorch_out == 0
        channel_mask = zeroed.all(-1).all(-1).all(-1)
        actual_ratio = channel_mask.sum() / num_channels
        helpers.assert_allclose(actual=actual_ratio,
                                expected=drop_ratio,
                                rtol=0.01,
                                atol=0.01)

    op_harness(dropout_op, [x], check_ratio)


def test_embedding():
    """Embedding lookup must be identical natively and through PopTorch."""
    torch.manual_seed(42)
    table = torch.nn.Embedding(10, 3)
    indices = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])

    # Native reference.
    expected = table(indices)

    # PopTorch inference run.
    ipu_table = poptorch.inferenceModel(table)
    actual = ipu_table(indices)

    assert expected.size() == actual.size()
    helpers.assert_allequal(expected=expected, actual=actual)


# pylint: disable=unsubscriptable-object
def test_embedding_padding_idx():
    """Output, loss and weight gradient must match with ``padding_idx`` set."""
    torch.manual_seed(0)

    class TestEmbedding(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # Re-seed so that every instance gets the same weights.
            torch.manual_seed(0)
            self.embedding = torch.nn.Embedding(10, 4, padding_idx=0)

        def forward(self, x):
            y = self.embedding(x)
            return y, poptorch.identity_loss(y.sum(), "none")

    native_model = TestEmbedding()
    # pylint:disable=unsubscriptable-object
    num_rows = native_model.embedding.weight.shape[0]
    x = torch.arange(0, num_rows)
    native_y, native_loss = native_model(x)
    native_loss.backward()
    native_grad = native_model.embedding.weight.grad

    # Anchor the weight gradient so it can be read back after the run.
    options = poptorch.Options()
    options.anchorTensor("grad_embedding", "Gradient___embedding.weight")
    pop_model = poptorch.trainingModel(TestEmbedding(), options=options)
    pop_y, pop_loss = pop_model(x)
    pop_grad = pop_model.getAnchoredTensor("grad_embedding")

    comparisons = ((pop_y, native_y), (pop_loss, native_loss),
                   (pop_grad, native_grad))
    for actual, expected in comparisons:
        helpers.assert_allclose(actual=actual, expected=expected)


@pytest.mark.parametrize("mode", ["max", "mean", "sum"])
def test_embedding_bag(mode):
    """EmbeddingBag inference must match the native run for each mode."""
    torch.manual_seed(0)
    bag = torch.nn.EmbeddingBag(10, 3, mode=mode)
    indices = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])

    expected = bag(indices)
    ipu_bag = poptorch.inferenceModel(bag)
    actual = ipu_bag(indices)

    helpers.assert_allclose(actual=actual, expected=expected)


def test_embedding_bag_per_sample_weights():
    """EmbeddingBag with ``per_sample_weights`` must match the native run."""

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # per_sample_weights are only supported for mode="sum"
            self.embedding_bag = torch.nn.EmbeddingBag(10, 3, mode="sum")

        def forward(self, x, p):
            return self.embedding_bag(x, per_sample_weights=p)

    torch.manual_seed(0)
    model = Model()
    indices = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])
    weights = torch.randn(2, 4)

    expected = model(indices, weights)
    ipu_model = poptorch.inferenceModel(model)
    actual = ipu_model(indices, weights)

    helpers.assert_allclose(actual=actual, expected=expected)


@pytest.mark.parametrize("mode", ["max", "mean", "sum"])
def test_embedding_bag_include_last_offset(mode):
    """``F.embedding_bag`` with ``include_last_offset=True`` must match."""

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.weight = torch.nn.Parameter(torch.Tensor(10, 3))
            torch.nn.init.normal_(self.weight)

        def forward(self, x):
            # One bag per row of x, with the total element count appended as
            # the final offset.
            bag_starts = torch.arange(0, x.numel(), x.size(1))
            end_offset = torch.tensor([x.numel()])
            offsets = torch.cat((bag_starts, end_offset))
            flat_indices = x.reshape(-1)
            return F.embedding_bag(flat_indices,
                                   self.weight,
                                   offsets=offsets,
                                   include_last_offset=True,
                                   mode=mode)

    torch.manual_seed(0)
    model = Model()
    indices = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])

    expected = model(indices)
    ipu_model = poptorch.inferenceModel(model)
    actual = ipu_model(indices)

    helpers.assert_allclose(actual=actual, expected=expected)


def test_pixel_shuffle():
    """Run PixelShuffle with an upscale factor of 3 through the harness."""
    torch.manual_seed(42)
    shuffle = torch.nn.PixelShuffle(3)
    sample = torch.randn(2, 18, 4, 4)
    op_harness(shuffle, [sample])


@pytest.mark.parametrize("params", [(2, 2, 1, 1, 1, 1), (3, 2, 1, 1, 1, 1),
                                    (2, 4, 1, 1, 1, 1), (2, 2, 2, 1, 1, 1),
                                    (2, 2, 1, 3, 1, 1), (2, 2, 1, 1, 3, 1),
                                    (2, 2, 1, 1, 1, 4)])
# Tests aten::im2col
def test_unfold(params):
    """Unfold followed by a Linear layer, for several kernel settings."""
    (kernel_size_x, kernel_size_y, dilation_x, dilation_y, stride_x,
     stride_y) = params
    padding = 2
    y_in, x_in = 19, 23

    def window_count(extent, kernel, dilation, stride):
        # Number of sliding-window positions along one spatial dimension.
        span = dilation * (kernel - 1) + 1
        return (extent + 2 * padding - span) // stride + 1

    torch.manual_seed(42)

    unfold_layer = torch.nn.Unfold(kernel_size=(kernel_size_y, kernel_size_x),
                                   dilation=(dilation_y, dilation_x),
                                   padding=padding,
                                   stride=(stride_y, stride_x))

    numel = (window_count(y_in, kernel_size_y, dilation_y, stride_y) *
             window_count(x_in, kernel_size_x, dilation_x, stride_x))

    # The Linear layer acts on the last dimension of the unfolded output.
    linear_layer = torch.nn.Linear(numel, numel)
    combined = torch.nn.Sequential(unfold_layer, linear_layer)

    op_harness(combined, [torch.rand(1, 1, y_in, x_in)])


@pytest.mark.parametrize("params", [(2, 2, 1, 1, 1, 1), (3, 2, 1, 1, 1, 1),
                                    (2, 4, 1, 1, 1, 1), (2, 2, 2, 1, 1, 1),
                                    (2, 2, 1, 3, 1, 1), (2, 2, 1, 1, 3, 1),
                                    (2, 2, 1, 1, 1, 3)])
# Tests aten::col2im
def test_fold(params):
    """Fold an unfolded tensor back to the original spatial size."""
    (kernel_size_x, kernel_size_y, dilation_x, dilation_y, stride_x,
     stride_y) = params

    torch.manual_seed(42)
    orig_input = torch.rand(2, 3, 11, 13)

    # Settings shared by the Unfold that produces the input and the Fold
    # under test.
    window_args = {
        "kernel_size": (kernel_size_y, kernel_size_x),
        "dilation": (dilation_y, dilation_x),
        "padding": (0, 0),
        "stride": (stride_y, stride_x)
    }
    unfolded = torch.nn.Unfold(**window_args)(orig_input)

    fold = torch.nn.Fold(output_size=orig_input.shape[2:], **window_args)
    op_harness(fold, [unfolded])


# Tests aten::col2im with padding
@pytest.mark.parametrize("stride_x", [1, 3])
@pytest.mark.parametrize("stride_y", [1, 3])
def test_fold_with_padding(stride_x, stride_y):
    """Fold an unfolded tensor when both operations use non-zero padding."""
    torch.manual_seed(42)

    orig_input = torch.rand(2, 2, 11, 13)

    # Settings shared by the Unfold that produces the input and the Fold
    # under test.
    window_args = {
        "kernel_size": (2, 2),
        "dilation": (1, 1),
        "padding": (2, 2),
        "stride": (stride_y, stride_x)
    }
    unfolded = torch.nn.Unfold(**window_args)(orig_input)

    # Since it is zero-padded, add a little to every value
    unfolded.add_(1.0)

    fold = torch.nn.Fold(output_size=orig_input.shape[2:], **window_args)
    op_harness(fold, [unfolded])


@pytest.mark.parametrize("dim", [0, 1, None])
def test_weight_norm(dim):
    """Check a weight-normalised Linear layer trains on its decomposed weights.

    The test compares the native and PopTorch forward outputs, inspects the
    tensor names reported by the PopTorch model, and verifies that
    ``weight_v`` and ``weight_g`` change after every training step.
    """

    torch.manual_seed(42)

    x = torch.randn(10)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            lin = torch.nn.Linear(10, 5)
            # Wrap the linear layer with a weight_norm - This should
            # decompose "weight" into "weight_v" and "weight_g"
            self.lin = torch.nn.utils.weight_norm(lin, "weight", dim)

        def forward(self, x):
            x = self.lin(x)
            return x, poptorch.identity_loss(x**2, reduction="sum")

    model = Model()
    # Snapshot the decomposed weights before any training step is run.
    weight_v_before = model.lin.weight_v.detach().clone()
    weight_g_before = model.lin.weight_g.detach().clone()

    native_out, _ = model(x)

    poptorch_model = poptorch.trainingModel(model)

    # First training step.
    poptorch_out, _ = poptorch_model(x)

    helpers.assert_allclose(expected=native_out, actual=poptorch_out)

    tensor_names = poptorch_model.getTensorNames()
    decomposed_tensors = ["weight_v", "weight_g"]

    # Check that both decomposed tensors exist in the graph
    assert all(f"lin.{t}" in tensor_names for t in decomposed_tensors)
    # Check that they also exist in the backward graph
    assert all(f"UpdatedVar___lin.{t}" in tensor_names
               for t in decomposed_tensors)

    # Ensure that the original weight tensor does NOT exist -
    # autograd should be performed with respect to the decomposed tensors
    # only
    assert "lin.weight" not in tensor_names
    assert "UpdatedVar___lin.weight" not in tensor_names

    n = 3
    # Run a few more times to ensure that the decomposed weights are being
    # updated each time
    for i in range(n):
        weight_v_after = poptorch_model.lin.weight_v.detach().clone()
        weight_g_after = poptorch_model.lin.weight_g.detach().clone()

        # Ensure the decomposed weights changed since the previous iteration
        assert not torch.allclose(weight_v_before, weight_v_after)
        assert not torch.allclose(weight_g_before, weight_g_after)

        # Prepare for the next iteration
        if i != n - 1:
            weight_v_before = weight_v_after
            weight_g_before = weight_g_after

            # Next training step; skipped after the final comparison.
            poptorch_model(x)


================================================
FILE: tests/misc_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import re
import pytest
import torch
import torch.nn as nn
import helpers
import poptorch


@helpers.overridePoptorchLogLevel()
def test_set_log_level():
    """Check which integer and string log levels ``setLogLevel`` accepts.

    Integers 0 to 4 and the six level names must be accepted. The integer 5
    and an unknown name must raise ``ValueError`` with the expected message.
    """
    for level in range(5):
        poptorch.setLogLevel(level)

    with pytest.raises(ValueError, match="Invalid log level integer"):
        poptorch.setLogLevel(5)

    for name in ("TRACE", "DEBUG", "INFO", "WARN", "ERR", "OFF"):
        poptorch.setLogLevel(name)

    err_str = ("Unknown log level: wibble. Valid values are DEBUG, ERR, INFO, "
               "OFF, TRACE and WARN")

    # pytest.raises treats ``match`` as a regular expression, so the message
    # is escaped to make the '.' match literally.
    with pytest.raises(ValueError, match=re.escape(err_str)):
        poptorch.setLogLevel("wibble")


@helpers.printCapfdOnExit
@helpers.overridePopartLogLevel()
def test_set_popart_log_level(capfd):
    """Check PopART log level validation and its effect on the log output."""
    set_level = poptorch._logging.setPopartLogLevel  # pylint: disable=protected-access

    # Log line patterns that must be absent after the last level set is
    # "WARN" and present after it is "TRACE".
    verbose_patterns = (
        r"popart:devicex \d+\.\d+ T:",
        r"popart:ir \d+\.\d+ D:",
        r"popart:ir \d+\.\d+ I:",
        r"popart:session \d+\.\d+ T:",
        r"popart:popart \d+\.\d+ T:",
    )

    # Only strings are allowed
    with pytest.raises(ValueError, match="Level must be one of"):
        set_level(0)

    # Only some strings are allowed
    with pytest.raises(ValueError, match="Level must be one of"):
        set_level("FOO")

    for level in ("DEBUG", "INFO", "WARN"):
        set_level(level)

    model = torch.nn.Linear(2, 2)

    inference_model = poptorch.inferenceModel(model)
    inference_model(torch.randn([2, 2]))

    log = helpers.LogChecker(capfd)
    for pattern in verbose_patterns:
        log.assert_no_matches(pattern)

    for level in ("ERR", "OFF", "TRACE"):
        set_level(level)

    inference_model = poptorch.inferenceModel(model)
    inference_model(torch.randn([2, 2]))

    log = helpers.LogChecker(capfd)
    for pattern in verbose_patterns:
        log.assert_matches(pattern)


def test_zero_size_tensor_error():
    """A zero-sized input must raise ``poptorch.Error`` naming its shape."""

    class Model(torch.nn.Module):
        def forward(self, x):
            # The operation doesn't matter, we just want to produce the
            # failure on an operation that works with zero-sized tensors
            # in native Torch
            return torch.nn.functional.interpolate(x, size=(10, 10))

    empty_batch = torch.randn(0, 2, 5, 5)
    ipu_model = poptorch.inferenceModel(Model())

    expected_error = (
        r"Zero-sized tensors are unsupported \(Got shape \[0, 2, 5, 5\]\)")
    with pytest.raises(poptorch.Error, match=expected_error):
        ipu_model(empty_batch)


def test_torch_backward_error():
    """Calling ``backward()`` on PopTorch outputs must raise an error."""
    x = torch.Tensor([5.0])
    identity_model = helpers.ModelWithWeights(lambda t: t, x.shape)
    training_model = poptorch.trainingModel(identity_model)
    poptorch_out, poptorch_loss = training_model((x, ))

    error_message = (
        r"backward\(\) cannot be called explicitly on "
        r"outputs of a PopTorch model. If you're using a trainingModel, "
        r"the backwards pass is performed automatically when invoking the "
        r"model. If you're using an inferenceModel, you should use a "
        r"trainingModel instead.")

    # Both the model output and the loss must refuse an explicit backward().
    for result in (poptorch_out, poptorch_loss):
        with pytest.raises(poptorch.Error, match=error_message):
            result.backward()


@pytest.mark.parametrize(
    "error_type", poptorch.poptorch_core.TestErrorType.__members__.values())
def test_generic_error_handling(error_type):
    """Every test error type must be catchable as ``poptorch.Error``."""
    with pytest.raises(poptorch.Error) as excinfo:
        poptorch.poptorch_core._throwTestError(error_type)  # pylint: disable=protected-access

    # The first exception argument must mention both throwing functions.
    first_arg = excinfo.value.args[0]
    assert "throwTestError::bottomLevel" in first_arg
    assert "throwTestError::topLevel" in first_arg


def test_specific_error_handling():
    """Check the exception class and attributes for specific test errors.

    Three error types are thrown in turn. For each one the test checks the
    Python exception class, the ``type`` attribute, that the backtrace is in
    ``location`` and that it is not in ``message``.
    """
    throw_test_error = poptorch.poptorch_core._throwTestError  # pylint: disable=protected-access
    error_types = poptorch.poptorch_core.TestErrorType
    frames = ("throwTestError::bottomLevel", "throwTestError::topLevel")

    # pytest.raises fails the test when nothing is raised, and lets any
    # exception of a different class propagate.
    with pytest.raises(poptorch.RecoverableError) as excinfo:
        throw_test_error(error_types.PoplarRecoverableFullReset)
    e = excinfo.value
    assert e.recovery_action == "FULL_RESET"
    for frame in frames:
        assert frame in e.location
    assert e.type == "poplar_recoverable_runtime_error"
    # Message shouldn't contain any backtrace
    for frame in frames:
        assert frame not in e.message

    with pytest.raises(poptorch.Error) as excinfo:
        throw_test_error(error_types.PoplarLinkError)
    e = excinfo.value
    # Make sure the backtrace was reset between the two exceptions
    for frame in frames:
        assert e.location.count(frame) == 1
    assert e.type == "poplar_link_error"
    # Message shouldn't contain any backtrace
    for frame in frames:
        assert frame not in e.message
    # Make sure the link error is added at the end of the error message
    assert "-lfoo not found" in e.message

    with pytest.raises(poptorch.UnrecoverableError) as excinfo:
        throw_test_error(error_types.PoplarUnrecoverable)
    e = excinfo.value
    # Make sure the backtrace was reset between the two exceptions
    for frame in frames:
        assert e.location.count(frame) == 1
    assert e.type == "poplar_unrecoverable_runtime_error"
    # Message shouldn't contain any backtrace
    for frame in frames:
        assert frame not in e.message


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
@helpers.overridePopartLogLevel("DEBUG")
def test_outline_attribute(capfd):
    """Check an outline attribute only reaches ops inside its scope.

    Two GroupNorm layers are run, only the first one inside a
    ``poptorch.Attribute`` block. The captured debug log is then searched
    for the attribute on each layer's op.
    """
    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.gn1 = torch.nn.GroupNorm(4, 8)
            self.gn2 = torch.nn.GroupNorm(2, 8)

        def forward(self, x):
            # NOTE(review): `__outline` is written inside a class body, so
            # Python's private-name mangling may apply to this keyword -
            # confirm before renaming the class or moving this call.
            with poptorch.Attribute(__outline={"layer": "embedding"}):
                x = self.gn1(x)
            return self.gn2(x)

    input = torch.randn(3, 8)

    poptorch_model = poptorch.inferenceModel(Model())

    poptorch_model(input)

    testlog = helpers.LogChecker(capfd)

    # Multi-line pattern: an op whose name starts with `op_name`, followed on
    # later lines by the op name again and then the outline attribute.
    get_regex = lambda op_name: (f'Op "{op_name}/.+", '
                                 r"[0-9]+ of type ai\.graphcore\."
                                 ".+:1"
                                 r"(?:\n.+)+"
                                 f"{op_name}"
                                 r".+(?:\n.+)+"
                                 "layer: layer:embedding")

    # Ensure the first group norm has the outline attribute
    testlog.assert_matches(get_regex("gn1"), per_line=False)

    # Ensure the second group norm doesn't have the attribute,
    # as it is outside the attribute scope
    testlog.assert_no_matches(get_regex("gn2"), per_line=False)

    it = testlog.createIterator()
    it.findNext("lowered to PopART")
    # Ensure none of the attributes key / values are actually lowered to PopART
    # (They should have been converted to attributes)
    it.assert_not_contains("Char")


# Note: the ipu models are not supported by poptorch.ConnectionType.Never
@pytest.mark.ipuHardwareRequired
def test_compile_without_ipu():
    """Compile a model with the connection type set to ``Never``."""

    class SimpleAdder(nn.Module):
        def forward(self, x, y):
            return x + y

    opts = poptorch.Options().connectionType(poptorch.ConnectionType.Never)
    inference_model = poptorch.inferenceModel(SimpleAdder(), opts)

    lhs = torch.tensor([1.])
    rhs = torch.tensor([2.])

    inference_model.compile(lhs, rhs)


def test_error_on_cpu_tensor():
    """Compiling a model that builds a CPU tensor in ``forward`` must fail."""

    class Model(nn.Module):
        def forward(self, x):
            return torch.index_select(x, 0, torch.LongTensor([1, 0]))

    inference_model = poptorch.inferenceModel(Model())

    t1 = torch.rand(4)
    expected_msg = ("Expected an IPU tensor but got tensor(device=cpu, "
                    "shape=[2], dtype=Long)")
    with pytest.raises(poptorch.Error, match=re.escape(expected_msg)):
        inference_model.compile(t1)


================================================
FILE: tests/multiconv_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
from io import StringIO
import json
import pytest
import torch
from torch import nn
import helpers
import poptorch


def getPopartMultiConvs(poptorch_model):
    """Return the ``MultiConv`` ops from the model's serialized PopART IR.

    The IR is parsed as JSON and must contain a ``"maingraph"`` entry. The
    matching ops are returned in the order they appear there.
    """
    serialized_ir = poptorch_model._debugGetPopartIR()  # pylint: disable=protected-access
    ir_as_json = json.loads(serialized_ir)
    assert "maingraph" in ir_as_json, "Expected maingraph in serialized IR."

    return [op for op in ir_as_json["maingraph"] if op["type"] == "MultiConv"]


def assert_contains_multiconv(poptorch_model, expected_num=1):
    """Assert the model's PopART IR holds ``expected_num`` MultiConv ops."""
    found = len(getPopartMultiConvs(poptorch_model))
    assert found == expected_num, (f"Wrong number of MultiConv ops.\n"
                                   f"   Expected : {expected_num}\n"
                                   f"   Actual   : {found}.")


@pytest.mark.parametrize("num_layers", [1, 2, 3])
def test_multiconv_basic(num_layers):
    """Stack MultiConv layers and compare native and PopTorch inference.

    Each layer runs two convolutions inside one ``poptorch.MultiConv`` block,
    so the compiled model must contain one MultiConv op per layer.
    """
    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.convA = nn.Conv2d(1, 1, 5)
            self.convB = nn.Conv2d(1, 1, 5, bias=False)

        def forward(self, x):
            with poptorch.MultiConv():
                a = self.convA(x)
                absx = torch.abs(x)
                b = self.convB(absx)
                return a + b

    # Seed before the layers are built so that the convolution weights, and
    # not only the input, are reproducible between runs.
    torch.manual_seed(0)
    m = torch.nn.Sequential(*(Model() for _ in range(num_layers)))
    sample = torch.randn(2, 1, 28, 28)

    native = m(sample)

    poptorch_model = poptorch.inferenceModel(m)
    poptorch_out = poptorch_model(sample)
    assert_contains_multiconv(poptorch_model, num_layers)

    # Compare the outputs one batch element at a time.
    for cpu, pop in zip(native, poptorch_out):
        helpers.assert_allclose(expected=cpu, actual=pop)


def multiconv_harness(multiconv):
    """Run two convolutions inside ``multiconv`` and compare with native.

    Args:
        multiconv: a configured ``poptorch.MultiConv`` object, used as the
            context manager around both convolutions.
    """
    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = nn.Conv2d(1, 10, 5)
            self.conv2 = nn.Conv2d(1, 10, 5)
            self.MultiConv = multiconv

        def forward(self, x):
            y = torch.pow(x, 2)

            with self.MultiConv:
                u = self.conv1(x)
                v = self.conv2(y)

            return u - v

    # Seed before the model is built so that the convolution weights, and
    # not only the input, are reproducible between runs.
    torch.manual_seed(0)
    m = Model()
    x = torch.randn(2, 1, 28, 28)

    native = m(x)
    poptorch_model = poptorch.inferenceModel(m)
    poptorch_out = poptorch_model(x)
    helpers.assert_allclose(expected=native, actual=poptorch_out)
    assert_contains_multiconv(poptorch_model)


def test_multiconv_options_broadcast():
    """Configure MultiConv with one value per option and run the harness."""
    multiconv = poptorch.MultiConv()
    multiconv = multiconv.availableMemoryProportions(0.8)
    multiconv = multiconv.partialsTypes(torch.float)
    multiconv = multiconv.planType(poptorch.MultiConvPlanType.Parallel)
    multiconv = multiconv.perConvReservedTiles(100)
    multiconv = multiconv.cycleBackOff(0.3)
    multiconv = multiconv.enableConvDithering(True)

    multiconv_harness(multiconv)


def test_multiconv_options_per_conv():
    """Configure MultiConv with two-element option values and run the harness."""
    partials_types = [torch.float, torch.float]

    multiconv = poptorch.MultiConv()
    multiconv = multiconv.availableMemoryProportions((0.8, 0.7))
    multiconv = multiconv.partialsTypes(partials_types)
    multiconv = multiconv.planType(poptorch.MultiConvPlanType.Parallel)
    multiconv = multiconv.perConvReservedTiles(120)
    multiconv = multiconv.cycleBackOff(0.4)
    multiconv = multiconv.enableConvDithering(True)

    multiconv_harness(multiconv)


def test_multiconv_layers():
    """Run a small network whose first two conv branches share a MultiConv.

    The PopTorch inference output must match the native output, and the
    compiled model must contain exactly one MultiConv op.
    """
    class Network(nn.Module):
        def __init__(self):
            super().__init__()

            self.layer1A = nn.Sequential(nn.Conv2d(1, 10, 5), nn.MaxPool2d(2),
                                         nn.ReLU())
            self.layer1B = nn.Sequential(nn.Conv2d(1, 10, 5), nn.MaxPool2d(2),
                                         nn.ReLU())
            self.layer2 = nn.Sequential(nn.Conv2d(10, 20, 5), nn.MaxPool2d(2),
                                        nn.ReLU())
            self.layer3 = nn.Linear(320, 256)
            self.layer3_act = nn.ReLU()
            self.layer4 = nn.Linear(256, 10)

            self.softmax = nn.LogSoftmax(1)

        def forward(self, x):
            with poptorch.MultiConv():
                absx = torch.abs(x)
                y = self.layer1A(absx)
                z = self.layer1B(x)
                x = y + z

            x = self.layer2(x)
            x = x.view(-1, 320)
            x = self.layer3_act(self.layer3(x))
            x = self.layer4(x)
            x = self.softmax(x)
            return x

    # Seed so that the weights and the input are reproducible between runs.
    torch.manual_seed(0)
    model = Network()
    # Run on CPU.
    sample = torch.randn(2, 1, 28, 28)
    native_out = model(sample)
    poptorch_model = poptorch.inferenceModel(model)
    poptorch_out = poptorch_model(sample)

    assert_contains_multiconv(poptorch_model)
    helpers.assert_allclose(actual=poptorch_out, expected=native_out)


def test_invalid_multiconv_nested():
    """Nesting one MultiConv block inside another must raise an error."""

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = nn.Conv2d(1, 10, 10)

        def forward(self, x):
            # Two contexts in one statement nest exactly like two
            # ``with`` blocks.
            with poptorch.MultiConv(), poptorch.MultiConv():
                return self.conv(x)

    poptorch_model = poptorch.inferenceModel(Model())

    with pytest.raises(poptorch.Error,
                       match="Nested poptorch.MultiConv is not supported"):
        poptorch_model(torch.zeros(2, 1, 32, 32))


def test_invalid_multiconv_empty():
    """A MultiConv block without any convolution must raise an error."""

    class Model(torch.nn.Module):
        def forward(self, x):
            with poptorch.MultiConv():
                return torch.pow(x, 2)

    poptorch_model = poptorch.inferenceModel(Model())

    with pytest.raises(poptorch.Error, match="Unexpected end_multi_conv"):
        poptorch_model(torch.ones(2, 2))


def test_invalid_multiconv_options():
    """String values for partials types and plan type must be rejected."""
    multiconv = poptorch.MultiConv()

    rejected = (
        (ValueError, "Invalid partials types", multiconv.partialsTypes,
         "half"),
        (AssertionError, "Invalid plan type", multiconv.planType, "parallel"),
    )
    for error_cls, message, setter, bad_value in rejected:
        with pytest.raises(error_cls, match=message):
            setter(bad_value)


================================================
FILE: tests/non_contiguous_tensors_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import torch
import helpers
import poptorch


class FiveAdder(torch.nn.Module):
    """Module returning the sum of its five inputs."""

    def forward(self, in_1, in_2, in_3, in_4, in_5):
        """Add the inputs left to right and return the total."""
        total = in_1 + in_2
        for term in (in_3, in_4, in_5):
            total = total + term
        return total


def test_non_contiguous():
    """Summing non-contiguous inputs gives the same result on IPU and CPU."""
    torch.manual_seed(23148)

    adder = FiveAdder()
    ipu_adder = poptorch.inferenceModel(adder)

    rows, cols = 1000, 40

    # Three ways of obtaining a non-contiguous tensor: a column slice, a
    # transpose and an expanded single-element tensor.
    sliced = torch.randn([rows, cols + 1])[:, 0:cols]
    transposed = torch.transpose(torch.randn([cols, rows]), 0, 1)
    expanded = torch.tensor([1.0]).expand([rows, cols])

    # Contiguous inputs: a freshly allocated tensor and the first slab of a
    # larger one.
    dense = torch.randn([rows, cols])
    first_slab = torch.randn([2, rows, cols])[0, :, :]

    for tensor in (sliced, transposed, expanded):
        assert not tensor.is_contiguous()
    for tensor in (dense, first_slab):
        assert tensor.is_contiguous()

    # Interleave contiguous and non-contiguous arguments.
    inputs = (sliced, dense, transposed, first_slab, expanded)
    native_out = adder(*inputs)
    poptorch_out = ipu_adder(*inputs)

    assert native_out.shape == (rows, cols)

    print(native_out)
    print(poptorch_out)

    helpers.assert_allclose(actual=poptorch_out, expected=native_out)


================================================
FILE: tests/norms_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import os  # pylint: disable=unused-import
import unittest.mock
from copy import deepcopy
import torch
import torch.optim as optim
import torch.nn as nn
import pytest
import helpers
import poptorch

# Norms
#'torch.nn.BatchNorm1d', 'torch.nn.BatchNorm2d', 'torch.nn.BatchNorm3d', 'torch.nn.GroupNorm', 'torch.nn.SyncBatchNorm', 'torch.nn.SyncBatchNorm.convert_sync_batchnorm',
# 'torch.nn.InstanceNorm1d', 'torch.nn.InstanceNorm2d', 'torch.nn.InstanceNorm3d', 'torch.nn.LayerNorm', 'torch.nn.LocalResponseNorm',

# Parametrization table for test_batchNorm: each tuple supplies, in order,
# the arguments named in the column header below.
batch_norm_params = [
    # Norm, affine, running_stats, training
    (nn.BatchNorm1d, False, False, False),
    (nn.BatchNorm2d, True, True, False),
]


@pytest.mark.parametrize("batch_norm, affine, running_stats, training",
                         batch_norm_params)
@unittest.mock.patch.dict("os.environ", helpers.disableSmallModel())
def test_batchNorm(batch_norm, affine, running_stats, training):
    """Compare the output of a batch norm layer on the IPU and on the CPU.

    When ``training`` is set the layer is wrapped in a training model and the
    weights are additionally checked to have changed.
    """
    torch.manual_seed(42)
    num_channels = 4

    # The 2d norm gets one extra trailing dimension, the 3d norm two.
    trailing_dims = {nn.BatchNorm2d: [6], nn.BatchNorm3d: [6, 7]}
    sample_shape = [3, num_channels, 5] + trailing_dims.get(batch_norm, [])
    sample = torch.randn(sample_shape)

    norm = batch_norm(num_channels,
                      affine=affine,
                      track_running_stats=running_stats)

    # Overwrite the running statistics with random values (variance clamped
    # to be at least 0.1).
    # pylint: disable=W0212
    norm._buffers["running_mean"] = torch.randn([num_channels])
    norm._buffers["running_var"] = torch.clamp(
        torch.randn([num_channels]) + 1.0, min=0.1)
    norm.train(training)

    cpu_model = helpers.ModelWithWeights(norm, sample.shape)

    # The IPU wrapper works on its own copy of the model.
    ipu_copy = deepcopy(cpu_model)
    if training:
        poptorch_model = poptorch.trainingModel(ipu_copy)
    else:
        poptorch_model = poptorch.inferenceModel(ipu_copy)

    # CPU reference first, then the IPU run.
    native_out, _ = cpu_model((sample, ))
    poptorch_out, _ = poptorch_model((sample, ))

    # Inference test - check outputs
    helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    # Training test - check weights changed
    if training:
        poptorch_model.assert_weights_changed()


def test_batchNorm_typing():
    """Run BatchNorm1d followed by an add on half-precision inputs.

    Nothing is asserted on the result: the test passes if the model runs.
    """
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bn = nn.BatchNorm1d(100)

        def forward(self, x, y):
            normalised = self.bn(x)
            return normalised + y

    ipu_model = poptorch.inferenceModel(Model())

    # Both operands are float16 tensors of the same shape.
    lhs = torch.randn(20, 100, dtype=torch.half)
    rhs = torch.randn(20, 100, dtype=torch.half)

    ipu_model(lhs, rhs)


def test_batchNorm_eval_during_training():
    """A batch norm in eval mode keeps its running stats in a training model.

    The output must match the CPU and the running mean / variance must be
    unchanged after one step of the training model.
    """
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bn = nn.BatchNorm1d(100)
            self.loss = torch.nn.MSELoss()

        def forward(self, x, target):
            y = self.bn(x)
            return y, self.loss(y, target)

    sample = torch.randn([16, 100])
    target = torch.randn([16, 100])

    model = Model()
    # Freeze every parameter and put only the batch norm in eval mode.
    for param in model.parameters():
        param.requires_grad = False
    model.bn.eval()

    # Snapshot of the statistics before anything runs.
    initial_mean = model.bn.running_mean.detach().clone()
    initial_var = model.bn.running_var.detach().clone()

    # Run pytorch native on CPU.
    native_out, _ = model(sample, target)

    # Run on IPU.
    ipu_model = poptorch.trainingModel(model)
    poptorch_out, _ = ipu_model(sample, target)
    # TODO: T38684
    # Implicit copy only happens when we touch the params so copy explicitly.
    ipu_model.copyWeightsToHost()

    helpers.assert_allclose(actual=poptorch_out, expected=native_out)
    helpers.assert_allequal(actual=model.bn.running_mean,
                            expected=initial_mean)
    helpers.assert_allequal(actual=model.bn.running_var,
                            expected=initial_var)


@pytest.mark.parametrize("norm_dim", range(4))
def test_layerNorm(norm_dim):
    """Check LayerNorm over the trailing dims starting at ``norm_dim``.

    Odd values of ``norm_dim`` enable the elementwise affine parameters.
    """
    torch.manual_seed(42)

    use_affine = norm_dim % 2 != 0

    sample = torch.randn([3, 2, 5, 2])
    normalized_shape = sample.shape[norm_dim:]
    layer_norm = nn.LayerNorm(normalized_shape,
                              elementwise_affine=use_affine)

    model = helpers.ModelWithWeights(layer_norm, sample.shape)

    # CPU reference is computed before the model is wrapped for the IPU.
    native_out, _ = model((sample, ))

    poptorch_model = poptorch.trainingModel(model)
    poptorch_out, _ = poptorch_model((sample, ))

    # Inference test - check outputs
    tolerance = {"atol": 1e-4, "rtol": 1e-4}
    helpers.assert_allclose(actual=poptorch_out,
                            expected=native_out,
                            **tolerance)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()


def test_layerNormPretrainedWeights():
    """LayerNorm inference matches the CPU before and after CPU training.

    The model is first compared with its initial weights, then trained for a
    few steps on the CPU and compared again through a new inference model.
    """
    torch.manual_seed(42)

    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = nn.Conv2d(5, 5, kernel_size=(1, 1))
            self.norm = nn.LayerNorm((5, 3, 10))

        def forward(self, x):
            x = self.conv(x)

            return self.norm(x)

    model = Model()
    sample = torch.randn([3, 5, 3, 10])

    # Marginally more leeway than the default tolerances.
    tolerance = {"rtol": 1e-4, "atol": 1e-6}

    # Initial weights: CPU reference first, then the IPU.
    cpu_initial = model(sample)
    ipu_initial = poptorch.inferenceModel(model)(sample)
    helpers.assert_allclose(actual=ipu_initial,
                            expected=cpu_initial,
                            **tolerance)

    # We aren't training to any real target, we just want to update the
    # beta/gamma parameters and check they still work in popart.
    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    all_ones = torch.ones([3, 5, 3, 10])

    model.train()
    for _ in range(10):
        outputs = model(sample)
        optimizer.zero_grad()
        loss = criterion(outputs, all_ones)
        loss.backward()
        optimizer.step()

    model.eval()
    # Trained weights: IPU first, then the CPU reference.
    ipu_trained = poptorch.inferenceModel(model)(sample)
    cpu_trained = model(sample)

    helpers.assert_allclose(actual=ipu_trained,
                            expected=cpu_trained,
                            **tolerance)


# An ordered tuple (rather than a set) keeps the generated test cases in a
# well-defined order.
@pytest.mark.parametrize("dims", (2, 3, 4, 5))
def test_groupNorm(dims):
    """Compare GroupNorm on the IPU and on the CPU for ``dims``-d inputs.

    Even values of ``dims`` enable the affine parameters. The 2-d case is
    skipped (see the TODO below).
    """
    if dims == 2:
        # TODO(T49073): Match torch 1.10 GroupNorm implementation
        pytest.skip("Numerical differences between PyTorch and PopTorch")

    torch.manual_seed(42)

    affine = dims % 2 == 0

    # Batch of 3 with 10 channels, followed by dims - 2 random sizes in [2, 5).
    shape = [3, 10]
    if dims > 2:
        rand_shape = torch.randint(2, 5, [dims - 2])
        shape.extend(rand_shape.tolist())

    input = torch.randn(shape)
    groupNorm = nn.GroupNorm(5, 10, affine=affine)
    model = helpers.ModelWithWeights(groupNorm, input.shape)

    # Run pytorch native on CPU.
    native_out, _ = model((input, ))

    # Run on IPU.
    poptorch_model = poptorch.trainingModel(model)
    poptorch_out, _ = poptorch_model((input, ))

    # Inference test - check outputs
    helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()


def test_groupNorm_exfail():
    """With strided channel grouping GroupNorm is expected to differ.

    Only the size and type of the IPU output must match the CPU; the values
    themselves are asserted NOT to be close.
    """
    torch.manual_seed(42)

    sample = torch.randn([3, 10])
    group_norm = nn.GroupNorm(5, 10)

    # Run pytorch native on CPU.
    cpu_out = group_norm(sample)

    options = poptorch.Options()
    options._Popart.set("groupNormStridedChannelGrouping", True)  # pylint: disable=protected-access

    # Run on IPU.
    ipu_out = poptorch.inferenceModel(group_norm, options)(sample)

    # Group norm is pending correctness changes in popart/poplar so we will
    # just test the shape/type for now.
    assert ipu_out.size() == cpu_out.size()
    assert ipu_out.type() == cpu_out.type()

    values_match = torch.allclose(ipu_out, cpu_out, atol=1e-1, rtol=0.1)
    assert not values_match


def test_groupNorm_typing():
    """GroupNorm fed a float16 input must return a float16 output."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.gn = torch.nn.GroupNorm(4, 16)

        def forward(self, x):
            return self.gn(x)

    ipu_model = poptorch.inferenceModel(Model())
    half_input = torch.randn(20, 16, 50, dtype=torch.half)

    result = ipu_model(half_input)
    assert result.dtype == torch.half


# Parametrization table for test_instanceNorm: each tuple pairs a norm class
# with the "dims" value passed alongside it (see the column header below).
instance_norm_params = [
    # norm, dims
    (nn.InstanceNorm1d, 1),
    (nn.InstanceNorm2d, 2),
    (nn.InstanceNorm3d, 3)
]


@pytest.mark.parametrize("instance_norm, d", instance_norm_params)
def test_instanceNorm(instance_norm, d):
    """Train a small instance norm + linear classifier on a single batch.

    The experiment is repeated three times with fresh models and data; each
    time the loss must drop below 0.03 and the labels must be predicted.
    """
    torch.manual_seed(42)

    # Affine parameters are enabled for odd d only.
    affine = d % 2 == 1

    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.norm = instance_norm(6, affine=affine)
            self.fc1 = nn.Linear(6 * 2**d, 10)
            self.loss = nn.CrossEntropyLoss()

        def forward(self, x, target):
            out = self.norm(x)
            out = out.flatten(1)
            out = self.fc1(out)
            loss = self.loss(out, target)

            return out, loss

    for _ in range(3):
        model = Model()
        optimizer = optim.AdamW(model.parameters(), lr=0.01)
        poptorch_model = poptorch.trainingModel(model, optimizer=optimizer)

        # Batch of 5, 6 channels, then d spatial dimensions of size 2.
        shape = [5, 6] + [2] * d

        # Offset the data by multiplying by random values and shifting by a
        # random bias. The random draws happen in this exact order.
        scale = torch.randint(2, 10, shape)
        noise = torch.randn(shape)
        bias_scale = torch.randint(2, 10, [1])
        bias_noise = torch.randn(1)
        sample = scale * noise + bias_scale * bias_noise
        label = torch.randint(0, 10, [shape[0]])

        _, original_loss = poptorch_model(sample, label)

        for _ in range(100):
            out, loss = poptorch_model(sample, label)

        # Check we have trained the model
        assert loss < original_loss
        assert loss < 0.03
        helpers.assert_allequal(actual=torch.argmax(out, dim=1),
                                expected=label)


def test_batchnorm_statistics():
    """Running batch norm statistics on the IPU match those of the CPU.

    One model is run through a PopTorch training model (learning rate 0), an
    identical one is run on the CPU in train mode; their running mean and
    variance are then compared.
    """
    torch.manual_seed(42)

    batches = [torch.randn([4, 4, 3, 3]) for _ in range(10)]
    label = torch.ones(4).long()

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bn = torch.nn.BatchNorm2d(4)
            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, args, loss_inputs=None):
            output = self.bn(args)
            if loss_inputs is None:
                return output

            reduced = torch.mean(output, dim=(2, 3))
            return output, self.loss(reduced, loss_inputs)

    # IPU side.
    ipu_side = Model()
    ipu_side.train()
    optimizer = optim.SGD(ipu_side.parameters(), lr=0.0)
    training_model = poptorch.trainingModel(ipu_side, optimizer=optimizer)
    for batch in batches:
        training_model(batch, label)

    # CPU side: same batches, no loss needed.
    cpu_side = Model()
    cpu_side.train()
    for batch in batches:
        cpu_side(batch)

    # Shouldn't be needed but buffers alone don't trigger the copy.
    training_model.copyWeightsToHost()

    # Running mean is very close
    helpers.assert_allclose(actual=cpu_side.bn.running_mean,
                            expected=ipu_side.bn.running_mean)

    # Running var is not so close.
    helpers.assert_allclose(actual=cpu_side.bn.running_var,
                            expected=ipu_side.bn.running_var)


@pytest.mark.parametrize('p',
                         (1, 2, 1.0, 2.0, float('inf'), float('-inf'), 'fro'))
def test_norm_in_loop(p):
    """torch.norm inside poptorch.for_loop matches a plain Python loop.

    Both models share the same embedding parameter and accumulate the norm of
    its first 100 rows ``iterations`` times.
    """
    embedding = torch.nn.Parameter(torch.randn((200, 100)))
    iterations = 3

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.embedding = embedding

        def forward(self):
            def loop_body(norm):
                norm += torch.norm(self.embedding[:100], p=p, dim=-1)
                return norm

            start = torch.zeros(100, device=self.embedding.device)
            (cumulative_norm, ) = poptorch.for_loop(iterations, loop_body,
                                                    [start])
            return cumulative_norm

    class RefModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.embedding = embedding

        def forward(self):
            cumulative_norm = torch.zeros(100, device=self.embedding.device)
            for _ in range(iterations):
                cumulative_norm += torch.norm(self.embedding[:100],
                                              p=p,
                                              dim=-1)
            return cumulative_norm

    # IPU result first, then the CPU reference.
    ipu_model = poptorch.inferenceModel(Model())
    ipu_out = ipu_model()
    reference = RefModel()
    native_out = reference()

    helpers.assert_allclose(actual=ipu_out, expected=native_out)


================================================
FILE: tests/ops_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import re
import torch
import helpers
import poptorch


def test_print_tensor():
    """Smoke test: a model returning poptorch.ipu_print_tensor(x) runs."""

    class Model(torch.nn.Module):
        def forward(self, x):
            return poptorch.ipu_print_tensor(x)

    printing_model = poptorch.inferenceModel(Model())
    sample = torch.randn(5)
    printing_model(sample)


def test_print_tensor_with_title():
    """Smoke test: ipu_print_tensor also accepts a title argument."""

    class Model(torch.nn.Module):
        def forward(self, x):
            return poptorch.ipu_print_tensor(x, "my_tensor")

    printing_model = poptorch.inferenceModel(Model())
    sample = torch.randn(5)
    printing_model(sample)


def test_nop():
    """Smoke test: the output of poptorch.nop can be used by a later op."""

    class Model(torch.nn.Module):
        def forward(self, x):
            passthrough = poptorch.nop(x)
            return passthrough * 2

    nop_model = poptorch.inferenceModel(Model())
    sample = torch.randn(5)
    nop_model(sample)


def test_name_scope():
    """Ops created inside a NameScope carry the scope name in the PopART IR."""

    class Model(torch.nn.Module):
        def forward(self, x, y):
            with poptorch.NameScope("NameScope"):
                return x + y

    poptorch_model = poptorch.inferenceModel(Model())

    torch.manual_seed(42)
    lhs = torch.randn(10, 10)
    rhs = torch.randn(10, 10)
    poptorch_model(lhs, rhs)

    popart_ir = poptorch_model._debugGetPopartIR()  # pylint: disable=protected-access
    # The add must appear with the scope as a prefix of its name.
    assert '"name":"NameScope/Add:InPlace"' in popart_ir


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("TRACE")
def test_available_memory_last_op(capfd):
    """set_available_memory on the model's last op survives until lowering.

    The trace log is inspected to check the node still follows the matmul in
    the graph printed before lowering to PopART.
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            product = torch.matmul(x, x)
            return poptorch.set_available_memory(product, 0.3)

    sample = torch.randn(10, 10)
    poptorch_model = poptorch.inferenceModel(Model())
    poptorch_model.compile(sample)

    # Check the trace log to make sure set_available_memory isn't pruned
    # before it's lowered to PopART
    expected_graph_regex = (r"Graph before lowering to PopART:\n"
                            r".*\n"
                            r".* popart::matmul.*\n"
                            r".* poptorch::set_available_memory.*")

    checker = helpers.LogChecker(capfd)
    checker.assert_matches(expected_graph_regex, per_line=False)


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("TRACE")
def test_available_memory_linear(capfd):
    """set_available_memory after a Linear must reference the matmul.

    The trace log is parsed to extract the variable produced by the matmul
    and the operand of the set_available_memory node; the two must be equal.
    """

    class LinModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = torch.nn.Conv2d(3, 3, 3)
            self.lin = torch.nn.Linear(3, 3)

        def forward(self, x):
            x = self.conv(x)
            x = self.lin(x)
            x = poptorch.set_available_memory(x, 0.3)
            return x

    x = torch.rand(2, 3, 5, 5)
    model = LinModel()
    poptorch_model = poptorch.inferenceModel(model)
    poptorch_model(x)

    log = helpers.LogChecker(capfd)
    it = log.createIterator()
    # Assert that the set_available_memory node references the matmul, not the
    # add.
    it.findNext("Graph before lowering to PopART:")
    matmul_line = it.findNext("popart::matmul").strip()
    # The variable name is the first space-separated token of the line.
    matmul_var = matmul_line.partition(" ")[0]
    sam_line = it.findNext("poptorch::set_available_memory").strip()
    operand_match = re.match(r".*set_available_memory[^\(]+\(([^\)]+).*",
                             sam_line)
    # Fail with the offending line instead of an AttributeError on None.
    assert operand_match is not None, (
        "Could not parse the set_available_memory operand from: " + sam_line)
    actual_var = operand_match.group(1)
    assert actual_var == matmul_var


================================================
FILE: tests/optimizers_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import copy
from io import StringIO
import json
import os
import tempfile
import unittest.mock

import pytest
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import ExponentialLR
import helpers
import poptorch


# Convenience classes for testing
class LAMBNoBias(poptorch.optim.LAMB):
    """poptorch.optim.LAMB that always passes ``bias_correction=False``."""

    def __init__(self, *args, **kwargs):
        # All other arguments are forwarded unchanged to the parent class.
        super().__init__(*args, bias_correction=False, **kwargs)


class AdamWNoBias(poptorch.optim.AdamW):
    """poptorch.optim.AdamW that always passes ``bias_correction=False``."""

    def __init__(self, *args, **kwargs):
        # All other arguments are forwarded unchanged to the parent class.
        super().__init__(*args, bias_correction=False, **kwargs)


# Optimizer classes from poptorch.optim, plus the two no-bias-correction
# convenience subclasses defined in this file.
poptorch_optimizers = [
    poptorch.optim.SGD, poptorch.optim.Adam, poptorch.optim.AdamW,
    poptorch.optim.RMSprop, poptorch.optim.LAMB, LAMBNoBias, AdamWNoBias
]

# Optimizer classes taken directly from torch.optim.
supported_torch_optimizers = [
    optim.SGD, optim.Adam, optim.AdamW, optim.RMSprop
]

# Concatenation of the two lists above (PopTorch classes first).
all_optimizers = poptorch_optimizers + supported_torch_optimizers


def assert_is_ipu_optimizer_state(state, should_be_empty=False):
    """Assert that ``state`` has the layout of an IPU optimizer state dict.

    ``state`` must be a dict with the keys "ipu_state" and "ipu_param".
    With ``should_be_empty`` both values must be None; otherwise both must be
    dicts mapping str to torch.Tensor and "ipu_param" must not be empty.
    """
    assert isinstance(state, dict)
    for key in ("ipu_state", "ipu_param"):
        assert key in state

    if should_be_empty:
        for key in ("ipu_state", "ipu_param"):
            assert state[key] is None
        return

    def maps_names_to_tensors(candidate):
        if not isinstance(candidate, dict):
            return False
        return all(
            isinstance(name, str) and isinstance(tensor, torch.Tensor)
            for name, tensor in candidate.items())

    assert maps_names_to_tensors(state["ipu_state"]), state
    assert maps_names_to_tensors(state["ipu_param"]), state
    # Not all optimizers have a state, but they all have parameters.
    assert len(state["ipu_param"]) > 0, "All optimizers have parameters"


class OptimizerTestModel:
    """Small linear classifier used to exercise optimizers in these tests.

    The wrapped torch model is ``num_groups`` Linear(10, 10) layers (a single
    layer, or a Sequential when ``num_groups`` is not 1) followed by a cross
    entropy loss. A random input / label pair is generated at construction.
    The PopTorch training model is only created on the first call to
    setOptimizer().
    """

    def __init__(self, options=None, num_groups=1):
        linear_layers = [torch.nn.Linear(10, 10) for _ in range(num_groups)]
        base_model = (linear_layers[0] if num_groups == 1 else
                      torch.nn.Sequential(*linear_layers))

        # Drawn after the layers were initialised: input first, then label.
        self.input = torch.randn(1, 10)
        self.label = torch.randint(0, 10, [1])
        self.options = options

        class Model(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.base_model = base_model
                self.loss = torch.nn.CrossEntropyLoss()

            def forward(self, data, target):
                out = self.base_model(data)
                return out, self.loss(out, target)

        self.model = Model()
        self.poptorch_model = None

    def parameters(self):
        """Return the parameters of the wrapped torch model."""
        return self.model.parameters()

    def setOptimizer(self, optimizer):
        """Create the training model with ``optimizer``, or update it."""
        if self.poptorch_model is not None:
            self.poptorch_model.setOptimizer(optimizer)
            return

        self.poptorch_model = poptorch.trainingModel(self.model,
                                                     self.options,
                                                     optimizer=optimizer)

    def run(self):
        """Run one step on the stored input / label and return the result.

        Raises RuntimeError if setOptimizer() has not been called yet.
        """
        if self.poptorch_model is None:
            raise RuntimeError("Call setOptimizer first.")

        return self.poptorch_model(self.input, self.label)


@pytest.mark.parametrize("opt", helpers.onlyFirstIfReduced(all_optimizers))
def test_optimizer(opt):
    """The model only learns once the learning rate is made non-zero."""
    torch.manual_seed(42)

    model = OptimizerTestModel()

    # poptorch.optim.SGD is created with separate accumulators.
    extra_args = ({
        "use_combined_accum": False
    } if opt == poptorch.optim.SGD else {})

    # "Train" with learning rate of zero and check the loss remains the same.
    optimizer = opt(model.parameters(), lr=0.00, **extra_args)
    model.setOptimizer(optimizer)

    # Reference loss from the very first run.
    _, original_loss = model.run()

    # Loss shouldn't change.
    for _ in range(50):
        out, loss = model.run()
        assert loss == original_loss

    # We shouldn't get the right result.
    assert not (torch.argmax(out, dim=1) == model.label)

    # Update the optimizer and check the loss now begins to decrease.
    optimizer.param_groups[0]['lr'] = 0.01
    model.setOptimizer(optimizer)
    for _ in range(1000):
        out, loss = model.run()

    # Check we have trained the "model"
    assert loss < original_loss
    assert loss < 0.03
    assert torch.argmax(out, dim=1) == model.label


# A tuple (not a set) is used so that the parameter order, and therefore the
# mapping between generated test ids and optimizers, is the same in every
# process: two entries are called "SGD" and two "AdamW".
@pytest.mark.parametrize(
    "opt", (optim.SGD, optim.AdamW, poptorch.optim.SGD, poptorch.optim.AdamW))
def test_sgd_IR(opt):
    """Check which optimizer update ops appear in the PopART IR.

    SGD variants are expected to produce two SGD0VarUpdate ops and no Adam
    ops; AdamW variants two AdamVarUpdate and two AdamUpdater ops and no SGD0
    op.
    """
    torch.manual_seed(42)
    model = OptimizerTestModel()

    # A single training step with a non-zero learning rate is enough to get
    # the IR. poptorch.optim.SGD is created with separate accumulators.
    if opt == poptorch.optim.SGD:
        optimizer = opt(model.parameters(), lr=0.01, use_combined_accum=False)
    else:
        optimizer = opt(model.parameters(), lr=0.01)

    model.setOptimizer(optimizer)
    model.run()

    as_json = json.loads(model.poptorch_model._debugGetPopartIR())  # pylint: disable=protected-access

    # Number of ops of each type of interest found in the IR.
    op_counts = {"AdamUpdater": 0, "AdamVarUpdate": 0, "SGD0VarUpdate": 0}
    for name in as_json:
        # The IR is expected to contain only the main graph.
        assert name == "maingraph"
        for op in as_json[name]:
            if op['type'] in op_counts:
                op_counts[op['type']] += 1

    AdamUpdater = op_counts["AdamUpdater"]
    AdamVarUpdate = op_counts["AdamVarUpdate"]
    SGD0VarUpdate = op_counts["SGD0VarUpdate"]

    if opt in (optim.SGD, poptorch.optim.SGD):
        assert SGD0VarUpdate == 2
        assert AdamVarUpdate == 0 and AdamUpdater == 0
    else:
        assert SGD0VarUpdate == 0
        assert AdamVarUpdate == 2 and AdamUpdater == 2


@helpers.printCapfdOnExit
@pytest.mark.parametrize("opt",
                         helpers.onlyFirstIfReduced(
                             (poptorch.optim.Adam, poptorch.optim.AdamW,
                              AdamWNoBias, poptorch.optim.LAMB, LAMBNoBias)))
@pytest.mark.parametrize("accum_type", (torch.float16, torch.float))
@pytest.mark.parametrize("first_order_type", (torch.float16, torch.float))
@pytest.mark.parametrize("second_order_type", (torch.float16, torch.float))
@helpers.overridePoptorchLogLevel("DEBUG")
def test_adam_accum_type(capfd, opt, accum_type, first_order_type,
                         second_order_type):
    """The accumulator types given to the optimizer show up in the log."""

    def dtype_name(dt):
        # "torch.float16" -> "float16"
        qualified = str(dt)
        assert qualified in ["torch.float32", "torch.float16"]
        _, _, short_name = qualified.partition(".")
        return short_name

    torch.manual_seed(42)
    model = OptimizerTestModel()

    # One training step with the requested accumulator types.
    optimizer = opt(model.parameters(),
                    lr=0.01,
                    accum_type=accum_type,
                    first_order_momentum_accum_type=first_order_type,
                    second_order_momentum_accum_type=second_order_type)
    model.setOptimizer(optimizer)
    model.run()

    expected_fragments = (
        "graph optimizer",
        "accumType=" + dtype_name(accum_type),
        "firstOrderMomentumAccumType=" + dtype_name(first_order_type),
        "secondOrderMomentumAccumType=" + dtype_name(second_order_type),
    )
    checker = helpers.LogChecker(capfd)
    checker.assert_matches(*expected_fragments)


@helpers.printCapfdOnExit
@pytest.mark.parametrize("accum_type", (torch.float16, torch.float))
@pytest.mark.parametrize("velocity_accum_type", (torch.float16, torch.float))
@helpers.overridePoptorchLogLevel("DEBUG")
def test_sgd_accum_type(capfd, accum_type, velocity_accum_type):
    """The accumulator types given to poptorch SGD show up in the log."""

    def dtype_name(dt):
        # "torch.float16" -> "float16"
        qualified = str(dt)
        assert qualified in ["torch.float32", "torch.float16"]
        _, _, short_name = qualified.partition(".")
        return short_name

    torch.manual_seed(42)
    model = OptimizerTestModel()

    # One training step with separate accumulators of the requested types.
    optimizer = poptorch.optim.SGD(model.parameters(),
                                   lr=0.01,
                                   use_combined_accum=False,
                                   accum_type=accum_type,
                                   velocity_accum_type=velocity_accum_type)
    model.setOptimizer(optimizer)
    model.run()

    # The velocity accumulator type is logged as the first order momentum
    # accumulator type.
    expected_fragments = (
        "graph optimizer",
        "accumType=" + dtype_name(accum_type),
        "firstOrderMomentumAccumType=" + dtype_name(velocity_accum_type),
    )
    checker = helpers.LogChecker(capfd)
    checker.assert_matches(*expected_fragments)


@pytest.mark.parametrize("use_combined_accum", (True, False))
def test_velocity_scaling_copy(use_combined_accum):
    """A copy.copy() of a poptorch SGD optimizer can be set on the model."""
    torch.manual_seed(42)

    model = OptimizerTestModel()

    # Velocity scaling is only passed with the combined accumulator.
    if use_combined_accum:
        velocity_scaling = 128.1
    else:
        velocity_scaling = None

    optimizer = poptorch.optim.SGD(model.parameters(),
                                   lr=0.05,
                                   loss_scaling=0.05,
                                   velocity_scaling=velocity_scaling,
                                   use_combined_accum=use_combined_accum)

    model.setOptimizer(optimizer)
    model.run()

    # Check copy.copy preserves optimizer PopTorch attributes
    shallow_copy = copy.copy(optimizer)
    model.setOptimizer(shallow_copy)
    model.run()


# A tuple (not a set) is used so that the parameter order is the same in
# every process: both entries are called "SGD", so the generated test ids
# only differ by their position.
# Not currently exercised: optim.Adam, optim.AdamW, optim.RMSprop,
# poptorch.optim.Adam, poptorch.optim.AdamW, AdamWNoBias,
# poptorch.optim.RMSprop, poptorch.optim.LAMB, LAMBNoBias
@pytest.mark.parametrize("opt", (optim.SGD, poptorch.optim.SGD))
def test_optimizer_groups(opt):
    """Per-group learning rates control which layer is trained.

    Three phases are run on a two-layer model, one parameter group per
    layer: both learning rates zero (nothing changes), only the first group
    non-zero (only the first layer changes), then both non-zero (the second
    layer changes as well and the label ends up being predicted).
    """
    torch.manual_seed(42)

    model = OptimizerTestModel(num_groups=2)
    first_layer = model.model.base_model[0]
    second_layer = model.model.base_model[1]

    # Parameter is a soft copy by default oddly.
    weight1 = first_layer.weight.clone()
    bias1 = first_layer.bias.clone()
    weight2 = second_layer.weight.clone()
    bias2 = second_layer.bias.clone()

    def get_optims(run_time):
        """Build the optimizer for phase ``run_time`` (0, 1 or 2)."""
        first_group_lr = 0.0 if run_time == 0 else 0.1
        second_group_lr = 0.1 if run_time == 2 else 0.0

        groups = [{
            'params': first_layer.parameters(),
            "lr": first_group_lr
        }, {
            'params': second_layer.parameters(),
            "lr": second_group_lr
        }]

        kwargs = {"lr": 0.1}
        # poptorch.optim.SGD is created with separate accumulators.
        if opt == poptorch.optim.SGD:
            kwargs["use_combined_accum"] = False
        return opt(groups, **kwargs)

    # Start the optimizer as zero for both groups.
    model.setOptimizer(get_optims(run_time=0))
    _, original_loss = model.run()
    for _ in range(10):
        _, loss = model.run()

    weight1_post, bias1_post = first_layer.parameters()
    weight2_post, bias2_post = second_layer.parameters()

    # Nothing should have changed.
    helpers.assert_allequal(expected=weight1, actual=weight1_post)
    helpers.assert_allequal(expected=weight2, actual=weight2_post)
    helpers.assert_allequal(expected=bias1, actual=bias1_post)
    helpers.assert_allequal(expected=bias2, actual=bias2_post)

    # Check we have not trained the model
    assert loss == original_loss

    # Now update the optimizer to train just the first group.
    model.setOptimizer(get_optims(run_time=1))
    _, original_loss = model.run()

    for _ in range(10):
        _, loss = model.run()

    weight1_post, bias1_post = first_layer.parameters()
    weight2_post, bias2_post = second_layer.parameters()

    assert loss != original_loss

    assert not torch.equal(weight1, weight1_post)
    helpers.assert_allequal(expected=weight2, actual=weight2_post)
    assert not torch.equal(bias1, bias1_post)
    helpers.assert_allequal(expected=bias2, actual=bias2_post)

    # Now update the optimizer to train both groups.
    model.setOptimizer(get_optims(run_time=2))
    model.run()

    # Actually try and train here.
    for _ in range(2000):
        out, _ = model.run()

    weight2_post, bias2_post = second_layer.parameters()

    assert not torch.equal(weight2, weight2_post)
    assert not torch.equal(bias2, bias2_post)

    # Check we've trained the model.
    assert torch.argmax(out) == model.label


def test_optimizer_groups_none_args():
    """AdamW with a zero learning rate in both groups leaves weights as is."""
    torch.manual_seed(42)

    model = OptimizerTestModel(num_groups=2)
    first_layer = model.model.base_model[0]
    second_layer = model.model.base_model[1]

    # Parameter is a soft copy by default oddly.
    weight1 = first_layer.weight.clone()
    bias1 = first_layer.bias.clone()
    weight2 = second_layer.weight.clone()
    bias2 = second_layer.bias.clone()

    # Start the optimizer as zero for both groups.
    zero_lr_groups = [{
        'params': first_layer.parameters(),
        "lr": 0.0
    }, {
        'params': second_layer.parameters(),
        "lr": 0.0
    }]
    model.setOptimizer(optim.AdamW(zero_lr_groups, lr=0.1))

    for _ in range(10):
        model.run()

    weight1_post, bias1_post = first_layer.parameters()
    weight2_post, bias2_post = second_layer.parameters()

    # Nothing should have changed.
    helpers.assert_allequal(expected=weight1, actual=weight1_post)
    helpers.assert_allequal(expected=weight2, actual=weight2_post)
    helpers.assert_allequal(expected=bias1, actual=bias1_post)
    helpers.assert_allequal(expected=bias2, actual=bias2_post)


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_optimizer_SGD_separate_velocity_scale_matched(capfd):
    """With separate accumulators the logged velocity scaling is 2 when the
    loss scaling is 2."""
    model = OptimizerTestModel()

    sgd = poptorch.optim.SGD(model.parameters(),
                             loss_scaling=2.0,
                             lr=1.0,
                             use_combined_accum=False)
    model.setOptimizer(sgd)
    model.run()

    checker = helpers.LogChecker(capfd)
    checker.assert_contains("lossScaling=2", "defaultVelocityScaling=2")


def test_optimizer_SGD_nesterov():
    """Smoke test: one training step with torch SGD and Nesterov momentum."""
    torch.manual_seed(42)
    model = OptimizerTestModel()

    nesterov_sgd = optim.SGD(model.parameters(),
                             nesterov=True,
                             momentum=0.1,
                             lr=0.001)
    model.setOptimizer(nesterov_sgd)
    model.run()


@pytest.mark.parametrize("opt",
                         helpers.onlyFirstIfReduced(poptorch_optimizers))
def test_optimizer_const(opt):
    """Changing loss_scaling after a first run and running again works."""
    torch.manual_seed(42)

    model = OptimizerTestModel()

    # poptorch.optim.SGD is created with separate accumulators.
    extra_args = ({
        "use_combined_accum": False
    } if opt == poptorch.optim.SGD else {})

    # Initialise the optimiser with the default loss_scaling value
    optimizer = opt(model.parameters(),
                    loss_scaling=1.0,
                    lr=1.0,
                    **extra_args)

    model.setOptimizer(optimizer)
    model.run()

    # Update the attribute, hand the optimizer over again and run once more.
    optimizer.loss_scaling = 2.0
    model.setOptimizer(optimizer)
    model.run()


@pytest.mark.parametrize("opt",
                         helpers.onlyFirstIfReduced(poptorch_optimizers))
def test_optimizer_mark_as_variable(opt):
    """loss_scaling marked as variable can be changed between two runs."""
    torch.manual_seed(42)

    model = OptimizerTestModel()

    # poptorch.optim.SGD is created with separate accumulators.
    extra_args = ({
        "use_combined_accum": False
    } if opt == poptorch.optim.SGD else {})

    # Initialise the optimiser with the default loss_scaling value
    optimizer = opt(model.parameters(), lr=1.0, **extra_args)

    # Explicitly mark the attribute as variable before the first run.
    optimizer.variable_attrs.markAsVariable("loss_scaling")
    model.setOptimizer(optimizer)
    model.run()

    optimizer.loss_scaling = 2.0
    model.setOptimizer(optimizer)
    model.run()


@pytest.mark.parametrize("opt",
                         helpers.onlyFirstIfReduced(
                             [poptorch.optim.LAMB, LAMBNoBias]))
def test_lamb_max_weight_norm(opt):
    """Train with ``max_weight_norm=100.0`` and again with ``None``.

    Each configuration uses a fresh ``OptimizerTestModel`` and 1000 training
    runs. Both must end with a loss below the very first measured loss and
    below 0.03, and with the arg-max of the output equal to the model label.
    """
    torch.manual_seed(42)
    num_steps = 1000

    model = OptimizerTestModel()
    clipped = opt(model.parameters(), lr=0.01, max_weight_norm=100.0)
    model.setOptimizer(clipped)

    # Reference loss, taken from the first run only.
    _, original_loss = model.run()

    for _ in range(num_steps):
        out, loss = model.run()

    # Check we have trained the "model"
    assert loss < original_loss
    assert loss < 0.03
    assert torch.argmax(out, dim=1) == model.label

    # Run from scratch with max_weight_norm disabled.
    model = OptimizerTestModel()
    unclipped = opt(model.parameters(), lr=0.01, max_weight_norm=None)

    # Train model again
    model.setOptimizer(unclipped)
    for _ in range(num_steps):
        out, loss = model.run()

    # Model should have trained like normal
    assert loss < original_loss
    assert loss < 0.03
    assert torch.argmax(out, dim=1) == model.label


@helpers.printCapfdOnExit
@pytest.mark.parametrize("use_combined_accum", (True, False))
@helpers.overridePoptorchLogLevel("DEBUG")
def test_variable_groups(capfd, use_combined_accum):
    """Check the values logged for a two-group SGD optimizer.

    Three stages, each followed by a run and a check of the captured log:

    1. A fresh optimizer: the graph and group entries report the values
       given to the constructor.
    2. The same optimizer after changing ``loss_scaling``, setting a
       per-group ``velocity_scaling`` and adding a ``loss_scaling`` key to a
       group.
    3. A new optimizer with an ``lr`` attribute set on the optimizer object,
       a modified ``defaults["lr"]`` and per-group ``lr`` values.
    """
    model = OptimizerTestModel(num_groups=2)

    def makeSGD(loss_scaling, velocity_scaling):
        # One parameter group for each of base_model[0] and base_model[1].
        groups = [{
            "params": model.model.base_model[idx].parameters()
        } for idx in (0, 1)]
        # velocity_scaling is only passed on for the combined variant.
        return poptorch.optim.SGD(
            groups,
            lr=0.01,
            loss_scaling=loss_scaling,
            velocity_scaling=velocity_scaling if use_combined_accum else None,
            use_combined_accum=use_combined_accum)

    # Make sure all groups have the default values, and the values are not (const)
    o = makeSGD(loss_scaling=2.0, velocity_scaling=2.0)
    model.setOptimizer(o)
    model.run()
    testlog = helpers.LogChecker(capfd)
    testlog.assert_contains("graph optimizer with SGD",
                            "defaultLearningRate=0.01,",
                            "defaultVelocityScaling=2,", "lossScaling=2")
    for group in (0, 1):
        testlog.assert_contains("group %d optimizer with SGD" % group,
                                "learningRate=0.01,", "velocityScaling=2,")

    # Make sure the loss_scaling can be changed, and individual velocityScaling can be set.
    o.loss_scaling = 4.0
    # Only meaningful for the combined variant
    o.param_groups[1]["velocity_scaling"] = 4.0
    # Doesn't exist: loss scaling is not a group attribute
    o.param_groups[0]["loss_scaling"] = 4.0
    model.setOptimizer(o)
    model.run()
    testlog = helpers.LogChecker(capfd)
    testlog.assert_contains("Ignoring unexpected group 0 attribute",
                            "'loss_scaling'")
    if not use_combined_accum:
        testlog.assert_contains("Ignoring unexpected group 1 attribute",
                                "'velocity_scaling'")
        testlog.assert_contains("group 0 optimizer with SGD",
                                "learningRate=0.01,", "velocityScaling=4,")
    else:
        testlog.assert_contains("graph optimizer with SGD",
                                "defaultLearningRate=0.01,",
                                "defaultVelocityScaling=2,", "lossScaling=4")
        testlog.assert_contains("group 0 optimizer with SGD",
                                "learningRate=0.01,", "velocityScaling=2,")

    testlog.assert_contains("group 1 optimizer with SGD", "learningRate=0.01,",
                            "velocityScaling=4,")

    # Make sure the groups default to the new optimizer's default
    # velocityScaling, manually set lr for both groups
    o = makeSGD(loss_scaling=1.0, velocity_scaling=3.0)
    o.lr = 0.5  # doesn't exist
    o.defaults["lr"] = 0.7
    o.param_groups[0]["lr"] = 0.0
    o.param_groups[1]["lr"] = 1.0
    model.setOptimizer(o)
    model.run()
    testlog = helpers.LogChecker(capfd)
    testlog.assert_contains("Ignoring unexpected optimizer attribute", "'lr'")

    # The two variants only differ in the velocity scaling they report.
    velocity = 3 if use_combined_accum else 1
    testlog.assert_contains("graph optimizer with SGD",
                            "defaultLearningRate=0.7,",
                            "defaultVelocityScaling=%d," % velocity,
                            "lossScaling=1")
    testlog.assert_contains("group 0 optimizer with SGD", "learningRate=0,",
                            "velocityScaling=%d," % velocity)
    testlog.assert_contains("group 1 optimizer with SGD", "learningRate=1,",
                            "velocityScaling=%d," % velocity)


@helpers.printCapfdOnExit
@pytest.mark.parametrize(
    "opt",
    helpers.onlyFirstIfReduced((
        (poptorch.optim.SGD, (("momentum", 0.0), ("dampening", 0.0),
                              ("weight_decay", 0.0))),
        (poptorch.optim.Adam, (("betas", (0.9, 0.999)), ("eps", 1e-08),
                               ("weight_decay", 0.0), ("amsgrad", False))),
        (poptorch.optim.AdamW, (("betas", (0.9, 0.999)), ("eps", 1e-08),
                                ("weight_decay", 0.01), ("amsgrad", False))),
        (poptorch.optim.RMSprop, (("momentum", 0.0), ("alpha", 0.99),
                                  ("eps", 1e-08), ("weight_decay", 0.0))),
    )))
@helpers.overridePoptorchLogLevel("DEBUG")
# pylint: disable=too-many-statements
def test_variable_default(opt, capfd):
    """Check which optimizer attributes are logged as ``(const)``.

    ``opt`` is a ``(poptorch optimizer class, attribute/value pairs)`` tuple.
    The test goes through the following steps, checking the captured log
    after each run:

    * upstream optimizer with the attributes passed explicitly: attributes
      are constant, the learning rate is not;
    * default upstream optimizer, default poptorch optimizer, and poptorch
      optimizer with every attribute manually marked as constant: no
      optimizer update is logged;
    * poptorch optimizer with attributes passed explicitly (original and
      modified values): attributes are not constant;
    * default poptorch optimizer with every attribute manually marked as
      variable: attributes are not constant.
    """
    def toCamelCase(string):
        """Convert a snake case string (Pytorch) to camel case (Popart)"""
        words = string.split("_")
        return words[0] + "".join(w.capitalize() for w in words[1:])

    def toPopartName(name, default):
        """Return the list of Popart attribute names for a Pytorch one."""
        if name == "lr":
            name = "learning_rate"
        # amsgrad doesn't get passed to the backend
        if name in ["amsgrad"]:
            return []
        if name == "betas":
            return toPopartName("beta1", default) + toPopartName(
                "beta2", default)
        if default:
            name = "default_" + name
        return [toCamelCase(name)]

    def createExpr(attr, is_const=True):
        """Regexp matching ``attr=value`` followed (or not) by `` (const)``."""
        const_expr = r" \(const\)"
        if not is_const:
            # Negative lookahead: the value must not be followed by " (const)"
            const_expr = "(?!" + const_expr + ")"

        return r"%s=[^ ,]+%s" % (attr, const_expr)

    def genRegexp(attrs, default=False, is_const=False):
        """Build the regexps for one attribute name or a list of them."""
        if isinstance(attrs, str):
            attrs = [attrs]
        return [
            createExpr(n, is_const) for a in attrs
            for n in toPopartName(a, default)
        ]

    # All the attribute values in "opt" are the default pytorch values which
    # means if the user instantiate a pytorch optimizer with them, we'll
    # consider all these attributes as constant.
    # However if a poptorch optimizer is used then they will all be considered
    # as variable because they were explicitly passed to the constructor.
    poptorch_opt, opt_args_tuple = opt
    opt_args = dict(opt_args_tuple)
    pytorch_opt = poptorch_opt.__bases__[1]  # Retrieve the upstream type

    def createPoptorchOptimizer(parameters, **kwargs):
        """Build ``poptorch_opt`` with ``lr=1.0``; SGD additionally gets
        ``use_combined_accum=False``."""
        if poptorch_opt == poptorch.optim.SGD:
            kwargs["use_combined_accum"] = False
        return poptorch_opt(parameters, lr=1.0, **kwargs)

    # Learning rate is a special case: it's always variable so handle it separately.
    attrs = list(opt_args)

    # Test the torch Optimizer: check all the attributes are set to constant by default
    model = OptimizerTestModel()
    optimizer = pytorch_opt(model.parameters(), lr=1.0, **opt_args)
    model.setOptimizer(optimizer)
    model.run()
    testlog = helpers.LogChecker(capfd)
    testlog.assert_matches("graph optimizer",
                           *genRegexp(attrs, default=True, is_const=True),
                           *genRegexp("lr", default=True, is_const=False))
    testlog.assert_matches("group 0 optimizer", *genRegexp(attrs,
                                                           is_const=True),
                           *genRegexp("lr", is_const=False))

    # Create a default pytorch optimizer (It should be identical to the previous one)
    optimizer = pytorch_opt(model.parameters(), lr=1.0)
    model.setOptimizer(optimizer)
    model.run()
    testlog = helpers.LogChecker(capfd)
    # As the optimizer is identical it shouldn't trigger any update in the backend
    testlog.assert_no_matches("graph optimizer")
    testlog.assert_no_matches("group 0 optimizer")

    # Create a default poptorch optimizer (As we don't explicitly specify any attribute they will all be considered as constant)
    optimizer = createPoptorchOptimizer(model.parameters())
    model.setOptimizer(optimizer)
    model.run()

    testlog = helpers.LogChecker(capfd)
    # As the optimizer is identical it shouldn't trigger any update in the backend
    testlog.assert_no_matches("graph optimizer")
    testlog.assert_no_matches("group 0 optimizer")

    # Create a poptorch optimizer and set all the attributes manually: they should all be marked as variable
    # So let's now manually mark them as constant (This should result in the same optimizer as the default one)
    optimizer = createPoptorchOptimizer(model.parameters(), **opt_args)

    for attr in opt_args:
        assert not optimizer.variable_attrs.isConstant(attr)
        optimizer.variable_attrs.markAsConstant(attr)

    model.setOptimizer(optimizer)
    model.run()
    # Capture the output of *this* run: re-using the previous LogChecker
    # would only re-check the log of the earlier run.
    testlog = helpers.LogChecker(capfd)
    # As the optimizer is identical it shouldn't trigger any update in the backend
    testlog.assert_no_matches("graph optimizer")
    testlog.assert_no_matches("group 0 optimizer")

    # Test the poptorch Optimizer: check all the manually set attributes are set to variable by default
    # Create a new model as the optimizers would otherwise mismatch
    model = OptimizerTestModel()

    optimizer = createPoptorchOptimizer(model.parameters(), **opt_args)
    model.setOptimizer(optimizer)
    model.run()
    testlog = helpers.LogChecker(capfd)
    testlog.assert_matches("graph optimizer",
                           *genRegexp(attrs, default=True, is_const=False),
                           *genRegexp("lr", default=True, is_const=False))
    testlog.assert_matches("group 0 optimizer",
                           *genRegexp(attrs, is_const=False),
                           *genRegexp("lr", is_const=False))

    # Check the values can actually change
    new_opts = {}
    for k, v in opt_args.items():
        if isinstance(v, float):
            new_opts[k] = v + 0.5
        elif isinstance(v, tuple):
            new_opts[k] = tuple(elt / 2.0 for elt in v)
        else:
            # Non-numeric values (e.g. booleans) are passed through unchanged
            new_opts[k] = v

    optimizer = createPoptorchOptimizer(model.parameters(), **new_opts)
    model.setOptimizer(optimizer)
    model.run()
    testlog = helpers.LogChecker(capfd)
    testlog.assert_matches("graph optimizer",
                           *genRegexp(attrs, default=True, is_const=False),
                           *genRegexp("lr", default=True, is_const=False))
    testlog.assert_matches("group 0 optimizer",
                           *genRegexp(attrs, is_const=False),
                           *genRegexp("lr", is_const=False))

    # Check we can manually mark attributes as variable
    optimizer = createPoptorchOptimizer(model.parameters())

    for attr in opt_args:
        assert optimizer.variable_attrs.isConstant(attr)
        optimizer.variable_attrs.markAsVariable(attr)
    model.setOptimizer(optimizer)
    model.run()
    testlog = helpers.LogChecker(capfd)
    testlog.assert_matches("graph optimizer",
                           *genRegexp(attrs, default=True, is_const=False),
                           *genRegexp("lr", default=True, is_const=False))
    testlog.assert_matches("group 0 optimizer",
                           *genRegexp(attrs, is_const=False),
                           *genRegexp("lr", is_const=False))


@pytest.mark.parametrize("reduction",
                         helpers.onlyFirstIfReduced(
                             (poptorch.ReductionType.Sum,
                              poptorch.ReductionType.Mean)))
def test_gradient_accum(reduction):
    """Train a small model with gradient accumulation set to 20.

    The accumulation/replication reduction type is taken from ``reduction``.
    After 501 training calls the last returned loss must be below 0.03.
    """
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        """Three ``Linear(10, 10)`` modules followed by a cross-entropy loss."""

        def __init__(self):
            super().__init__()
            self.model = torch.nn.Sequential(
                *(torch.nn.Linear(10, 10) for _ in range(3)))
            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, x, target):
            prediction = self.model(x)
            return prediction, self.loss(prediction, target)

    accum = 20

    opts = poptorch.Options()
    opts.Training.gradientAccumulation(accum)
    opts.Training.accumulationAndReplicationReductionType(reduction)

    poptorch_model = poptorch.trainingModel(Model(), options=opts)

    # One random sample and one random label, each expanded `accum` times.
    ins = torch.randn([1, 10]).expand(accum, 10)
    target = torch.randint(0, 10, size=[1]).expand(accum)

    # 501 training calls in total; only the last loss is checked.
    for _ in range(501):
        _, loss = poptorch_model(ins, target)

    # Check we have trained the "model"
    assert loss < 0.03


@pytest.mark.parametrize("reduction",
                         helpers.onlyFirstIfReduced(
                             (poptorch.ReductionType.Sum,
                              poptorch.ReductionType.Mean)))
def test_gradient_accum_new_api(reduction):
    """Train a small model with gradient accumulation set to 20.

    The reduction type is configured through
    ``Training.accumulationAndReplicationReductionType``. After 501 training
    calls the last returned loss must be below 0.03.
    """
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        """Three ``Linear(10, 10)`` modules followed by a cross-entropy loss."""

        def __init__(self):
            super().__init__()
            stack = [torch.nn.Linear(10, 10) for _ in range(3)]
            self.model = torch.nn.Sequential(*stack)
            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, x, target):
            logits = self.model(x)
            loss = self.loss(logits, target)
            return logits, loss

    accum = 20

    opts = poptorch.Options()
    opts.Training.gradientAccumulation(accum)
    opts.Training.accumulationAndReplicationReductionType(reduction)

    poptorch_model = poptorch.trainingModel(Model(), options=opts)

    # One random sample and one random label, each expanded `accum` times.
    ins = torch.randn([1, 10]).expand(accum, 10)
    target = torch.randint(0, 10, size=[1]).expand(accum)

    # 1 initial call + 500 further calls; only the last loss is checked.
    for _ in range(1 + 500):
        _, loss = poptorch_model(ins, target)

    # Check we have trained the "model"
    assert loss < 0.03


@helpers.printCapfdOnExit
@pytest.mark.parametrize("use_combined_accum", (True, False))
# We only want warnings for this test
@helpers.overridePoptorchLogLevel("WARN")
def test_extra_attributes(capfd, use_combined_accum):
    """Extra attributes added after the first run are reported only once.

    A ``step`` attribute on the optimizer and an ``initial_lr`` key in both
    groups are added after a first run: the next run must log both
    "unexpected" messages, and a further optimizer update must not log them
    again.
    """
    model = OptimizerTestModel(num_groups=2)

    # Make sure all groups have the default values, and the values are not (const)
    params = [{
        "params": model.model.base_model[idx].parameters()
    } for idx in (0, 1)]
    velocity_scaling = 2.0 if use_combined_accum else None
    o = poptorch.optim.SGD(params,
                           lr=0.01,
                           loss_scaling=2.0,
                           velocity_scaling=velocity_scaling,
                           use_combined_accum=use_combined_accum)
    model.setOptimizer(o)
    model.run()

    # Add one optimizer level attribute and one attribute per group.
    o.step = 0
    for idx in (0, 1):
        o.param_groups[idx]["initial_lr"] = 0.1
    model.setOptimizer(o)
    model.run()
    testlog = helpers.LogChecker(capfd)
    testlog.assert_matches("unexpected optimizer attribute")
    testlog.assert_matches(r"unexpected group \d attribute")

    # loss_scaling = 3.0: Make sure optimizer is different to trigger update
    o.loss_scaling = 3.0
    model.setOptimizer(o)
    model.run()

    # Ensure warnings are printed only once
    testlog = helpers.LogChecker(capfd)
    testlog.assert_no_matches("unexpected optimizer attribute")
    testlog.assert_no_matches(r"unexpected group \d attribute")


@helpers.printCapfdOnExit
@pytest.mark.parametrize("use_combined_accum", (True, False))
# We only want warnings for this test
@helpers.overridePoptorchLogLevel("WARN")
def test_extra_attributes2(capfd, use_combined_accum):
    """No warning for extra attributes once the checks are relaxed.

    The model is built with options on which
    ``relaxOptimizerAttributesChecks()`` was called; extra optimizer and
    group attributes added after the first run must not produce any
    "unexpected" message.
    """
    opts = poptorch.Options()
    opts.relaxOptimizerAttributesChecks()
    model = OptimizerTestModel(num_groups=2, options=opts)

    # Make sure all groups have the default values, and the values are not (const)
    params = [{
        "params": model.model.base_model[idx].parameters()
    } for idx in (0, 1)]
    velocity_scaling = 2.0 if use_combined_accum else None
    o = poptorch.optim.SGD(params,
                           lr=0.01,
                           loss_scaling=2.0,
                           velocity_scaling=velocity_scaling,
                           use_combined_accum=use_combined_accum)
    model.setOptimizer(o)
    model.run()

    # Add one optimizer level attribute and one attribute per group.
    o.step = 0
    for idx in (0, 1):
        o.param_groups[idx]["initial_lr"] = 0.1
    model.setOptimizer(o)
    model.run()

    testlog = helpers.LogChecker(capfd)
    testlog.assert_no_matches("unexpected optimizer attribute")
    testlog.assert_no_matches(r"unexpected group \d attribute")


@helpers.printCapfdOnExit
@pytest.mark.parametrize("use_combined_accum", (True, False))
# We only want warnings for this test
@helpers.overridePoptorchLogLevel("WARN")
def test_extra_attributes3(capfd, use_combined_accum):
    """Only attributes added after the first run trigger a warning.

    ``step`` and the per-group ``initial_lr`` are added before the first run
    and must stay silent; an ``initial_lr`` attribute set on the optimizer
    object afterwards must produce the optimizer level message only.
    """
    model = OptimizerTestModel(num_groups=2)

    # Make sure all groups have the default values, and the values are not (const)
    params = [{
        "params": model.model.base_model[idx].parameters()
    } for idx in (0, 1)]
    velocity_scaling = 2.0 if use_combined_accum else None
    o = poptorch.optim.SGD(params,
                           lr=0.01,
                           loss_scaling=2.0,
                           velocity_scaling=velocity_scaling,
                           use_combined_accum=use_combined_accum)

    # Extra attributes set *before* the optimizer is first used.
    o.step = 0
    for idx in (0, 1):
        o.param_groups[idx]["initial_lr"] = 0.1
    model.setOptimizer(o)
    model.run()

    # If extra attributes are added before the first run
    # they shouldn't trigger any warning
    testlog = helpers.LogChecker(capfd)
    testlog.assert_no_matches("unexpected optimizer attribute")
    testlog.assert_no_matches(r"unexpected group \d attribute")

    # loss_scaling = 4.0: Make sure optimizer is different to trigger update
    o.loss_scaling = 4.0
    # initial_lr is a group attribute: should trigger a warning.
    o.initial_lr = 0.2

    # If they're added later then they should print a warning
    model.setOptimizer(o)
    model.run()
    testlog = helpers.LogChecker(capfd)
    testlog.assert_matches("unexpected optimizer attribute")
    testlog.assert_no_matches(r"unexpected group \d attribute")


@pytest.mark.parametrize("use_tf_variant", [True, False])
def test_rmsprop_tf_variant(use_tf_variant):
    """Compare RMSprop with and without ``use_tf_variant``.

    Two models start from identical weights, bias, input and label. One uses
    the default RMSprop, the other receives ``use_tf_variant``. After five
    runs the results must differ when the flag is True and be equal when it
    is False.
    """
    torch.manual_seed(0)
    # Make sure the TF flag is propagated correctly by comparing the
    # results of TF and non-TF versions.
    weight = torch.randn(10, 10)
    bias = torch.randn(10)
    input_data = torch.randn(1, 10)
    label = torch.randint(0, 10, [1])

    def buildModel(**rmsprop_args):
        # Each model gets its own copies of the shared initial tensors.
        built = OptimizerTestModel()
        built.model.base_model.weight = torch.nn.Parameter(
            weight.detach().clone())
        built.model.base_model.bias = torch.nn.Parameter(
            bias.detach().clone())
        built.input = input_data.detach().clone()
        built.label = label.detach().clone()
        built.setOptimizer(
            poptorch.optim.RMSprop(built.parameters(),
                                   lr=0.02,
                                   **rmsprop_args))
        return built

    model_pt = buildModel()
    model_tf = buildModel(use_tf_variant=use_tf_variant)

    # Both models must start from the same parameters.
    helpers.assert_allequal(actual=model_pt.model.base_model.weight.data,
                            expected=model_tf.model.base_model.weight.data)
    helpers.assert_allequal(actual=model_pt.model.base_model.bias.data,
                            expected=model_tf.model.base_model.bias.data)

    for _ in range(5):
        out_pt, loss_pt = model_pt.run()
        out_tf, loss_tf = model_tf.run()

    if not use_tf_variant:
        helpers.assert_allequal(
            actual=model_pt.model.base_model.weight.detach().clone(),
            expected=model_tf.model.base_model.weight.detach().clone())
        helpers.assert_allequal(actual=out_pt, expected=out_tf)
        helpers.assert_allequal(actual=loss_pt, expected=loss_tf)
    else:
        assert not torch.allclose(model_pt.model.base_model.weight.data,
                                  model_tf.model.base_model.weight.data)
        assert not torch.allclose(out_pt, out_tf)
        assert not torch.allclose(loss_pt, loss_tf)


@pytest.mark.parametrize("opt", all_optimizers)
def test_optimizer_results(opt):
    """Compare direct optimizer stepping with ``poptorch.trainingModel``.

    Two deep copies of the same model are trained for ten steps on the same
    input with the same optimizer class and learning rate. Parameters are
    compared after every step and the recorded losses at the end.
    """
    torch.manual_seed(42)

    class Stepper:
        """Keeps a directly-stepped copy of the model next to a copy wrapped
        in ``poptorch.trainingModel`` and advances them one step at a time."""

        def __init__(self, model, lr, optimizer):
            self.lr = lr
            self.setup_cpu(model, optimizer)
            self.setup_ipu(model, optimizer)
            self.check_parameters()

        def setup_cpu(self, model, optimizer):
            cpu_copy = copy.deepcopy(model)
            self.cpu_model = cpu_copy
            self.optimizer = optimizer(cpu_copy.parameters(), lr=self.lr)

        def setup_ipu(self, model, optimizer):
            ipu_copy = copy.deepcopy(model)
            self.ipu_model = ipu_copy
            self.training_model = poptorch.trainingModel(
                ipu_copy,
                optimizer=optimizer(ipu_copy.parameters(), lr=self.lr))

        def check_parameters(self):
            # Compare the parameters pairwise, in iteration order.
            pairs = zip(self.cpu_model.named_parameters(),
                        self.ipu_model.named_parameters())
            for (_, cpu), (_, ipu) in pairs:
                helpers.assert_allclose(actual=ipu, expected=cpu)

        def cpu_step(self, batch):
            self.optimizer.zero_grad()
            _, raw_loss = self.cpu_model(batch)
            loss = raw_loss.sum()
            loss.backward()
            self.optimizer.step()
            return loss

        def ipu_step(self, batch):
            _, ipu_loss = self.training_model(batch)
            return ipu_loss

    lr = 0.01
    num_steps = 10
    num_samples = 10
    X = torch.rand(num_samples)

    cpu_loss = torch.empty(num_steps)
    ipu_loss = torch.empty(num_steps)

    reference_model = helpers.ModelWithWeights(torch.nn.LogSoftmax(), X.shape)
    stepper = Stepper(reference_model, lr=lr, optimizer=opt)

    for step in range(num_steps):
        cpu_loss[step] = stepper.cpu_step((X, ))
        ipu_loss[step] = stepper.ipu_step((X, ))

        stepper.check_parameters()

    helpers.assert_allclose(expected=cpu_loss,
                            actual=ipu_loss,
                            atol=1e-5,
                            rtol=1e-5)


@pytest.mark.parametrize("opt", [(optim.SGD, poptorch.optim.SGD),
                                 (optim.Adam, poptorch.optim.Adam),
                                 (optim.AdamW, poptorch.optim.AdamW)],
                         ids=['SGD', 'Adam', 'AdamW'])
def test_gradient_clipping(opt):
    """Compare ``max_grad_norm`` with ``torch.nn.utils.clip_grad_norm_``.

    ``opt`` is a pair of optimizer classes. The first one is stepped
    directly, with the gradient norm clipped explicitly; the second one is
    given ``max_grad_norm`` and used through ``poptorch.trainingModel``.
    Parameters are compared after every step and the losses at the end.
    """
    torch.manual_seed(42)
    max_norm = 0.001

    class Stepper:
        """Advances a directly-stepped copy and a ``poptorch.trainingModel``
        copy of the same model side by side."""

        def __init__(self, model, lr, optimizer):
            self.lr = lr
            self.original_model = model
            torch_optimizer, poptorch_optimizer = optimizer[0], optimizer[1]
            self.setup_torch(model, torch_optimizer)
            self.setup_poptorch(model, poptorch_optimizer)
            self.check_parameters()

        def setup_torch(self, model, optimizer):
            torch_copy = copy.deepcopy(model)
            self.torch_model = torch_copy
            self.optimizer = optimizer(torch_copy.parameters(), lr=self.lr)

        def setup_poptorch(self, model, optimizer):
            ipu_copy = copy.deepcopy(model)
            self.ipu_model = ipu_copy
            # Clipping is configured on the optimizer itself.
            clipping_optimizer = optimizer(ipu_copy.parameters(),
                                           lr=self.lr,
                                           max_grad_norm=max_norm)
            self.training_model = poptorch.trainingModel(
                ipu_copy, optimizer=clipping_optimizer)

        def check_parameters(self):
            # Compare the parameters pairwise, in iteration order.
            pairs = zip(self.torch_model.named_parameters(),
                        self.training_model.named_parameters())
            for (_, expected), (_, actual) in pairs:
                helpers.assert_allclose(actual=actual,
                                        expected=expected,
                                        atol=1e-5,
                                        rtol=1e-5)

        def torch_step(self, batch):
            self.optimizer.zero_grad()
            _, raw_loss = self.torch_model(batch)
            loss = raw_loss.sum()
            loss.backward()

            # Explicit clipping between backward() and step().
            torch.nn.utils.clip_grad_norm_(self.torch_model.parameters(),
                                           max_norm)

            self.optimizer.step()
            return loss

        def poptorch_step(self, batch):
            _, step_loss = self.training_model(batch)
            return step_loss

    lr = 0.01
    num_steps = 10
    num_samples = 10
    X = torch.randn(num_samples)

    torch_loss = torch.empty(num_steps)
    poptorch_loss = torch.empty(num_steps)

    reference_model = helpers.ModelWithWeights(torch.nn.LogSoftmax(), X.shape)
    stepper = Stepper(reference_model, lr=lr, optimizer=opt)

    for step in range(num_steps):
        torch_loss[step] = stepper.torch_step((X, ))
        poptorch_loss[step] = stepper.poptorch_step((X, ))

        stepper.check_parameters()

    helpers.assert_allclose(expected=torch_loss,
                            actual=poptorch_loss,
                            atol=1e-5,
                            rtol=1e-5)


# TODO(T53152): remove this test.
def test_gradient_clipping_with_pipelining():
    """Smoke test: ``max_grad_norm`` with a model split over two blocks.

    The two linear layers are wrapped in ``poptorch.BeginBlock`` with
    ``ipu_id`` 0 and 1, and gradient accumulation is set to 3. There are no
    explicit assertions; the test passes if the training call does not
    raise.
    """
    torch.manual_seed(0)
    opts = poptorch.Options()
    opts.Training.gradientAccumulation(3)

    class Model(torch.nn.Module):
        """Two ``Linear(3, 3)`` blocks followed by a mean NLL loss."""

        def __init__(self):
            super().__init__()
            first = torch.nn.Linear(3, 3)
            self.w0 = poptorch.BeginBlock(first, "w0", ipu_id=0)
            second = torch.nn.Linear(3, 3)
            self.w1 = poptorch.BeginBlock(second, "w1", ipu_id=1)
            self.loss = torch.nn.NLLLoss(reduction="mean")

        def forward(self, x, y):
            x = self.w1(self.w0(x))
            return x, self.loss(x, y)

    model = Model()
    optimizer = poptorch.optim.SGD(model.parameters(),
                                   lr=0.01,
                                   max_grad_norm=0.001)
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    inputs = torch.randn((15, 3, 3))
    targets = torch.randint(0, 1, (15, 3))
    poptorch_model(inputs, targets)


@pytest.mark.parametrize("optim", poptorch_optimizers)
def test_read_ipu_state(optim):
    """Check the content of ``optimizer.state_dict()`` around compilation.

    The state dict must be empty before compilation and populated after it.
    The values given for the learning rate, weight decay and loss scaling
    are then read back from the ``ipu_param`` entries. For optimizers other
    than SGD, the ``ipu_state`` tensors must also change after one training
    step.
    """
    torch.manual_seed(42)
    sample = torch.randn(3)
    # A simple model with weights and a loss function
    model = helpers.ModelWithWeights(lambda x: x, sample.shape)

    lr, wd, ls = 0.05, 0.025, 0.75

    optimizer = optim(model.parameters(),
                      lr=lr,
                      weight_decay=wd,
                      loss_scaling=ls)
    training_model = poptorch.trainingModel(model, optimizer=optimizer)

    # Before the model is compiled, the state_dict should be empty
    assert_is_ipu_optimizer_state(optimizer.state_dict(),
                                  should_be_empty=True)

    # Compiling should populate the state_dict
    training_model.compile((sample, ))
    s0 = optimizer.state_dict()
    assert_is_ipu_optimizer_state(s0, should_be_empty=False)
    ipu_param = s0["ipu_param"]

    # Check that shared keys are present and user provided values are read
    # back correctly
    if isinstance(optimizer, torch.optim.SGD):
        sgd_param_keys = (
            "scaledLearningRate0___specific___lin.bias",
            "scaledLearningRate0___specific___lin.weight",
            "weightDecayScaleFactor0___specific___lin.bias",
            "weightDecayScaleFactor0___specific___lin.weight",
        )
        for key in sgd_param_keys:
            assert key in ipu_param

        # weightDecayScaleFactor0 =
        # 1 - lr * (1 - dm) * wd, dm = 0
        expected_wdsf0 = torch.tensor(1 - lr * wd)
        helpers.assert_allclose(
            actual=ipu_param["weightDecayScaleFactor0___specific___lin.bias"],
            expected=expected_wdsf0)

        # scaledLearningRate0 =
        # lr *  (1 - dm) / ls, dm = 0
        expected_slr0 = torch.tensor(lr / ls)
        helpers.assert_allclose(
            actual=ipu_param["scaledLearningRate0___specific___lin.bias"],
            expected=expected_slr0)
    else:
        non_sgd_param_keys = (
            "learningRate___specific___lin.bias",
            "learningRate___specific___lin.weight",
            "weightDecay___specific___lin.bias",
            "weightDecay___specific___lin.weight",
        )
        for key in non_sgd_param_keys:
            assert key in ipu_param

        # Only non-SGD optimisers have state tensors
        for key in ("Accl1___lin.weight", "Accl1___lin.bias"):
            assert key in s0["ipu_state"]

        helpers.assert_allclose(
            actual=ipu_param["learningRate___specific___lin.bias"],
            expected=torch.tensor(lr))
        helpers.assert_allclose(
            actual=ipu_param["weightDecay___specific___lin.bias"],
            expected=torch.tensor(wd))

        # Run the model, get the updated state dict and check optimiser state tensors have changed
        training_model((sample, ))
        s1 = optimizer.state_dict()
        assert_is_ipu_optimizer_state(s1, should_be_empty=False)
        # At least one state tensor must differ from its pre-run value.
        assert any(not torch.equal(s0["ipu_state"][key], s1["ipu_state"][key])
                   for key in s0["ipu_state"])

    helpers.assert_allclose(actual=ipu_param["lossScaling_FLOAT"],
                            expected=torch.tensor(ls))


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_read_ipu_state_cached(caplog, capfd):
    """Check that back-to-back optimiser state reads reuse the cached dict.

    The first ``state_dict()`` call after compilation is expected to log an
    IPU->host copy; the second call is expected to log that the cached
    dictionary was used instead.
    """
    sample = torch.ones(3)
    # Minimal trainable model: weights plus a loss function
    net = helpers.ModelWithWeights(lambda x: x, sample.shape)
    optimizer = poptorch.optim.SGD(net.parameters(), lr=0.0)
    training_model = poptorch.trainingModel(net, optimizer=optimizer)
    training_model.compile((sample, ))

    # First read after compilation: expected to fetch the state from the IPU
    first_read = optimizer.state_dict()
    assert_is_ipu_optimizer_state(first_read, should_be_empty=False)
    helpers.LogChecker(capfd).assert_matches(
        "Writing optimiser state tensors from IPU to host.")

    # Nothing ran since the first read, so the internal optimiser state is
    # unchanged and the cached dictionary should be served
    second_read = optimizer.state_dict()
    assert_is_ipu_optimizer_state(second_read, should_be_empty=False)
    assert "Using cached optimiser state dict" in caplog.text


@unittest.mock.patch.dict("os.environ", helpers.disableAllModels())
def test_read_ipu_state_offline():
    """Check the optimiser state is empty when compiling for an offline IPU."""
    sample = torch.ones(3)
    # Minimal trainable model: weights plus a loss function
    net = helpers.ModelWithWeights(lambda x: x, sample.shape)
    optimizer = poptorch.optim.SGD(net.parameters(), lr=0.0)

    offline_opts = poptorch.Options()
    offline_opts.useOfflineIpuTarget()
    training_model = poptorch.trainingModel(net,
                                            offline_opts,
                                            optimizer=optimizer)
    training_model.compile((sample, ))

    # The test expects nothing to be read back after an offline compilation
    offline_state = optimizer.state_dict()
    assert_is_ipu_optimizer_state(offline_state, should_be_empty=True)


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
@pytest.mark.parametrize("optim", [poptorch.optim.SGD, torch.optim.SGD])
def test_read_ipu_state_on_detach(caplog, capfd, optim):
    """Check optimiser state transfers around detach / attach.

    For PopTorch optimizers the test expects: an IPU->host copy logged on
    detach, a cached read on the following ``state_dict()`` call, and a
    host->IPU upload logged on re-attach after ``load_state_dict()``.
    For plain torch optimizers none of these messages must appear.
    """
    input = torch.ones(3)
    # A simple model with weights and a loss function
    model = helpers.ModelWithWeights(lambda x: x, input.shape)
    optimizer = optim(model.parameters(), lr=0.0)

    training_model = poptorch.trainingModel(model, optimizer=optimizer)

    training_model.compile((input, ))
    training_model.detachFromDevice()

    # Detach should trigger an optimiser state IPU->host copy for PopTorch
    # optimizers (and no copy at all for torch optimizers)
    log = helpers.LogChecker(capfd)
    if isinstance(optimizer, poptorch.optim.Optimizer):
        log.assert_matches("Writing optimiser state tensors from IPU to host.")
    else:
        log.assert_no_matches(
            "Writing optimiser state tensors from IPU to host.")

    # The state was already copied to the host on detach, so reading it now
    # should use the cached state dict (PopTorch optimizers only).
    # Note this check goes through caplog rather than capfd.
    state = optimizer.state_dict()
    log = helpers.LogChecker(caplog.text)
    if isinstance(optimizer, poptorch.optim.Optimizer):
        log.assert_matches("Using cached optimiser state dict")
    else:
        log.assert_no_matches("Using cached optimiser state dict")

    optimizer.load_state_dict(state)

    training_model.attachToDevice()
    # Attach should trigger an optimiser state host->IPU upload for PopTorch
    # optimizers, since a state dict was loaded while detached
    log = helpers.LogChecker(capfd)
    if isinstance(optimizer, poptorch.optim.Optimizer):
        log.assert_matches(
            "Writing optimiser state tensors from host to IPU memory")
    else:
        log.assert_no_matches(
            "Writing optimiser state tensors from host to IPU memory")


@pytest.mark.parametrize("optim", poptorch_optimizers)
@pytest.mark.parametrize("incomplete_state", [True, False])
def test_write_ipu_state(optim, incomplete_state):
    """Check that a modified optimiser state dict can be loaded and read back.

    The state dict obtained after compilation is filled with random values
    (optionally after removing the first entry of both ``ipu_param`` and
    ``ipu_state``), loaded, and read back. The model is then run once and the
    test checks that parameters kept the loaded values, that removed entries
    reappeared, and that the state tensors changed.
    """
    torch.manual_seed(42)
    sample = torch.randn(3)
    # Minimal trainable model: weights plus a loss function
    net = helpers.ModelWithWeights(lambda x: x, sample.shape)

    # SGD insists on an explicit LR; the actual value is irrelevant here
    optimizer = optim(net.parameters(), lr=0.0)
    # Hacky way to make sure all the attributes are set to variable.
    optimizer.variable_attrs._variable_attributes = copy.deepcopy(  # pylint: disable=protected-access
        optimizer.variable_attrs._allowed_attributes)  # pylint: disable=protected-access

    training_model = poptorch.trainingModel(net, optimizer=optimizer)

    # Compilation is what populates the state dict
    training_model.compile((sample, ))
    loaded = optimizer.state_dict()

    # (key, tensor) pairs removed from the dict, when testing partial states
    removed_param = removed_state = None
    if incomplete_state:
        removed_param = next(iter(loaded["ipu_param"].items()))
        del loaded["ipu_param"][removed_param[0]]
        removed_state = next(iter(loaded["ipu_state"].items()))
        del loaded["ipu_state"][removed_state[0]]

    # Overwrite every remaining entry with random data so changes are visible
    for section in ("ipu_param", "ipu_state"):
        for key, tensor in loaded[section].items():
            loaded[section][key] = torch.randn_like(tensor)

    optimizer.load_state_dict(loaded)
    readback = optimizer.state_dict()

    # Straight after loading, the values must match and the removed keys must
    # still be missing: the optimizer hasn't been used yet, so nothing has
    # restored them.
    for section, removed in (("ipu_param", removed_param),
                             ("ipu_state", removed_state)):
        for key, tensor in loaded[section].items():
            helpers.assert_allequal(actual=readback[section][key],
                                    expected=tensor)
        if removed is not None:
            assert removed[0] not in readback[section]

    # Running the model should merge the loaded state with the existing one
    training_model((sample, ))
    merged = optimizer.state_dict()

    # Parameters: loaded values are kept, removed entries are back
    for key, tensor in loaded["ipu_param"].items():
        helpers.assert_allequal(actual=merged["ipu_param"][key],
                                expected=tensor)
    if removed_param is not None:
        key, tensor = removed_param
        helpers.assert_allequal(actual=merged["ipu_param"][key],
                                expected=tensor)

    # State tensors were updated by the run, so only check that they changed
    for key, tensor in loaded["ipu_state"].items():
        assert not torch.allclose(tensor, merged["ipu_state"][key])
    if removed_state is not None:
        key, tensor = removed_state
        assert not torch.allclose(tensor, merged["ipu_state"][key])


def test_write_ipu_state_from_cpu():
    """Check that a torch (CPU) optimiser state is rejected by PopTorch."""
    sample = torch.ones(2)
    linear = torch.nn.Linear(2, 1)
    cpu_optimizer = torch.optim.Adam(linear.parameters())

    # One CPU training step so the torch optimiser actually holds some state
    linear(sample).backward()
    cpu_optimizer.step()

    ipu_optimizer = poptorch.optim.Adam(linear.parameters())
    cpu_state = cpu_optimizer.state_dict()
    # Loading a state that did not come from the IPU must fail
    expected_error = "Only IPU optimizer states can be loaded onto the IPU."
    with pytest.raises(RuntimeError, match=expected_error):
        ipu_optimizer.load_state_dict(cpu_state)


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_write_ipu_state_before_override(capfd):
    """Check that a state loaded before compilation is kept by compilation.

    The state of a trained optimiser is loaded into a fresh optimiser, which
    is then used to compile a new training model. The state read back
    afterwards must equal the loaded one.
    """
    sample = torch.ones(2)
    net = helpers.ModelWithWeights(lambda x: x, sample.shape)
    first_optimizer = poptorch.optim.Adam(net.parameters())
    first_model = poptorch.trainingModel(net, optimizer=first_optimizer)

    # A single call both compiles and runs the model
    first_model((sample, ))
    trained_state = first_optimizer.state_dict()

    # The wrapper has to go before the module can be wrapped again
    first_model.destroy()

    # Fresh optimiser initialised from the trained state
    new_optimizer = poptorch.optim.Adam(net.parameters())
    new_optimizer.load_state_dict(trained_state)

    # Fresh training model compiled with the pre-loaded optimiser
    new_training_model = poptorch.trainingModel(net, optimizer=new_optimizer)
    new_training_model.compile((sample, ))

    # What is read back must be what was loaded
    compiled_state = new_optimizer.state_dict()
    for key, expected in trained_state["ipu_state"].items():
        helpers.assert_allequal(actual=compiled_state["ipu_state"][key],
                                expected=expected)

    # Make sure the values really came from an IPU->host copy
    helpers.LogChecker(capfd).assert_matches(
        "Writing optimiser state tensors from IPU to host.")


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_LR_scheduler(capfd):
    """Check which transfers are logged when using an LR scheduler.

    Each ``helpers.LogChecker(capfd)`` below is built right after the step it
    checks, so the order of the statements matters. The sequence checked is:
    initial optimizer upload, optimizer update after a scheduler step (without
    a weights sync), state read-back, a run with no transfer, and finally a
    state upload preceded by a weights backup after ``load_state_dict()``.
    """
    input = torch.ones(2)
    model = helpers.ModelWithWeights(lambda x: x, input.shape)
    optimizer = poptorch.optim.Adam(model.parameters(), lr=1.0)
    # Halve the LR after each training step
    scheduler = ExponentialLR(optimizer, 0.5)
    training_model = poptorch.trainingModel(model, optimizer=optimizer)

    # Compile and run the model
    training_model((input, ))

    log = helpers.LogChecker(capfd)
    # Initial optimizer upload
    log.assert_matches("Updating group 0 optimizer")

    # Step the scheduler for the next epoch
    # lr: 1.0 -> 0.5
    scheduler.step()

    # Set the new LR
    training_model.setOptimizer(optimizer)

    log = helpers.LogChecker(capfd)
    log.assert_matches("Updating group 0 optimizer")
    # Updating the optimizer's parameter shouldn't trigger a sync of the weights.
    log.assert_no_matches("copyWeightsToHost()")

    # Run the model to use the new optimizer.
    training_model((input, ))

    # Keep a copy of the state so it can be loaded back further down
    s0 = optimizer.state_dict()

    log = helpers.LogChecker(capfd)
    log.assert_matches("Writing optimiser state tensors from IPU to host")

    # Run the model to make the IPU state dirty.
    training_model((input, ))

    log = helpers.LogChecker(capfd)
    # No data transfer should happen.
    log.assert_no_matches("Writing")

    optimizer.load_state_dict(s0)

    # Run the model to trigger the transfers.
    training_model((input, ))

    # The iterator form is used because the two messages must appear in order
    log = helpers.LogChecker(capfd).createIterator()
    # Updating the optimizer's state should trigger a backup of the IPU weights first.
    log.findNext("copyWeightsToHost()")
    # Then the new state should be uploaded
    log.findNext("Writing optimiser state tensors from host to IPU memory")


def test_write_ipu_state_from_checkpoint():
    """Check an optimiser state survives a ``torch.save`` / ``torch.load`` trip.

    The state is saved after one scheduler step (lr 1.0 -> 0.5), reloaded into
    a new optimizer created with a different LR, and training continues. The
    test then expects both the torch LR and the internal IPU LR to be 0.5.
    """
    sample = torch.ones(2)
    net = helpers.ModelWithWeights(lambda x: x, sample.shape)
    optimizer = poptorch.optim.Adam(net.parameters(), lr=1.0)
    # Exponential decay with a factor of 0.5 per scheduler step
    scheduler = ExponentialLR(optimizer, 0.5)
    training_model = poptorch.trainingModel(net, optimizer=optimizer)

    # First call compiles and runs the model
    training_model((sample, ))
    # lr: 1.0 -> 0.5, then push the new LR to the model
    scheduler.step()
    training_model.setOptimizer(optimizer)
    saved_state = optimizer.state_dict()

    with tempfile.TemporaryDirectory() as tmp_dir:
        checkpoint_path = os.path.join(tmp_dir, "checkpoint.pt")
        # Round trip through a checkpoint file
        torch.save({"optimizer_state_dict": saved_state}, checkpoint_path)
        checkpoint = torch.load(checkpoint_path)

        # New optimizer (deliberately different LR) restored from the file
        optimizer = poptorch.optim.Adam(net.parameters(), lr=0.1)
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        # The restored state must be identical to the saved one
        restored_state = optimizer.state_dict()
        for section in ("ipu_state", "ipu_param"):
            for key, expected in saved_state[section].items():
                helpers.assert_allequal(actual=restored_state[section][key],
                                        expected=expected)

        # Resume training with the restored optimizer
        scheduler = ExponentialLR(optimizer, 0.5)
        training_model.setOptimizer(optimizer)
        # The new LR is only set internally once the model runs
        training_model((sample, ))
        final_state = optimizer.state_dict()
        torch_lr = torch.tensor(final_state["param_groups"][0]["lr"])
        poptorch_lr = final_state["ipu_param"][
            'learningRate___specific___lin.bias']
        # The torch-side LR must be the restored one ...
        helpers.assert_allclose(actual=torch_lr, expected=torch.tensor(0.5))
        # ... and the IPU-side LR must agree with it
        helpers.assert_allclose(actual=poptorch_lr, expected=torch_lr)


def test_setOptimizer_frozen_options_ok():
    """Check ``setOptimizer`` accepts an LR change when options are frozen.

    The optimizer uses ``accum_type=torch.half`` from the start, and the test
    asserts that the compiled model uses the ``Running`` reduction strategy
    although ``Post`` was requested. Only the learning rate is changed
    afterwards, which must not raise.
    """
    sample = torch.ones(2)
    opts = poptorch.Options()
    opts.Training.setMeanAccumulationAndReplicationReductionStrategy(
        poptorch.MeanReductionStrategy.Post)

    net = helpers.ModelWithWeights(lambda x: x, sample.shape)
    # Handing the options to a DataLoader freezes them
    loader = poptorch.DataLoader(opts, [(torch.ones(2), )])

    optimizer = poptorch.optim.Adam(net.parameters(),
                                    lr=0.5,
                                    accum_type=torch.half)

    # With this optimizer the strategy is expected to become Running
    training_model = poptorch.trainingModel(net,
                                            optimizer=optimizer,
                                            options=opts)
    batch = tuple(next(iter(loader)))
    training_model.compile(batch)
    training_opts = training_model.options.Training
    assert (training_opts.meanAccumulationAndReplicationReductionStrategy ==
            poptorch.MeanReductionStrategy.Running)

    # Changing only the LR must be accepted
    optimizer.param_groups[0]['lr'] = 0.01
    training_model.setOptimizer(optimizer)


def test_setOptimizer_frozen_options_broken():
    """Check ``setOptimizer`` rejects an ``accum_type`` change after compile.

    Here the optimizer is created without ``accum_type``, and the test
    asserts that the compiled model keeps the requested ``Post`` reduction
    strategy. Switching ``accum_type`` to half afterwards is expected to raise
    a ``ValueError`` mentioning that the model is already compiled.
    """
    sample = torch.ones(2)
    opts = poptorch.Options()
    opts.Training.setMeanAccumulationAndReplicationReductionStrategy(
        poptorch.MeanReductionStrategy.Post)

    net = helpers.ModelWithWeights(lambda x: x, sample.shape)
    # Handing the options to a DataLoader freezes them
    loader = poptorch.DataLoader(opts, [(torch.ones(2), )])

    optimizer = poptorch.optim.Adam(net.parameters(), lr=0.5)

    # With this optimizer the requested Post strategy is expected to be kept
    training_model = poptorch.trainingModel(net,
                                            optimizer=optimizer,
                                            options=opts)
    batch = tuple(next(iter(loader)))
    training_model.compile(batch)
    training_opts = training_model.options.Training
    assert (training_opts.meanAccumulationAndReplicationReductionStrategy ==
            poptorch.MeanReductionStrategy.Post)

    # Changing the accumulation type on a compiled model must be refused
    optimizer.param_groups[0]['lr'] = 0.01
    optimizer.accum_type = torch.half
    with pytest.raises(ValueError, match="is already compiled"):
        training_model.setOptimizer(optimizer)


@helpers.printCapfdOnExit
@pytest.mark.parametrize("opt", all_optimizers)
@pytest.mark.parametrize("subclassed", [True, False])
def test_optimizer_warnings(capfd, opt, subclassed):
    """Check the "Python optimizer code" warning is only logged for subclasses.

    When ``subclassed`` is true the optimizer class is wrapped in a trivial
    subclass and the warning is expected; otherwise it must be absent.
    """
    # The NoBias classes defined in this file are themselves subclasses and
    # would be warned about, so the 'not subclassed' case is skipped for them
    if not subclassed and 'NoBias' in str(opt):
        pytest.skip()

    if subclassed:

        class SubclassedOpt(opt):
            pass

        opt = SubclassedOpt

    model = OptimizerTestModel()
    model.setOptimizer(opt(model.parameters(), lr=0.01))

    testlog = helpers.LogChecker(capfd)
    # Pick the assertion matching the expectation for this parametrization
    check = (testlog.assert_contains
             if subclassed else testlog.assert_not_contains)
    check("Poptorch does not run Python optimizer code directly")


================================================
FILE: tests/options_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import copy
import unittest.mock
import tempfile
import os
import threading
import torch
import torch.nn as nn
import pytest
import helpers
import poptorch
from poptorch.enums import OutputMode, MeanReductionStrategy


def test_set_options():
    """Set a handful of options and check that they pass validation."""
    # pylint: disable=protected-access

    opts = poptorch.Options()
    opts.outputMode(poptorch.enums.OutputMode.All)

    # Each setter is called on the value returned by the previous one, which
    # is what the chained form does.
    with tempfile.TemporaryDirectory() as log_dir:
        configured = opts.deviceIterations(1)
        configured = configured.setExecutionStrategy(
            poptorch.PipelinedExecution())
        configured = configured.replicationFactor(1)
        configured = configured.logDir(log_dir)
        configured = configured.enableSyntheticData(True)
        configured.maxRepeatLogs(None)

    options_dict = opts.toDict()
    poptorch.poptorch_core._validateOptions(options_dict)


class TestSetOptionsFromEnvironment:
    """Checks that we can set options through environment variables"""

    @staticmethod
    def _options_from_env(default_options):
        """Create ``poptorch.Options`` with POPTORCH_DEFAULT_OPTIONS set.

        The variable is only set for the duration of the constructor call.
        ``patch.dict`` restores the previous environment on exit (including
        when the constructor raises), so a value that was already present
        before the test is put back rather than deleted.

        Args:
            default_options: JSON string to store in POPTORCH_DEFAULT_OPTIONS.

        Returns:
            The ``poptorch.Options`` created while the variable was set.
        """
        with unittest.mock.patch.dict(
                os.environ, {"POPTORCH_DEFAULT_OPTIONS": default_options}):
            return poptorch.Options()

    def test_block(self):
        """Several plain options given as a single JSON object."""
        ref = poptorch.Options()
        opts = poptorch.Options()
        # Just set a bunch of options and check they're successfully parsed.
        with tempfile.TemporaryDirectory() as tmp:
            opts.deviceIterations(2).replicationFactor(1).logDir(
                tmp).enableSyntheticData(True)
            # Created while the temporary directory still exists
            init_set = self._options_from_env(
                '{"deviceIterations":2,'
                f'"replicationFactor":1,"logDir":"{tmp}",'
                '"enableSyntheticData":true}')

        assert f"{ref}" != f"{opts}"
        assert f"{opts}" == f"{init_set}"

    def test_dotted_access(self):
        """An option living in a sub-group, addressed as 'Group.option'."""
        ref = poptorch.Options()
        opts = poptorch.Options()
        opts.Precision.enableFloatingPointExceptions(True)
        init_set = self._options_from_env(
            '{"Precision.enableFloatingPointExceptions":true}')

        assert f"{ref}" != f"{init_set}"
        assert f"{opts}" == f"{init_set}"

    def test_enum_conversion(self):
        """An enum value given by its name as a string."""
        ref = poptorch.Options()
        opts = poptorch.Options()
        opts.connectionType(poptorch.ConnectionType.OnDemand)
        init_set = self._options_from_env(
            '{"connectionType":"ConnectionType.OnDemand"}')

        assert f"{ref}" != f"{init_set}"
        assert f"{opts}" == f"{init_set}"

    def test_setting_popart_options(self):
        """A PopART option, with the arguments of ``set`` given as a list."""
        ref = poptorch.Options()
        opts = poptorch.Options()
        # pylint: disable=protected-access
        opts._Popart.set("saveInitializersToFile", "my_file.onnx")
        init_set = self._options_from_env(
            '{"_Popart.set":["saveInitializersToFile", "my_file.onnx"]}')

        assert f"{ref}" != f"{init_set}"
        assert f"{opts}" == f"{init_set}"


@pytest.mark.parametrize("key, value, expected_str", [
    ("asdfasdf", True, r"Unknown .* option .*"),
    ("dotChecks", torch.empty(1, 1), r"Unknown value type .* for option .*"),
    ("asdfasdf", torch.empty(
        1, 1), r"(Unknown .* option .*|Unknown value type .* for option .*)"),
])
def test_invalid_options(key, value, expected_str):
    """Check that validation rejects bad PopART option names and value types.

    ``expected_str`` is the regular expression the error message must match.
    """
    # pylint: disable=protected-access
    options = poptorch.Options()
    options.outputMode(poptorch.enums.OutputMode.All)

    # Setting the option is accepted; the error is expected at validation
    options._Popart.set(key, value)

    with pytest.raises(poptorch.Error, match=expected_str):
        poptorch.poptorch_core._validateOptions(options.toDict())


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_set_options_from_file(capfd):
    """Check options can be loaded from a config file.

    A config file is written, loaded, and then made malformed to check the
    error reporting of ``loadFromFile``. The options obtained from the first
    (valid) load are finally used to run a model, and the log is checked to
    make sure they were applied.
    """

    class LogChecker(helpers.LogChecker):
        def validate(self):
            # pylint: disable=line-too-long
            self.assert_contains(
                "poptorch.Options set replication_factor to value 1")
            self.assert_contains(
                "poptorch.Options set device_iterations to value 1")
            self.assert_contains(
                "poptorch.Options set execution_mode to value 1")
            self.assert_contains(
                "poptorch.Options set syntheticDataMode to value 2")

    class Network(nn.Module):
        def forward(self, x, y):
            return x + y

    options_list = [
        "deviceIterations(1)",
        "setExecutionStrategy(poptorch.ShardedExecution())",
        "  replicationFactor(1)",  # Whitespace should be stripped
        " ",  # Empty lines should be skipped
        "enableSyntheticData(True) # Inline comments should be ignored",
        "# Comments should be ignored"
    ]
    options_list = "\n".join(options_list)

    with tempfile.TemporaryDirectory() as tmp:
        filepath = os.path.join(tmp, "tmp.conf")
        # Write the options to file. The context manager guarantees the
        # handle is closed (and the content flushed) even if write() raises.
        with open(filepath, "w") as f:
            f.write(options_list)

        opts = poptorch.Options()
        # Read the options back
        opts.loadFromFile(filepath)

        # Ensure that a useful error message is output on malformed input:
        # append a call with a missing closing parenthesis
        with open(filepath, "a") as f:
            f.write("\noutputMode(poptorch.OutputMode.All")
        with pytest.raises(poptorch.options.ConfigFileError) as e:
            opts.loadFromFile(filepath)
        # NOTE(review): the "unexpected EOF while parsing" wording looks like
        # it comes from the Python parser and may differ between interpreter
        # versions - confirm against the supported Python versions.
        assert "SyntaxError at line 5 of tmp.conf: unexpected EOF " \
               "while parsing\n" \
               "> options.outputMode(poptorch.OutputMode.All" in str(e.value)

    # Create the model
    model = Network()
    inference_model = poptorch.inferenceModel(model, opts)

    x = torch.ones(2)
    y = torch.zeros(2)

    # Run the model
    inference_model(x, y)

    testlog = LogChecker(capfd)
    # Ensure the options were actually set
    testlog.validate()


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_override_options_from_file(capfd):
    """Check a second config file only overrides the options it sets.

    Two config files are loaded one after the other into the same
    ``poptorch.Options``. The second one only sets ``deviceIterations``, so
    every other option is expected to keep the value from the first file.
    """

    class LogChecker(helpers.LogChecker):
        def validate(self):
            # pylint: disable=line-too-long
            self.assert_contains(
                "poptorch.Options set replication_factor to value 2")
            self.assert_contains(
                "poptorch.Options set device_iterations to value 1")
            self.assert_contains(
                "poptorch.Options set execution_mode to value 1")
            self.assert_contains(
                "poptorch.Options set syntheticDataMode to value 2")

    class Network(nn.Module):
        def forward(self, x, y):
            return x + y

    options_list = [
        "deviceIterations(2)",
        "setExecutionStrategy(poptorch.ShardedExecution())",
        "replicationFactor(2)",
        "enableSyntheticData(True)",
    ]

    options_list_override = [
        "deviceIterations(1)",
    ]

    options_list = "\n".join(options_list)
    options_list_override = "\n".join(options_list_override)

    with tempfile.TemporaryDirectory() as tmp:
        # Write both option files. The context managers guarantee the handles
        # are closed (and the content flushed) even if write() raises.
        filepath = os.path.join(tmp, "tmp.conf")
        with open(filepath, "w") as f:
            f.write(options_list)

        filepath_override = os.path.join(tmp, "tmp_override.conf")
        with open(filepath_override, "w") as f:
            f.write(options_list_override)

        opts = poptorch.Options()
        # Read the options back
        opts.loadFromFile(filepath)
        # Read the override options
        opts.loadFromFile(filepath_override)

    # Create the model
    model = Network()
    inference_model = poptorch.inferenceModel(model, opts)

    assert inference_model.options.replication_factor == 2
    assert inference_model.options.device_iterations == 1

    x = torch.ones(2)
    y = torch.zeros(2)

    # Run the model
    inference_model(x, y)

    testlog = LogChecker(capfd)
    # Ensure the options were set correctly
    # The override should ONLY override options that were actually set
    testlog.validate()


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_set_popart_options(capfd):
    """Set PopART options of various value types and check the resulting log.

    The options are set one by one, the resulting dictionary is validated,
    and every expected log message is then looked up in the captured output.
    """
    # pylint: disable=protected-access

    opts = poptorch.Options()
    opts.outputMode(poptorch.enums.OutputMode.All)

    # (option name, value) pairs, applied in this order
    popart_settings = [
        ("hardwareInstrumentations", {0, 1}),
        ("dotChecks", ["FINAL", "ALL"]),
        ("engineOptions", {
            "debug.allowOutOfMemory": "true",
        }),
        ("reportOptions", {"reportOptA": "A", "reportOptB": "B"}),
        ("convolutionOptions", {"convOptA": "A", "convOptB": "B"}),
        ("matmulOptions", {"matOptA": "A", "matOptB": "B"}),
        ("lstmOptions", {"lstmOptA": "A", "lstmOptB": "B"}),
        ("gclOptions", {"gclOptA": "A", "gclOptB": "B"}),
        ("customCodelets", []),
        ("autoRecomputation", 1),
        ("enableOutlining", True),
        ("batchSerializationSettings.factor", 1),
        ("batchSerializationSettings.concatOnVirtualGraphChange", True),
        ("batchSerializationSettings.concatOnExecutionPhaseChange", True),
        ("batchSerializationSettings.concatOnPipelineStageChange", True),
        ("batchSerializationSettings.transformContext", 0),
        ("batchSerializationSettings.method", 0),
        ("batchSerializationSettings.batchSchedule", 1),
        ("accumulateOuterFragmentSettings.schedule", 1),
        ("accumulateOuterFragmentSettings.excludedVirtualGraphs",
         ["0", "1"]),
        ("enableExplicitIR", True),
        ("automaticLossScalingSettings.gradientTensorTrackingMethod", 1),
        ("updatableNamedBuffers", ["t1", "t2"]),
    ]
    for option_name, option_value in popart_settings:
        opts._Popart.set(option_name, option_value)

    poptorch.poptorch_core._validateOptions(opts.toDict())

    log = helpers.LogChecker(capfd)

    # Messages expected in the log, checked in this order
    expected_messages = [
        "poptorch.Options added 0 to hardwareInstrumentations",
        "poptorch.Options added 1 to hardwareInstrumentations",
        "poptorch.Options added FINAL to dotChecks",
        "poptorch.Options added ALL to dotChecks",
        "poptorch.Options set engineOptions[debug.allowOutOfMemory] to true",
        "poptorch.Options set reportOptions[reportOptA] to A",
        "poptorch.Options set reportOptions[reportOptB] to B",
        "poptorch.Options set convolutionOptions[convOptA] to A",
        "poptorch.Options set convolutionOptions[convOptB] to B",
        "poptorch.Options set matmulOptions[matOptA] to A",
        "poptorch.Options set matmulOptions[matOptB] to B",
        "poptorch.Options set lstmOptions[lstmOptA] to A",
        "poptorch.Options set lstmOptions[lstmOptB] to B",
        "poptorch.Options set gclOptions[gclOptA] to A",
        "poptorch.Options set gclOptions[gclOptB] to B",
        "poptorch.Options set autoRecomputation to value 1",
        "poptorch.Options set enableOutlining to value true",
        "poptorch.Options set batchSerializationSettings.factor to value 1",
        ("poptorch.Options set "
         "batchSerializationSettings.concatOnVirtualGraphChange "
         "to value true"),
        ("poptorch.Options set "
         "batchSerializationSettings.concatOnExecutionPhaseChange "
         "to value true"),
        ("poptorch.Options set "
         "batchSerializationSettings.concatOnPipelineStageChange "
         "to value true"),
        ("poptorch.Options set "
         "batchSerializationSettings.transformContext to value 0"),
        "poptorch.Options set batchSerializationSettings.method to value 0",
        ("poptorch.Options set batchSerializationSettings.batchSchedule "
         "to value 1"),
        ("poptorch.Options set accumulateOuterFragmentSettings.schedule "
         "to value 1"),
        ("poptorch.Options added 0 to "
         "accumulateOuterFragmentSettings.excludedVirtualGraphs"),
        ("poptorch.Options added 1 to "
         "accumulateOuterFragmentSettings.excludedVirtualGraphs"),
        "poptorch.Options set enableExplicitIR to value true",
        ("poptorch.Options set "
         "automaticLossScalingSettings.gradientTensorTrackingMethod "
         "to value 1"),
        "poptorch.Options added t1 to updatableNamedBuffers",
        "poptorch.Options added t2 to updatableNamedBuffers",
    ]
    for message in expected_messages:
        log.assert_contains(message)


def test_popart_patterns():
    """Check that options carrying PopART patterns pass validation."""
    # pylint: disable=protected-access

    opts = poptorch.Options()
    opts.outputMode(poptorch.enums.OutputMode.All)

    # Enable a single pattern, passing 0 as the second argument
    opts._Popart.setPatterns({"PadSum": True}, 0)

    options_dict = opts.toDict()
    poptorch.poptorch_core._validateOptions(options_dict)


@helpers.printCapfdOnExit
@pytest.mark.parametrize("dtype", [torch.half, torch.float])
@pytest.mark.parametrize("ptype", [torch.half, torch.float])
@helpers.overridePoptorchLogLevel("TRACE")
def test_popart_partials(capfd, dtype, ptype):
    """Check the partials type set in the options shows up in the log.

    A small linear + convolution model is run with the partials type set to
    ``ptype``; the log must then mention that type for both the matmul and
    the convolution settings.
    """
    # pylint: disable=protected-access
    torch.manual_seed(42)
    x = torch.randn((1, 16, 16), dtype=dtype)

    model = torch.nn.Sequential()
    model.add_module('lin', torch.nn.Linear(16, 16, dtype=dtype))
    model.add_module('conv', torch.nn.Conv1d(16, 16, 1))

    opts = poptorch.Options()
    opts.Precision.setPartialsType(ptype)
    poptorch_model = poptorch.inferenceModel(model, opts)
    poptorch_model(x)

    # Messages expected for the selected partials type, in checking order
    if ptype == torch.float:
        expected_messages = (
            'poptorch.Options set partialsTypeMatMuls to value float',
            'poptorch.Options set convolutionOptions[partialsType] to float',
            '"partialsType":"MatMulPartialsType::FLOAT"',
            '"partialsType[0]":"float"',
        )
    else:
        expected_messages = (
            'poptorch.Options set partialsTypeMatMuls to value half',
            'poptorch.Options set convolutionOptions[partialsType] to half',
            '"partialsType":"MatMulPartialsType::HALF"',
            '"partialsType[0]":"half"',
        )

    log = helpers.LogChecker(capfd)
    for message in expected_messages:
        log.assert_contains(message)


@pytest.mark.parametrize("optim", [
    poptorch.optim.SGD,
    poptorch.optim.Adam,
    poptorch.optim.AdamW,
    poptorch.optim.RMSprop,
    poptorch.optim.LAMB,
])
@pytest.mark.parametrize("initial_ls", [1.0, 2.0])
def test_automatic_loss_scaling(optim, initial_ls):
    """Check the final loss scale against the update factor in the state.

    With automatic loss scaling enabled, the anchored ``finalLossScale`` read
    after each step is expected to equal ``initial_ls`` multiplied by the
    ``lossScaleUpdateFactor`` read from the optimiser state before that step.
    """
    sample = torch.ones(5)
    # Minimal trainable model: weights plus a loss function
    net = helpers.ModelWithWeights(lambda x: x, sample.shape)
    # fp16 weights are required: fp32 gradients don't influence the loss
    # scaling factor
    net.half()
    opts = poptorch.Options()
    opts.Training.setAutomaticLossScaling(True)
    # Anchor the final loss scale so it can be compared with the update
    # factor found in ipu_state
    opts.anchorTensor("ls_final", "finalLossScale", poptorch.OutputMode.Final)

    # Only SGD gets the extra use_combined_accum argument
    extra_args = {}
    if optim == poptorch.optim.SGD:
        extra_args["use_combined_accum"] = False

    # The LR value is irrelevant: only the loss scale update is checked
    optimizer = optim(params=net.parameters(),
                      lr=0.0,
                      loss_scaling=initial_ls,
                      **extra_args)
    training_model = poptorch.trainingModel(net, opts, optimizer)

    # Compile up front so ipu_state can be read before the first run
    training_model.compile((sample, ))
    for _ in range(5):
        # The factor read before the step is the one used to compute ls_final
        ipu_state = optimizer.state_dict()['ipu_state']
        update_factor = ipu_state['lossScaleUpdateFactor']
        training_model((sample, ))
        final_scale = training_model.getAnchoredTensor("ls_final")

        # ls_final = ls_update_factor * initial_ls
        helpers.assert_allclose(actual=initial_ls * update_factor,
                                expected=final_scale)


@pytest.mark.ipuHardwareRequired
def test_real_ipu_selection():
    """Run a trivial model with the IPU model explicitly disabled."""

    class Network(nn.Module):
        def forward(self, x, y):
            return x + y

    net = Network()
    # useIpuModel(False) force-disables the IPU model
    hardware_opts = poptorch.Options().useIpuModel(False)
    inference_model = poptorch.inferenceModel(net, hardware_opts)

    lhs = torch.ones(2)
    rhs = torch.zeros(2)
    inference_model(lhs, rhs)


@pytest.mark.ipuHardwareRequired
def test_ipu_id_selection():
    """Run a two-input adder after selecting the device with useIpuId(0)."""

    class Network(nn.Module):
        def forward(self, x, y):
            return x + y

    # Explicitly request the device with ID 0.
    opts = poptorch.Options().useIpuId(0)
    inference_model = poptorch.inferenceModel(Network(), opts)

    lhs, rhs = torch.ones(2), torch.zeros(2)
    inference_model(lhs, rhs)


@unittest.mock.patch.dict("os.environ", helpers.disableAllModels())
def test_offline_ipu():
    """Running a model configured for an offline IPU target must fail."""

    class Network(nn.Module):
        def forward(self, x, y):
            return x + y

    # Target an offline IPU instead of an attached device.
    opts = poptorch.Options().useOfflineIpuTarget()
    inference_model = poptorch.inferenceModel(Network(), opts)

    lhs, rhs = torch.ones(2), torch.zeros(2)

    expected_error = "Trying to run a model on an offline device"
    with pytest.raises(AssertionError, match=expected_error):
        inference_model(lhs, rhs)


@unittest.mock.patch.dict("os.environ", {})
def test_export_proto_file():
    """Check a file is created at the POPTORCH_EXPORT_PROTO_FILE location."""

    class Network(nn.Module):
        def forward(self, x, y):
            return x + y

    with tempfile.TemporaryDirectory() as tmp:
        # "my_dir" does not exist yet when the variable is set.
        proto_path = os.path.join(tmp, "my_dir", "my_model.proto")
        # patch.dict restores os.environ once the test returns.
        os.environ["POPTORCH_EXPORT_PROTO_FILE"] = proto_path

        inference_model = poptorch.inferenceModel(Network())
        lhs, rhs = torch.ones(2), torch.zeros(2)
        inference_model(lhs, rhs)

        assert os.path.isfile(proto_path)


def test_tensor_location():
    """Compile and run an adder with every tensor location option set."""

    class Network(nn.Module):
        def forward(self, x, y):
            return x + y

    model = Network()
    opts = poptorch.Options()

    activation_location = poptorch.TensorLocationSettings()
    activation_location = activation_location.minElementsForOffChip(4)
    activation_location = activation_location.useOnChipStorage(True)
    opts.TensorLocations.setActivationLocation(activation_location)

    weight_location = poptorch.TensorLocationSettings()
    weight_location = weight_location.useIOTilesToStore(True)
    weight_location = weight_location.useReplicatedTensorSharding(False)
    opts.TensorLocations.setWeightLocation(weight_location)

    optimizer_location = poptorch.TensorLocationSettings()
    optimizer_location = optimizer_location.useIOTilesToLoad(False)
    optimizer_location = optimizer_location.useReplicatedTensorSharding(True)
    optimizer_location = (
        optimizer_location.minElementsForReplicatedTensorSharding(4))
    opts.TensorLocations.setOptimizerLocation(optimizer_location)

    accumulator_location = poptorch.TensorLocationSettings()
    accumulator_location = accumulator_location.useOnChipStorage(False)
    opts.TensorLocations.setAccumulatorLocation(accumulator_location)

    inference_model = poptorch.inferenceModel(model, opts)
    inference_model(torch.ones(2), torch.zeros(2))


@helpers.printCapfdOnExit
@pytest.mark.parametrize("dtype", [torch.half, torch.float])
@helpers.overridePoptorchLogLevel("TRACE")
def test_running_statistics(capfd, dtype):
    """Check the trace log reports bn.running_var with the model's dtype."""
    batch = torch.randn((16, 16), dtype=dtype)

    model = torch.nn.Sequential()
    for name, layer in (('lin', torch.nn.Linear(16, 16)),
                        ('bn', torch.nn.BatchNorm1d(16))):
        model.add_module(name, layer)

    use_half = dtype == torch.half
    if use_half:
        model.half()

    poptorch_model = poptorch.inferenceModel(model)
    poptorch_model(batch)

    log = helpers.LogChecker(capfd)
    if dtype == torch.float:
        dtype_str = "Float"
    else:
        dtype_str = "Half"
    device = "ipu:0"

    expected_line = (
        f" : {dtype_str}(16, strides=[1], requires_grad=0, device={device}) "
        "-> bn.running_var")
    log.assert_contains(expected_line)


def test_copying_options():
    """Check that a deep copy of Options is decoupled from the original.

    Options changed on the original after the copy must differ between the
    two objects; options left alone must still compare equal.
    """
    # pylint: disable=protected-access
    opts = poptorch.Options()
    on_chip = poptorch.TensorLocationSettings()
    on_chip.useOnChipStorage(True)
    off_chip = poptorch.TensorLocationSettings()
    off_chip.useOnChipStorage(False)

    # State captured by the copy.
    opts.deviceIterations(5)
    opts.Distributed.configureProcessId(5, 15)
    opts.anchorTensor("t1", "tensor1", OutputMode.EveryN, 2)
    opts._Popart.set("autoRecomputation", 3)
    opts._Popart.set("dummyKey", 5)
    opts.Training.gradientAccumulation(4)
    opts.TensorLocations.setWeightLocation(on_chip)
    deep_copy = copy.deepcopy(opts)

    # Changes applied to the original only.
    opts.deviceIterations(4)
    opts.Distributed.configureProcessId(2, 15)
    opts.anchorTensor("t2", "tensor2", OutputMode.Final)
    opts._Popart.set("autoRecomputation", 2)
    opts.TensorLocations.setWeightLocation(off_chip)

    # Top-level options modified after the copy.
    for changed in ("device_iterations", "anchored_tensors"):
        assert getattr(opts, changed) != getattr(deep_copy, changed)

    # Top-level options which were never touched.
    unchanged_attrs = (
        "replication_factor",
        "log_dir",
        "auto_round_num_ipus",
        "output_mode",
        "output_return_period",
        "connection_type",
        "sync_pattern",
        "available_memory_proportion",
    )
    for unchanged in unchanged_attrs:
        assert getattr(opts, unchanged) == getattr(deep_copy, unchanged)

    original_dist, copied_dist = opts.Distributed, deep_copy.Distributed
    assert (original_dist.distributed_process_id !=
            copied_dist.distributed_process_id)
    assert (original_dist.num_distributed_processes ==
            copied_dist.num_distributed_processes)

    assert deep_copy.TensorLocations.location_weight["onChip"]
    assert not opts.TensorLocations.location_weight["onChip"]

    original_popart = opts._Popart.options
    copied_popart = deep_copy._Popart.options
    assert (original_popart["autoRecomputation"] !=
            copied_popart["autoRecomputation"])
    assert original_popart["dummyKey"] == copied_popart["dummyKey"]

    assert (opts.Training.gradient_accumulation ==
            deep_copy.Training.gradient_accumulation)


def test_preserving_options_intact():
    """Sharing one Options between two wrappers must not modify it.

    The training wrapper is expected to report OutputMode.Final and the
    inference wrapper OutputMode.All, while the shared Options object still
    reports the default output mode.
    """

    class ExampleModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bias = torch.nn.Parameter(torch.zeros(()))

        def forward(self, x):
            negative = 100 * torch.nn.LeakyReLU()(-x + self.bias)
            positive = 100 * torch.nn.LeakyReLU()(x - self.bias)
            return torch.cat([negative, positive], dim=-1)

    class ExampleModelWithLoss(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.model = ExampleModel()

        def forward(self, input, target):
            out = self.model(input)
            probabilities = torch.nn.functional.softmax(out)
            loss = torch.nn.CrossEntropyLoss(reduction="mean")(out, target)
            return (probabilities, loss)

    opts = poptorch.Options()
    model = ExampleModelWithLoss()
    training = poptorch.trainingModel(model, opts)
    inference = poptorch.inferenceModel(model, opts)

    assert opts.defaultOutputMode()
    assert training.options.output_mode == OutputMode.Final
    assert inference.options.output_mode == OutputMode.All


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
@pytest.mark.parametrize("namescopes_enabled", [True, False])
def test_name_scope_hook_disabled(capfd, namescopes_enabled):
    """Check op names in the PopART IR with module namescopes on and off.

    When namescopes are enabled every op name is expected to be prefixed by
    the path of the module which produced it; when they are disabled the
    bare op name is expected. The scope markers themselves must not be
    lowered to PopART.
    """

    class Network(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.layer1 = torch.nn.Sequential(torch.nn.Conv2d(1, 4, 5),
                                              torch.nn.MaxPool2d(2),
                                              torch.nn.ReLU())
            self.layer2 = torch.nn.Sequential(torch.nn.Linear(40, 10),
                                              torch.nn.ReLU())
            self.softmax = torch.nn.LogSoftmax(1)

        def forward(self, x):
            x = self.layer1(x)
            x = x.view(5, 40)
            x = self.layer2(x)
            x = self.softmax(x)
            return x

    model = Network()
    options = poptorch.Options()
    if not namescopes_enabled:
        options.disableModuleNamescope()
    poptorch_model = poptorch.inferenceModel(model, options)

    input = torch.randn(2, 1, 15, 15)
    _ = poptorch_model(input)

    ir = poptorch_model._debugGetPopartIR()  # pylint: disable=protected-access

    # (namescope, op name) pairs following the module hierarchy above: the
    # ReLU of layer1 is the third child of the Sequential, hence index 2.
    # NOTE(review): these prefixes are derived from the module structure; the
    # previous check could never fail, so confirm them against a real IR.
    expected_ops = [
        ('layer1/0/', 'Conv'),
        ('layer1/1/', 'MaxPool'),
        ('layer1/2/', 'Relu'),
        ('layer2/0/', 'MatMul'),
        ('layer2/1/', 'Relu'),
        ('softmax/', 'LogSoftmax'),
    ]

    for namescope, name in expected_ops:
        prefix = namescope if namescopes_enabled else ''
        expected_output = f'"name":"{prefix}{name}'
        # str.find() returns -1 (truthy) when the text is missing and 0
        # (falsy) for a match at the start, so it can't be asserted directly.
        assert expected_output in ir, (
            f"{expected_output} not found in the PopART IR")

    testlog = helpers.LogChecker(capfd)
    it = testlog.createIterator()
    it.findNext("lowered to PopART")
    # Ensure none of the scope names are actually lowered to PopART
    # They should have been handled by the compiler and not be part
    # of the graph anymore.
    it.assert_not_contains("Char")


def test_ipu_context_flag():
    """Check poptorch.isRunningOnIpu() selects the branch taken by forward.

    The model adds its inputs when the flag is set and multiplies them
    otherwise, so the two execution paths give different results.
    """

    class Network(nn.Module):
        def forward(self, x, y):
            return x + y if poptorch.isRunningOnIpu() else x * y

    model = Network()
    inference_model = poptorch.inferenceModel(model, poptorch.Options())

    lhs = torch.tensor([50])
    rhs = torch.tensor([2])

    # Through PopTorch the addition branch is expected: 50 + 2.
    assert inference_model(lhs, rhs) == 52
    # Called directly the multiplication branch is expected: 50 * 2.
    assert model(lhs, rhs) == 100


@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("enabled", [True, False, None])
@helpers.overridePoptorchLogLevel("INFO")
def test_ipu_model(enabled, capfd):
    """Check the log reports the IPU model setting only when it was set.

    ``enabled`` is passed to useIpuModel() unless it is None, in which case
    the option is left untouched and no message is expected.
    """

    class Model(nn.Module):
        def forward(self, x, y):
            return x + y

    opts = poptorch.Options()
    if enabled is not None:
        opts.useIpuModel(enabled)

    poptorch_model = poptorch.inferenceModel(Model(), opts)
    poptorch_model(torch.tensor([50]), torch.tensor([2]))

    log = helpers.LogChecker(capfd)
    if enabled is None:
        log.assert_not_contains("From the user configuration: Ipu model")
        return

    if enabled:
        expected = "From the user configuration: Ipu model: Enabled"
    else:
        expected = "From the user configuration: Ipu model: Disabled"
    log.assert_contains(expected)


@pytest.mark.ipuHardwareRequired
@helpers.overridePoptorchLogLevel("DEBUG")
def test_log_cycle_count(capfd):
    """Check logCycleCount(True) gives a cycle count and logs the total."""

    class Network(nn.Module):
        def forward(self, x, y):
            return x + y

    class LogChecker(helpers.LogChecker):
        def validate(self):
            self.assert_contains("Total number of IPU cycles: ")

    opts = poptorch.Options().logCycleCount(True)
    inference_model = poptorch.inferenceModel(Network(), opts)

    inference_model(torch.tensor([1]), torch.tensor([2]))

    cycles = inference_model.cycleCount()
    assert cycles > 0

    LogChecker(capfd).validate()


def test_profile_report_with_model_name():
    """Check the profile is written to a sub-directory named after the model.

    With the model name set to "tommyflowers", ``profile.pop`` is expected
    under ``<profiling dir>/tommyflowers``.
    """

    def run_model(dirname):
        model = torch.nn.Linear(100, 100)
        opts = poptorch.Options()
        opts.modelName("tommyflowers")
        opts.enableProfiling(dirname)

        poptorch_model = poptorch.inferenceModel(model, opts)
        x = torch.randn(100, 100)
        poptorch_model(x)

    # TemporaryDirectory (rather than mkdtemp) removes the profiling output
    # once the check is done, so the test doesn't leak a directory per run.
    with tempfile.TemporaryDirectory() as dirname:
        # The model is compiled and run in a separate thread, and the report
        # is only looked up once that thread has finished.
        worker = threading.Thread(target=run_model, args=(dirname, ))
        worker.start()
        worker.join()

        assert os.path.exists(
            os.path.join(dirname, "tommyflowers", "profile.pop"))


def test_profile_report():
    """Check the profile of an unnamed inference model is written.

    Without a model name, ``profile.pop`` is expected under
    ``<profiling dir>/inference``.
    """

    def run_model(dirname):
        model = torch.nn.Linear(100, 100)
        opts = poptorch.Options()
        opts.enableProfiling(dirname)

        poptorch_model = poptorch.inferenceModel(model, opts)
        x = torch.randn(100, 100)
        poptorch_model(x)

    # TemporaryDirectory (rather than mkdtemp) removes the profiling output
    # once the check is done, so the test doesn't leak a directory per run.
    with tempfile.TemporaryDirectory() as dirname:
        # The model is compiled and run in a separate thread, and the report
        # is only looked up once that thread has finished.
        worker = threading.Thread(target=run_model, args=(dirname, ))
        worker.start()
        worker.join()

        assert os.path.exists(
            os.path.join(dirname, "inference", "profile.pop"))


# Parameter table for test_mean_reduction_strategy_implicit. Each row holds
# the optimizer settings, whether a training or an inference model is built,
# and the reduction strategy expected after compilation.
mean_reduction_strategy_params = [
    # accum_type, training, combined_accum, correct_strategy

    # Post should be the float32 default
    (torch.float32, True, False, MeanReductionStrategy.Post),
    # Running should be the float16 default
    (torch.float16, True, False, MeanReductionStrategy.Running),
    # Running is not supported for combined_accum, so Post should be used
    (torch.float16, True, True, MeanReductionStrategy.Post),
    # The default accum_type is float32 so strategy should be Post when this is None
    (None, True, False, MeanReductionStrategy.Post),
    # The option isn't used in inference so it should remain as Post by default
    (None, False, False, MeanReductionStrategy.Post),
]


@pytest.mark.parametrize(
    "accum_type, training, combined_accum, correct_strategy",
    mean_reduction_strategy_params)
def test_mean_reduction_strategy_implicit(accum_type, training, combined_accum,
                                          correct_strategy):
    """Check the reduction strategy chosen when the user doesn't set one.

    The strategy is read back from the compiled model's Training options and
    compared with the value expected for the given optimizer settings.
    """
    lhs = torch.tensor([1.])
    rhs = torch.tensor([2.])

    # A plain adder is enough: only the selected strategy is inspected.
    model = helpers.ModelWithWeights(lambda x, y: x + y, lhs.shape)
    options = poptorch.Options()
    # The optimizer is built in every case but only used for training.
    optimizer = poptorch.optim.SGD(model.parameters(),
                                   lr=0.01,
                                   accum_type=accum_type,
                                   use_combined_accum=combined_accum)

    if training:
        poptorch_model = poptorch.trainingModel(model, options, optimizer)
    else:
        poptorch_model = poptorch.inferenceModel(model, options)

    poptorch_model.compile((lhs, rhs))

    training_opts = poptorch_model.options.Training
    selected = training_opts.meanAccumulationAndReplicationReductionStrategy
    assert selected == correct_strategy


def test_mean_reduction_strategy_explicit():
    """Check an explicitly requested reduction strategy survives compilation."""
    lhs = torch.tensor([1.])
    rhs = torch.tensor([2.])

    # A plain adder is enough: only the selected strategy is inspected.
    model = helpers.ModelWithWeights(lambda x, y: x + y, lhs.shape)

    options = poptorch.Options()
    requested = MeanReductionStrategy.Running
    options.Training.setMeanAccumulationAndReplicationReductionStrategy(
        requested)

    poptorch_model = poptorch.trainingModel(model, options)
    poptorch_model.compile((lhs, rhs))

    selected = options.Training.meanAccumulationAndReplicationReductionStrategy
    assert selected == MeanReductionStrategy.Running


def test_num_io_tiles():
    """Check numIOTiles only accepts even values between 32 and 192."""
    options = poptorch.Options()

    error_msg = "numIOTiles must be an even number between 32 and 192."

    # Below the range, above the range, and odd within the range.
    for rejected in (10, 193, 33):
        with pytest.raises(AssertionError, match=error_msg):
            options.TensorLocations.numIOTiles(rejected)

    # Both bounds and a value in between are accepted.
    for accepted in (32, 192, 100):
        options.TensorLocations.numIOTiles(accepted)


# pylint: disable=protected-access
def test_options_change_after_use():
    """Check Options can't be modified once they have been used.

    The options are first handed to a training model and then, for a fresh
    Options object, to a DataLoader. In both cases every later attempt to
    change a setting is expected to raise.
    """
    model = helpers.ModelWithWeights(torch.nn.Linear(10, 10),
                                     torch.Size((5, 10)),
                                     loss_fn=torch.nn.CrossEntropyLoss())

    opts = poptorch.Options()
    poptorch_model = poptorch.trainingModel(model, options=opts)

    with pytest.raises(Exception):
        opts.randomSeed(42)
    with pytest.raises(Exception):
        poptorch_model.options.set(random_seed=42)
    with pytest.raises(Exception):
        opts.Training.gradientAccumulation(0)
    with pytest.raises(Exception):
        opts._Popart.set("groupNormStridedChannelGrouping", True)

    opts = poptorch.Options()
    features = torch.randn([100, 1, 128, 128])
    labels = torch.empty([100], dtype=torch.long).random_(10)
    dataset = torch.utils.data.TensorDataset(features, labels)

    poptorch_data_loader = poptorch.DataLoader(
        opts,
        dataset=dataset,
    )

    with pytest.raises(Exception):
        opts.randomSeed(42)
    with pytest.raises(Exception):
        poptorch_data_loader.options.set(random_seed=42)
    with pytest.raises(Exception):
        poptorch_data_loader.options.Training.gradientAccumulation(0)
    with pytest.raises(Exception):
        poptorch_data_loader.options._Popart.set(
            "groupNormStridedChannelGrouping", True)


def test_copied_options_unfrozen():
    """Check a deep copy of frozen Options can be modified again."""
    opts = poptorch.Options()

    # Handing the options to a DataLoader freezes them.
    features = torch.randn([100, 1, 128, 128])
    labels = torch.empty([100], dtype=torch.long).random_(10)
    dataset = torch.utils.data.TensorDataset(features, labels)
    _ = poptorch.DataLoader(opts, dataset=dataset)

    copied_opts = copy.deepcopy(opts)

    # None of these calls should raise a 'Can't modify frozen Options' error.
    copied_opts.deviceIterations(5)
    copied_opts.Distributed.configureProcessId(5, 15)
    copied_opts._Popart.set("autoRecomputation", 3)
    copied_opts.Training.gradientAccumulation(4)
    weight_location = poptorch.TensorLocationSettings().useIOTilesToStore(True)
    copied_opts.TensorLocations.setWeightLocation(weight_location)
    copied_opts.Precision.setPartialsType(torch.float16)


def test_wrap_options():
    """Popdist wraps poptorch Options using something similar"""

    class _Distributed(poptorch.options._DistributedOptions):
        """Subclass adding nothing, only used to swap the instance's class."""

    opts = poptorch.Options()
    # Re-assigning __class__ on the existing instance must be accepted.
    distributed = opts.Distributed
    distributed.__class__ = _Distributed


def test_options_printing(capsys):
    """Check that the Options class displays meaningful information"""
    opts = poptorch.Options()
    opts.replicationFactor(4)
    print(opts)
    printed = capsys.readouterr().out

    # Default printing is hexadecimal ID of object
    id_string = format(id(opts), "x")
    error_str = "The Options class should be printing meaningful informations"
    assert id_string not in printed, error_str
    assert printed.startswith("Options(")
    assert "replication_factor=4" in printed


================================================
FILE: tests/other_ops_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import json
import re

import torch
import pytest
import helpers
import poptorch

# Seed the global RNG before the operands below are generated.
torch.manual_seed(42)
# Parameters for test_einsum: each entry is (equation, tuple of operands).
# The operands are drawn from the global RNG in the order written, so
# reordering or inserting entries changes the values that are generated.
params_einsum = [
    ('i->', (torch.randn(5), )),
    ('ij->i', (torch.randn(5, 4), )),
    ('i,j->j', (torch.randn(5), torch.randn(4))),
    ('i,j->ji', (torch.randn(5), torch.randn(4))),
    ('bij,bjk->bik', (torch.randn(3, 2, 5), torch.randn(3, 5, 4))),
    ('bn,anm,bm->ba', (torch.randn(2, 5), torch.randn(3, 5,
                                                      4), torch.randn(2, 4))),
    ('bfnd,ndh->bfh', (torch.randn(2, 3, 4, 5), torch.randn(4, 5, 6))),
    ('nmku,buvm->bnkv', (torch.randn(2, 3, 4, 5), torch.randn(6, 5, 7, 3))),
]


def default_assert_fn(native_out, poptorch_out):
    """Assert the PopTorch output is close to the native PyTorch output.

    Args:
        native_out: Reference output; either a single value accepted by
            ``helpers.assert_allclose`` or a tuple of such values.
        poptorch_out: Output to check. When ``native_out`` is a tuple it must
            hold the same number of elements, compared pairwise.

    Raises:
        AssertionError: If the number of outputs differs or a pair of values
            is rejected by ``helpers.assert_allclose``.
    """
    if not isinstance(native_out, tuple):
        helpers.assert_allclose(expected=native_out, actual=poptorch_out)
        return

    # zip() stops at the shortest input, so missing or extra outputs would
    # otherwise go unnoticed.
    assert len(native_out) == len(poptorch_out), (
        f"Expected {len(native_out)} outputs, got {len(poptorch_out)}")
    for native, pop in zip(native_out, poptorch_out):
        helpers.assert_allclose(expected=native, actual=pop)


def op_harness(op, *inputs, assert_fn=None, out_fn=None):
    """Run ``op`` natively and through a PopTorch training model and compare.

    Args:
        op: Operation wrapped in ``helpers.ModelWithWeights``.
        *inputs: Inputs given to the model; the shape of the first one is
            passed to ``helpers.ModelWithWeights``.
        assert_fn: Optional ``fn(native_out, poptorch_out)`` used to compare
            the outputs. ``default_assert_fn`` is used when omitted.
        out_fn: Forwarded to ``helpers.ModelWithWeights``.

    Returns:
        The ``(model, poptorch_model)`` pair, for further checks.
    """
    compare = default_assert_fn if assert_fn is None else assert_fn

    model = helpers.ModelWithWeights(op, inputs[0].shape, out_fn=out_fn)
    poptorch_model = poptorch.trainingModel(model)

    # The CPU run comes first, then the IPU run. Both return a pair whose
    # second element is ignored here.
    native_out, _ = model(inputs)
    poptorch_out, _ = poptorch_model(inputs)

    # Inference check: the outputs must agree.
    compare(native_out, poptorch_out)

    # Training check: the weights must have been updated.
    poptorch_model.assert_weights_changed()

    return model, poptorch_model


def op_harness_inference(model, *inputs):
    """Compare ``model`` run natively with its PopTorch inference wrapper.

    Args:
        model: Module to wrap with ``poptorch.inferenceModel``.
        *inputs: Positional inputs passed to both the module and the wrapper.
    """
    poptorch_model = poptorch.inferenceModel(model)

    expected = model(*inputs)
    actual = poptorch_model(*inputs)

    default_assert_fn(expected, actual)


@pytest.mark.parametrize("params", params_einsum)
@pytest.mark.parametrize("implicit_rhs", [True, False])
def test_einsum(params, implicit_rhs):
    """Check torch.einsum for each equation in ``params_einsum``.

    Args:
        params: ``(equation, operands)`` entry of ``params_einsum``.
        implicit_rhs: When True, everything from ``->`` onwards is dropped
            from the equation so that einsum infers the output subscripts.
    """
    equation, operands = params
    if implicit_rhs:
        # Keep the input subscripts only.
        equation = equation.split('->')[0]

    def op(*xs):
        return torch.einsum(equation, *xs)

    op_harness(op, *operands)


def test_einsum_chained():
    """Check two chained einsum calls, compared with relaxed tolerances."""
    torch.manual_seed(42)

    def op(x, y, z):
        inner = torch.einsum('b u k m, b u v m -> b k v', x, y)
        return torch.einsum('b h k n, b k v -> b h v n', z, inner)

    def assert_fn(native_out, poptorch_out):
        tolerances = dict(rtol=1e-3, atol=1e-3)
        helpers.assert_allclose(expected=native_out,
                                actual=poptorch_out,
                                **tolerances)

    # Three operands of identical shape.
    inputs = []
    for _ in range(3):
        inputs.append(torch.randn(1, 4, 16, 4, dtype=torch.float))

    op_harness(op, *inputs, assert_fn=assert_fn)


def test_einsum_transpose():
    """Check an einsum which only permutes the dimensions of its input."""
    torch.manual_seed(42)

    def op(x):
        return torch.einsum('n c h w -> n h w c', x)

    def assert_fn(native_out, poptorch_out):
        tolerances = dict(rtol=1e-3, atol=1e-3)
        helpers.assert_allclose(expected=native_out,
                                actual=poptorch_out,
                                **tolerances)

    tensor = torch.randn(2, 3, 4, 5, dtype=torch.float)

    op_harness(op, tensor, assert_fn=assert_fn)


@pytest.mark.parametrize("arr_lengths",
                         ([3], [3, 3], [2, 4], [3, 2, 4], [5, 2, 3, 4]))
def test_meshgrid(arr_lengths):
    """Check torch.meshgrid on 1D inputs of the given lengths."""
    torch.manual_seed(42)

    # One random 1D tensor per requested length, generated in order.
    inputs = []
    for length in arr_lengths:
        inputs.append(torch.randn(length))

    def first_output(outputs):
        return outputs[0]

    op_harness(torch.meshgrid, *inputs, out_fn=first_output)


@pytest.mark.parametrize("arr_lengths",
                         ([3], [3, 3], [2, 4], [3, 2, 4], [5, 2, 3, 4]))
def test_cartesian_prod(arr_lengths):
    """Check torch.cartesian_prod on 1D inputs of the given lengths."""
    torch.manual_seed(42)

    # One random 1D tensor per requested length, generated in order.
    inputs = []
    for length in arr_lengths:
        inputs.append(torch.randn(length))

    op_harness(torch.cartesian_prod, *inputs)


@pytest.mark.parametrize("dims", (2, ([2], [0]), ([2, 3], [0, 1])))
def test_tensordot(dims):
    """Check torch.tensordot with an integer and with explicit dim lists."""
    torch.manual_seed(42)

    def op(a, b):
        return torch.tensordot(a, b, dims)

    lhs = torch.randn(2, 3, 5, 4)
    rhs = torch.randn(5, 4, 1)

    op_harness(op, lhs, rhs)


@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize("dim", range(-3, 3))
def test_scatter_add(inplace, dim):
    """Check scatter_add / scatter_add_ into a tensor of ones along ``dim``."""

    class Model(torch.nn.Module):
        def __init__(self, dim, dim_size):
            super().__init__()
            self.dim = dim
            self.dim_size = dim_size
            # Taken from the test parameter rather than a constructor argument.
            self.inplace = inplace

        def forward(self, src, index):
            # Same shape as src, except along the scattered dimension.
            out_shape = list(src.shape)
            out_shape[self.dim] = self.dim_size
            out = torch.ones(out_shape)

            scatter = out.scatter_add_ if self.inplace else out.scatter_add
            return scatter(self.dim, index, src)

    torch.manual_seed(42)
    src = torch.randn(4, 8, 16)
    # The destination is half the size of the source along dim.
    dim_size = src.shape[dim] // 2
    index = torch.randint_like(src, high=dim_size).long()

    op_harness(Model(dim, dim_size), src, index)


@pytest.mark.parametrize("dim", range(-3, 3))
@pytest.mark.parametrize("reduce", ['sum', 'amin', 'amax', 'mean', 'prod'])
@pytest.mark.parametrize("include_self", [True, False])
def test_scatter_reduce(dim, reduce, include_self):
    """Check Tensor.scatter_reduce for each dim, reduction and include_self."""

    class Model(torch.nn.Module):
        def __init__(self, dim, reduce, include_self):
            super().__init__()
            self.dim = dim
            self.reduce = reduce
            self.include_self = include_self

        def forward(self, inp, index, src):
            return inp.scatter_reduce(self.dim,
                                      index,
                                      src,
                                      reduce=self.reduce,
                                      include_self=self.include_self)

    torch.manual_seed(42)
    src = torch.randn(4, 8, 16)

    # The destination is half the size of the source along dim.
    dim_size = src.shape[dim] // 2
    inp_shape = list(src.shape)
    inp_shape[dim] = dim_size
    inp = torch.randn(inp_shape)

    index = torch.randint_like(src, high=dim_size).long()
    op_harness(Model(dim, reduce, include_self), inp, index, src)


@pytest.mark.parametrize("reduce", ['sum', 'amin', 'amax', 'mean', 'prod'])
@pytest.mark.parametrize("include_self", [True, False])
def test_scatter_reduce_fusable(reduce, include_self):
    """Check the ScatterReduce ops found in the IR for three similar scatters.

    Three scatter_reduce calls share the destination and index tensors; the
    PopART IR is then inspected for the number of ScatterReduce ops and the
    ``group_size`` attribute of the first one.
    """
    dim = 0
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self, dim, reduce, include_self):
            super().__init__()
            self.dim = dim
            self.reduce = reduce
            self.include_self = include_self

        def forward(self, inp, index, src):
            output = [
                inp.scatter_reduce(self.dim,
                                   index,
                                   src[i],
                                   reduce=self.reduce,
                                   include_self=self.include_self)
                for i in range(3)
            ]
            return torch.cat(output, dim=1)

    src = [torch.randn(8, 16) for _ in range(3)]
    inp = torch.randn(torch.Size([4, 16]))
    index = torch.randint_like(src[0], high=4).long()
    _, poptorch_model = op_harness(Model(dim, reduce, include_self), inp,
                                   index, src)

    ir = json.loads(poptorch_model._debugGetPopartIR())  # pylint: disable=protected-access
    scatter_reduce_ops = []
    for op in ir['maingraph']:
        if op['type'] == 'ScatterReduce':
            scatter_reduce_ops.append(op)

    if include_self:
        expected_scatter_len = 1
    else:
        expected_scatter_len = 2
    assert len(scatter_reduce_ops) == expected_scatter_len

    if reduce == 'mean':
        expected_group_size = 6
    else:
        expected_group_size = 3
    first_attributes = scatter_reduce_ops[0]['attributes']
    assert int(first_attributes['group_size']) == expected_group_size


@pytest.mark.parametrize("reduce", ['sum', 'amin', 'amax', 'mean', 'prod'])
@pytest.mark.parametrize("include_self", [True, False])
def test_scatter_reduce_should_not_apply_grouped_fuse(reduce, include_self):
    """Check scatters on differently shaped tensors stay separate ops.

    Each scatter works on tensors of a different width, and at least one
    ScatterReduce op per scatter is expected in the PopART IR.
    """
    dim = 0
    torch.manual_seed(42)
    num_scatters = 3

    class Model(torch.nn.Module):
        def __init__(self, dim, reduce, include_self, num_scatters):
            super().__init__()
            self.dim = dim
            self.reduce = reduce
            self.include_self = include_self
            self.num_scatters = num_scatters

        def forward(self, inp, index, src):
            output = [
                inp[i].scatter_reduce(self.dim,
                                      index,
                                      src[i],
                                      reduce=self.reduce,
                                      include_self=self.include_self)
                for i in range(self.num_scatters)
            ]
            return torch.cat(output, dim=1)

    # The i-th source / destination pair is 16 + i columns wide.
    src = [torch.randn(8, 16 + i) for i in range(num_scatters)]
    inp = [torch.randn(torch.Size([8, 16 + i])) for i in range(num_scatters)]
    index = torch.randint(low=0, high=8, size=[8, 1]).long()

    model = Model(dim, reduce, include_self, num_scatters)
    poptorch_model = poptorch.inferenceModel(model)
    poptorch_model.compile(inp, index, src)

    native_out = model(inp, index, src)
    poptorch_out = poptorch_model(inp, index, src)
    default_assert_fn(native_out, poptorch_out)

    ir = json.loads(poptorch_model._debugGetPopartIR())  # pylint: disable=protected-access
    scatter_reduce_ops = []
    for op in ir['maingraph']:
        if op['type'] == 'ScatterReduce':
            scatter_reduce_ops.append(op)

    assert len(scatter_reduce_ops) >= 3


@pytest.mark.parametrize("dim", range(-3, 3))
@pytest.mark.parametrize("reduce", ["mean", "amax", "amin", "prod"])
@pytest.mark.parametrize("include_self", [True, False])
def test_index_reduce(dim, reduce, include_self):
    """Check Tensor.index_reduce_ for each dim, reduction and include_self."""

    class Model(torch.nn.Module):
        def __init__(self, dim, reduce, include_self):
            super().__init__()
            self.dim = dim
            self.reduce = reduce
            self.include_self = include_self

        def forward(self, inp, index, src):
            return inp.index_reduce_(self.dim,
                                     index,
                                     src,
                                     reduce=self.reduce,
                                     include_self=self.include_self)

    torch.manual_seed(17)
    inp = torch.randn(5, 8, 11)

    # The source is half the size of the destination along dim.
    dim_size = inp.shape[dim] // 2
    src_shape = list(inp.shape)
    src_shape[dim] = dim_size
    src = torch.randn(src_shape)

    index = torch.randint(high=dim_size, size=(dim_size, )).long()
    op_harness(Model(dim, reduce, include_self), inp, index, src)


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("TRACE")
@pytest.mark.parametrize("reduce", ['sum', 'amin', 'amax', 'mean'])
@pytest.mark.parametrize("expand_as", [True, False])
@pytest.mark.parametrize("include_self", [True, False])
def test_2d_scatter_reduce_with_index_expansion(capfd, reduce, expand_as,
                                                include_self):
    """Check scatter_reduce fed by an expanded index, and the expand removal.

    The result must match native PyTorch and, after the "Removing index
    expansion node:" message, the log must not mention the expand op.
    """

    class Model(torch.nn.Module):
        def __init__(self, reduce, include_self):
            super().__init__()
            self.reduce = reduce
            self.include_self = include_self

        def forward(self, inp, index, src):
            # expand_as comes from the test parametrization.
            index = (index.expand_as(src)
                     if expand_as else index.expand(src.shape))
            return inp.scatter_reduce(-2,
                                      index,
                                      src,
                                      reduce=self.reduce,
                                      include_self=self.include_self)

    model = Model(reduce, include_self)
    poptorch_model = poptorch.inferenceModel(model)

    torch.manual_seed(0)
    index = torch.randint(0, 5, (6, 1), dtype=torch.long)
    src = torch.rand((6, 3))
    inp = torch.randn((5, 3))

    native_out = model(inp, index, src)
    poptorch_out = poptorch_model(inp, index, src)
    helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    # Make sure the expand op is removed.
    if expand_as:
        look_for = "aten::expand_as"
    else:
        look_for = "aten::expand"
    it = helpers.LogChecker(capfd).createIterator()
    it.findNext("Removing index expansion node:")
    it.assert_not_contains(look_for)


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("TRACE")
@pytest.mark.parametrize("expand_as", [True, False])
def test_2d_scatter_add_with_index_expansion(capfd, expand_as):
    """Check scatter_add_ fed by an expanded index, and the expand removal.

    The result must match native PyTorch and, after the "Removing index
    expansion node:" message, the log must not mention the expand op.
    """

    class Model(torch.nn.Module):
        def forward(self, index, src):
            # expand_as comes from the test parametrization.
            index = (index.expand_as(src)
                     if expand_as else index.expand(src.shape))
            destination = torch.zeros((5, 3))
            return destination.scatter_add_(dim=-2, index=index, src=src)

    model = Model()
    poptorch_model = poptorch.inferenceModel(model)

    torch.manual_seed(0)
    index = torch.randint(0, 5, (6, 1), dtype=torch.long)
    src = torch.rand((6, 3))

    native_out = model(index, src)
    poptorch_out = poptorch_model(index, src)
    helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    # Make sure the expand op is removed.
    if expand_as:
        look_for = "aten::expand_as"
    else:
        look_for = "aten::expand"
    it = helpers.LogChecker(capfd).createIterator()
    it.findNext("Removing index expansion node:")
    it.assert_not_contains(look_for)


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("TRACE")
@pytest.mark.parametrize("expand_as", [True, False])
@pytest.mark.parametrize("params", [
    dict(shape=(3, 5), gather_dim=0, expand_dim=0, should_optimise=False),
    dict(shape=(3, 5), gather_dim=0, expand_dim=1, should_optimise=True),
    dict(shape=(3, 5), gather_dim=1, expand_dim=0, should_optimise=True),
    dict(shape=(3, 5), gather_dim=1, expand_dim=1, should_optimise=False),
    dict(shape=(1, 1, 3, 1, 5, 1),
         gather_dim=3,
         expand_dim=2,
         should_optimise=False),
    dict(shape=(1, 1, 3, 1, 5, 1),
         gather_dim=2,
         expand_dim=4,
         should_optimise=True),
    dict(shape=(1, 1, 3, 1, 5, 1),
         gather_dim=4,
         expand_dim=2,
         should_optimise=True),
    dict(shape=(1, 1, 3, 1, 5, 1),
         gather_dim=4,
         expand_dim=1,
         should_optimise=False),
    dict(shape=(3, 4, 5), gather_dim=0, expand_dim=1, should_optimise=False),
])
def test_gather_with_index_expansion(capfd, expand_as, params):
    """Check gather fed by an expanded index, and when the expand is removed.

    ``params`` gives the data shape, the dimension gathered along, the
    dimension along which the index is expanded, and whether the gather
    optimisation is expected to be reported in the log.
    """
    torch.manual_seed(42)

    gather_dim = params["gather_dim"]
    should_optimise = params["should_optimise"]

    data = torch.randint(10, params["shape"], dtype=torch.int)

    # The index has the shape of the data, with size 1 along expand_dim.
    indices_shape = list(data.shape)
    indices_shape[params["expand_dim"]] = 1
    indices = torch.randint(high=data.shape[gather_dim], size=indices_shape)

    class Model(torch.nn.Module):
        def forward(self, data, indices):
            # expand_as comes from the test parametrization.
            indices = (indices.expand_as(data)
                       if expand_as else indices.expand(data.shape))

            # Also do an `add`, to check we can pipe the results onward.
            gathered = torch.gather(data, params["gather_dim"], indices)
            return gathered.add(8)

    model = Model()
    poptorch_model = poptorch.inferenceModel(model)

    # The result must be correct whether or not the optimisation applies.
    cpu_out = model(data, indices)
    ipu_out = poptorch_model(data, indices)
    helpers.assert_allclose(actual=ipu_out, expected=cpu_out)

    it = helpers.LogChecker(capfd).createIterator()

    # Note: aten::expand_as might be intercepted as aten::expand by the
    # dispatcher so only check for "expand"
    remove_if_optimised = "aten::expand"

    if should_optimise:
        # The optimisation must be reported, and the expand op must not show
        # up in the log afterwards.
        it.findNext("Optimising gather:")
        it.assert_not_contains(remove_if_optimised)
    else:
        # Without the optimisation the expand op must still be present.
        it.findNext(remove_if_optimised)


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("TRACE")
def test_available_memory_scatter_add(capfd):
    """set_available_memory on a scatter_add must target the scatterreduce.

    The lowered graph in the trace log is searched for the
    ``popart::scatterreduce`` line and the following
    ``poptorch::set_available_memory`` line, which must take the
    scatterreduce result as its operand.
    """

    class Model(torch.nn.Module):
        def __init__(self, dim, dim_size):
            super().__init__()
            self.dim = dim
            self.dim_size = dim_size

        def forward(self, src, index):
            out_shape = list(src.shape)
            out_shape[self.dim] = self.dim_size
            scattered = torch.ones(out_shape).scatter_add(
                self.dim, index, src)
            return poptorch.set_available_memory(scattered, 0.9)

    scatter_dim = 2
    torch.manual_seed(42)
    x = torch.randn(4, 8, 16)
    dim_size = x.shape[scatter_dim] // 2
    index = torch.randint_like(x, high=dim_size).long()

    ipu_model = poptorch.inferenceModel(Model(scatter_dim, dim_size))
    ipu_model(x, index)

    log_iter = helpers.LogChecker(capfd).createIterator()
    log_iter.findNext("Graph before lowering to PopART:")

    # The set_available_memory node has to reference the scatterreduce,
    # not the add. The first space-separated token of the scatterreduce line
    # is taken as the name of its result.
    scatter_line = log_iter.findNext("popart::scatterreduce").strip()
    scatter_var = scatter_line.split(" ", 1)[0]
    memory_line = log_iter.findNext(
        "poptorch::set_available_memory").strip()

    # Expect set_available_memory[...](%XX) with XX the scatterreduce result.
    pattern = r"set_available_memory\[.*\]\(\{}\)".format(scatter_var)
    assert re.search(pattern, memory_line)


# Each entry is an (input, boundaries) pair, unpacked in that order by
# bucketize_op_test_body. The inputs cover a scalar as well as 1-D, 2-D and
# 3-D nested lists. Note that the boundaries of the last entry are not sorted
# in increasing order.
basic_test_data = [
    ([[3, 6, 9], [3, 6, 10], [-1, 0, 1], [8, 9, 140]], [1, 3, 5, 7, 9]),
    ([[2, 5, 10], [6, 8, 3]], [1, 5, 7, 8, 10]),
    (1, [1, 5, 7, 8, 10]),
    ([1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 2, 3, 4, 5, 6]),
    ([[[1, 3, 5], [2, 4, 6]], [[1, 2, 3], [4, 5, 6]]], [1, 2, 3, 4, 5, 6]),
    ([1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 2, 3, 6, 4, 5]),
]


def bucketize_op_test_body(right, data, dtypes):
    """Run torch.bucketize through op_harness_inference.

    Args:
        right: forwarded as the ``right`` argument of ``torch.bucketize``.
        data: ``(input, boundaries)`` pair of plain Python values.
        dtypes: ``(input dtype, boundaries dtype)`` pair used to build the
            two tensors.
    """

    class Model(torch.nn.Module):
        def __init__(self, right):
            super().__init__()
            self.right = right

        def forward(self, input, boundaries):
            return torch.bucketize(input, boundaries, right=self.right)

    values, edges = data
    values_dtype, edges_dtype = dtypes

    op_harness_inference(
        Model(right),
        torch.tensor(values, dtype=values_dtype),
        torch.tensor(edges, dtype=edges_dtype),
    )


@pytest.mark.parametrize("right", [True, False])
# Every float32/int32 pairing of (input dtype, boundaries dtype). The list
# previously contained (float32, float32) twice, which left the int32/int32
# pairing untested.
@pytest.mark.parametrize("dtypes", [(torch.float32, torch.float32),
                                    (torch.int32, torch.int32),
                                    (torch.int32, torch.float32),
                                    (torch.float32, torch.int32)])
@pytest.mark.parametrize("data", basic_test_data)
def test_bucketize_basic(right, data, dtypes):
    """Run bucketize on the basic data set for each dtype pairing."""
    bucketize_op_test_body(right, data, dtypes)


# (input, boundaries) pairs whose boundaries include non-integer values
# (0.9, 4.1) and repeated entries.
fp_test_data = [
    ([1, 2, 3, 4, 5, 6, 7, 8, 9], [0.9, 1, 2, 2, 3, 3, 4, 4.1, 9, 9]),
    (
        [[[1, 3, 5], [2, 4, 6]], [[1, 2, 3], [4, 5, 6]]],
        [0.9, 1, 2, 2, 3, 3, 4, 4.1, 9, 9],
    ),
]


@pytest.mark.parametrize("right", [True, False])
@pytest.mark.parametrize("data", fp_test_data)
def test_bucketize_fp(right, data):
    """Run bucketize on the floating-point data set.

    The boundaries in ``fp_test_data`` contain fractional values, so both
    tensors are created as float32; an integer boundaries dtype would
    truncate them. (This test used to run ``basic_test_data`` with
    (float32, int32), which only repeated a case of test_bucketize_basic.)
    """
    bucketize_op_test_body(right, data, (torch.float32, torch.float32))


@pytest.mark.parametrize("out_int32", [True, False])
def test_bucketize_inplace(out_int32):
    """bucketize with an explicit ``out`` tensor, on CPU and through PopTorch.

    Checks that the returned tensor and the ``out`` tensor agree with each
    other and between the native and PopTorch runs.
    """
    values = torch.tensor([[2, 5, 10], [6, 8, 3]], dtype=torch.int32)
    edges = torch.tensor([1, 5, 7, 8, 10], dtype=torch.int32)

    # The dtype of `out` follows the out_int32 flag given to bucketize.
    if out_int32:
        out_dtype = torch.int32
    else:
        out_dtype = torch.int64
    ipu_out = torch.zeros(2, 3, dtype=out_dtype)
    cpu_out = ipu_out.clone()

    class Model(torch.nn.Module):
        def forward(self, input, boundaries, out):
            return torch.bucketize(input,
                                   boundaries,
                                   out_int32=out_int32,
                                   out=out)

    cpu_model = Model()
    cpu_returned = cpu_model(values, edges, cpu_out)
    ipu_model = poptorch.inferenceModel(cpu_model)
    ipu_returned = ipu_model(values, edges, ipu_out)

    default_assert_fn(ipu_returned, ipu_out)
    default_assert_fn(cpu_out, ipu_out)
    default_assert_fn(cpu_returned, ipu_returned)


================================================
FILE: tests/outputs_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import torch
import torch.nn as nn
import helpers
import poptorch


def test_multiple_tensors():
    """Several tensors returned as a bare sequence match the CPU result."""

    class Network(nn.Module):
        def forward(self, x, y):
            total = x + y
            pair = (total, x * y)
            return pair[0], y - x, pair[1] + total

    cpu_model = Network()
    ipu_model = poptorch.inferenceModel(cpu_model)

    ones = torch.ones(2)
    zeros = torch.zeros(2)

    # Run on the IPU first, then compute the CPU reference.
    actual = ipu_model(ones, zeros)
    expected = cpu_model(ones, zeros)
    helpers.assert_allclose(actual=actual, expected=expected)


def test_simple_list():
    """A flat list of output tensors matches the CPU result."""

    class Network(nn.Module):
        def forward(self, x, y):
            added = x + y
            multiplied = x * y
            grouped = (added, multiplied)
            return [grouped[0], y - x, grouped[1] + added]

    reference_model = Network()
    compiled_model = poptorch.inferenceModel(reference_model)

    args = (torch.ones(2), torch.zeros(2))

    # IPU run first, CPU reference second.
    ipu_result = compiled_model(*args)
    cpu_result = reference_model(*args)
    helpers.assert_allclose(actual=ipu_result, expected=cpu_result)


def test_simple_tuple():
    """A flat, explicitly parenthesised tuple of outputs matches the CPU."""

    class Network(nn.Module):
        def forward(self, x, y):
            s = x + y
            parts = (s, x * y)
            first, second = parts[0], parts[1]
            return (first, y - x, second + s)

    net = Network()
    ipu_net = poptorch.inferenceModel(net)

    lhs = torch.ones(2)
    rhs = torch.zeros(2)

    out_ipu = ipu_net(lhs, rhs)
    out_cpu = net(lhs, rhs)
    helpers.assert_allclose(actual=out_ipu, expected=out_cpu)


def test_nested_tuples():
    """Outputs nested several tuples deep match the CPU result."""

    class Network(nn.Module):
        def forward(self, x, y):
            summed = x + y
            inner = (summed, x * y)

            middle = (inner, y - x, inner[1] + summed)
            last = (y, summed * 2.0)
            return x, middle, last

    cpu_model = Network()
    ipu_model = poptorch.inferenceModel(cpu_model)

    x = torch.ones(2)
    y = torch.zeros(2)

    ipu_outputs = ipu_model(x, y)
    cpu_outputs = cpu_model(x, y)

    helpers.assert_allclose(actual=ipu_outputs, expected=cpu_outputs)


def test_same_tensor():
    """Returning the same tensor at several output positions works."""

    class Network(nn.Module):
        def forward(self, x, y):
            shared = x + y
            nested = (shared, x * y)

            # `shared` appears three times at top level / in the inner tuple,
            # and once more inside `nested`.
            return shared, (shared, nested, shared)

    base_model = Network()
    ipu_model = poptorch.inferenceModel(base_model)

    inputs = (torch.ones(2), torch.zeros(2))

    from_ipu = ipu_model(*inputs)
    from_cpu = base_model(*inputs)

    helpers.assert_allclose(actual=from_ipu, expected=from_cpu)


def test_dict():
    """A dict of output tensors keeps its keys and per-key values."""

    class Network(nn.Module):
        def forward(self, x, y):
            added = x + y
            multiplied = x * y

            # Note: keys are not in alphabetical order
            return {'b': added, 'a': multiplied}

    cpu_model = Network()
    ipu_model = poptorch.inferenceModel(cpu_model)

    x, y = torch.ones(2), torch.zeros(2)

    expected = cpu_model(x, y)
    actual = ipu_model(x, y)

    # Same set of keys, and matching values under each key.
    assert expected.keys() == actual.keys()
    for key, expected_value in expected.items():
        assert torch.allclose(expected_value, actual[key])


================================================
FILE: tests/overlapped_io_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import torch
import pytest

import poptorch

# Side length of the square parameter matrices created by get_model, and the
# size of the trailing dimension of the inputs and labels built by the tests.
INPUT_SIZE = 64


def get_model(num_mat_muls,
              input_a_overlap=poptorch.OverlapMode.NoOverlap,
              input_b_overlap=poptorch.OverlapMode.NoOverlap,
              loss_overlap=poptorch.OverlapMode.NoOverlap,
              sum_all_overlap=poptorch.OverlapMode.NoOverlap):
    """Build a model made of ``2 * num_mat_muls`` matmuls and a loss.

    Parameters ``a<i>`` are multiplied by the first input and ``b<i>`` by
    the second one; all products are summed and a cross-entropy loss is
    computed on the sum. The four overlap arguments are applied to the two
    inputs and to the two outputs (loss, sum) respectively.

    Returns:
        A new instance of the model.
    """
    weight_shape = [1, INPUT_SIZE, INPUT_SIZE]

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()

            # Registered in the order a0, b0, a1, b1, ...
            for idx in range(num_mat_muls):
                for prefix in ("a", "b"):
                    weight = torch.randn(weight_shape, dtype=torch.float32)
                    self.register_parameter(prefix + str(idx),
                                            torch.nn.Parameter(weight))

            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, input_a, input_b, labels):
            with poptorch.Block(ipu_id=0):
                # Apply the requested overlap mode to each input.
                input_a = poptorch.set_overlap_for_input(
                    input_a, input_a_overlap)
                input_b = poptorch.set_overlap_for_input(
                    input_b, input_b_overlap)

                # Drop the leading dimension of size 1.
                input_a = input_a.squeeze()
                input_b = input_b.squeeze()

                products = []
                for idx in range(num_mat_muls):
                    for prefix, operand in (("a", input_a), ("b", input_b)):
                        weight = self.get_parameter(prefix + str(idx))
                        products.append(torch.matmul(weight, operand))

                sum_all = torch.sum(torch.stack(products, dim=0), dim=0)
                loss = self.loss(sum_all.unsqueeze(dim=0), labels)

                # Apply the requested overlap mode to each output.
                loss = poptorch.set_overlap_for_output(loss, loss_overlap)
                sum_all = poptorch.set_overlap_for_output(
                    sum_all, sum_all_overlap)

                return loss, sum_all

    return Model()


@pytest.mark.ipuHardwareRequired
def test_io_input():
    """Train with both inputs overlapped over the accumulation loop."""
    grad_accumulations = 10
    device_iterations = 20

    overlap = poptorch.OverlapMode.OverlapAccumulationLoop
    model = get_model(20, input_a_overlap=overlap, input_b_overlap=overlap)

    options = poptorch.Options()
    options.outputMode(poptorch.OutputMode.All)
    options.deviceIterations(device_iterations)
    options.setExecutionStrategy(poptorch.ShardedExecution())
    options.TensorLocations.numIOTiles(32)
    options.Training.gradientAccumulation(grad_accumulations)

    training_model = poptorch.trainingModel(model, options=options)

    # One sample per gradient accumulation step of every device iteration.
    batch = grad_accumulations * device_iterations
    sample_shape = (batch, INPUT_SIZE)

    input_a = torch.randn(sample_shape)
    input_b = torch.randn(sample_shape)
    labels = torch.randint(0, 1, sample_shape)

    training_model(input_a, input_b, labels)


@pytest.mark.ipuHardwareRequired
def test_input_error_messages():
    """Check the errors raised for invalid set_overlap_for_input usage.

    Covers, in order: an input with another consumer, a non-input tensor,
    the default execution strategy, a sharded strategy without IO tiles,
    and finally a valid configuration which must run.
    """
    overlap = poptorch.OverlapMode.OverlapAccumulationLoop

    def expect_error(ipu_model, pattern):
        # Running the model must raise a PopTorch error matching `pattern`.
        with pytest.raises(poptorch.poptorch_core.Error, match=pattern):
            ipu_model(torch.tensor([1.0]))

    class DoubleInputUseModel(torch.nn.Module):
        def forward(self, x):
            y = x + 1
            x2 = poptorch.set_overlap_for_input(x, overlap)
            return y, x2

    expect_error(
        poptorch.inferenceModel(DoubleInputUseModel()),
        (r"poptorch.set_overlap_for_input must be the only op "
         r"applied to an input. This is not the case for input "
         r"[0-9]+ to the model."))

    class NotOnInputModel(torch.nn.Module):
        def forward(self, x):
            y = x + 1
            y2 = poptorch.set_overlap_for_input(y, overlap)
            return y, y2

    expect_error(
        poptorch.inferenceModel(NotOnInputModel()),
        (r"poptorch.set_overlap_for_input applied on a node "
         r"which is not a tensor input to the model."))

    class NormalModel(torch.nn.Module):
        def forward(self, x):
            x2 = poptorch.set_overlap_for_input(x, overlap)
            y = x2 + 1
            return y

    model = NormalModel()

    # Default options: the execution strategy is rejected.
    expect_error(
        poptorch.inferenceModel(model),
        (r"Overlapped IO is not supported with "
         r"poptorch.PipelinedExecution. If you are using only one IPU, "
         r"please switch to poptorch.ShardedExecution."))

    # Sharded execution but no IO tiles: still rejected.
    opts = poptorch.Options()
    opts.setExecutionStrategy(poptorch.ShardedExecution())
    expect_error(
        poptorch.inferenceModel(model, options=opts),
        (r"No IO tiles allocated. You must allocate at least 32 IO "
         r"tiles using poptorch.Options\(\).TensorLocations.numIOTiles."))

    # Sharded execution with IO tiles: the model must run.
    opts = opts.clone()
    opts.TensorLocations.numIOTiles(32)
    working_model = poptorch.inferenceModel(model, options=opts)
    working_model(torch.tensor([1.0]))


@pytest.mark.ipuHardwareRequired
def test_overlap_host_io_output():
    """Train with both outputs overlapped over the accumulation loop."""
    grad_accumulations = 10
    device_iterations = 20

    output_overlap = poptorch.OverlapMode.OverlapAccumulationLoop
    model = get_model(20,
                      input_a_overlap=poptorch.OverlapMode.NoOverlap,
                      input_b_overlap=poptorch.OverlapMode.NoOverlap,
                      loss_overlap=output_overlap,
                      sum_all_overlap=output_overlap)

    options = poptorch.Options()
    options.outputMode(poptorch.OutputMode.All)
    options.deviceIterations(device_iterations)
    options.setExecutionStrategy(poptorch.ShardedExecution())
    options.TensorLocations.numIOTiles(32)
    options.Training.gradientAccumulation(grad_accumulations)

    training_model = poptorch.trainingModel(model, options=options)

    # One sample per gradient accumulation step of every device iteration.
    batch = grad_accumulations * device_iterations
    sample_shape = (batch, INPUT_SIZE)

    input_a = torch.randn(sample_shape)
    input_b = torch.randn(sample_shape)
    labels = torch.randint(0, 1, sample_shape)

    training_model(input_a, input_b, labels)


@pytest.mark.ipuHardwareRequired
def test_output_error_messages():
    """Check valid and invalid uses of set_overlap_for_output.

    Covers, in order: a tensor returned twice (error), a marked output
    whose source is reused before / after the marker (both must run), and
    a marker applied to a tensor which is not an output (error).
    """
    overlap = poptorch.OverlapMode.OverlapAccumulationLoop
    sample = torch.tensor([1.0])

    class DoubleOutputUseModel(torch.nn.Module):
        def forward(self, x):
            y = x + 1
            y2 = poptorch.set_overlap_for_output(y, overlap)
            return y, y2

    ipu_model = poptorch.inferenceModel(DoubleOutputUseModel())
    returned_twice = (
        r"poptorch.set_overlap_for_output cannot be used with a tensor "
        r"that is returned twice. Please check all returned tensors "
        r"including those nested in tuples/lists.")
    with pytest.raises(poptorch.poptorch_core.Error, match=returned_twice):
        ipu_model(sample)

    # The remaining models all use sharded execution with IO tiles.
    opts = poptorch.Options()
    opts.setExecutionStrategy(poptorch.ShardedExecution())
    opts.TensorLocations.numIOTiles(32)

    class MarkedOutputReuseBeforeModel(torch.nn.Module):
        def forward(self, x):
            y = x + 1
            z = y + 1

            y2 = poptorch.set_overlap_for_output(y, overlap)
            return y2, z

    ipu_model = poptorch.inferenceModel(MarkedOutputReuseBeforeModel(),
                                        options=opts)
    ipu_model(sample)

    class MarkedOutputReuseAfterModel(torch.nn.Module):
        def forward(self, x):
            y = x + 1
            y2 = poptorch.set_overlap_for_output(y, overlap)
            z = y2 + 1
            return y2, z

    ipu_model = poptorch.inferenceModel(MarkedOutputReuseAfterModel(),
                                        options=opts)
    ipu_model(sample)

    class NonOutputMarked(torch.nn.Module):
        def forward(self, x):
            x = poptorch.set_overlap_for_output(x, overlap)

            y = x + 1
            return y

    ipu_model = poptorch.inferenceModel(NonOutputMarked(), options=opts)
    not_an_output = (
        r"poptorch.set_overlap_for_output applied on a node "
        r"which is not a tensor output to the model.")
    with pytest.raises(poptorch.poptorch_core.Error, match=not_an_output):
        ipu_model(sample)


def test_overlap_both_non_input_marked():
    """Marking a computed tensor as an overlapped input is an error.

    The real input is marked correctly here; only the second marker, applied
    to the result of an add, is invalid.
    """
    overlap = poptorch.OverlapMode.OverlapAccumulationLoop

    class NotOnInputModel(torch.nn.Module):
        def forward(self, x):
            x = poptorch.set_overlap_for_input(x, overlap)
            y = x + 1
            y2 = poptorch.set_overlap_for_input(y, overlap)
            return y, y2

    options = poptorch.Options()
    options.setExecutionStrategy(poptorch.ShardedExecution())
    options.TensorLocations.numIOTiles(32)

    ipu_model = poptorch.inferenceModel(NotOnInputModel(), options)

    expected_error = (r"poptorch.set_overlap_for_input applied on a node "
                      r"which is not a tensor input to the model.")
    with pytest.raises(poptorch.poptorch_core.Error, match=expected_error):
        ipu_model(torch.tensor([1.0]))


def test_overlap_both_non_output_marked():
    """Marking an intermediate tensor as an overlapped output is an error.

    The input is marked correctly; the output marker is applied to a tensor
    which is then reduced with ``torch.mean`` instead of being returned.
    """
    overlap = poptorch.OverlapMode.OverlapAccumulationLoop

    class OutputBeforeLoss(torch.nn.Module):
        def forward(self, x):
            marked_in = poptorch.set_overlap_for_input(x, overlap)
            shifted = marked_in + torch.ones_like(marked_in)
            marked_out = poptorch.set_overlap_for_output(shifted, overlap)
            return torch.mean(marked_out)

    options = poptorch.Options()
    options.setExecutionStrategy(poptorch.ShardedExecution())
    options.TensorLocations.numIOTiles(32)

    ipu_model = poptorch.inferenceModel(OutputBeforeLoss(), options)

    expected_error = (r"poptorch.set_overlap_for_output applied on a node "
                      r"which is not a tensor output to the model.")
    with pytest.raises(poptorch.Error, match=expected_error):
        ipu_model(torch.tensor([1.0]))


@pytest.mark.ipuHardwareRequired
def test_overlap_tuple():
    """Overlap markers can be applied to tuples of inputs and outputs."""

    class Model(torch.nn.Module):
        def forward(self, xs):
            marked_inputs = poptorch.set_overlap_for_input(
                xs, poptorch.OverlapMode.OverlapDeviceIterationLoop)
            joined = torch.cat(marked_inputs) + 1
            halves = joined.chunk(2)
            return poptorch.set_overlap_for_output(
                halves, poptorch.OverlapMode.OverlapAccumulationLoop)

    options = poptorch.Options()
    options.setExecutionStrategy(poptorch.ShardedExecution())
    options.TensorLocations.numIOTiles(32)
    ipu_model = poptorch.inferenceModel(Model(), options)

    # A tuple of two (2, 2) tensors.
    chunks = torch.arange(8).reshape(4, 2).chunk(2)
    ipu_model(chunks)


================================================
FILE: tests/phased_execution_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import torch
import torch.nn.functional as F
import pytest
import helpers
import poptorch

# Model: 2x2 S1 ExecutionPhase, repeated N times:
# _____________________________________________________________________________
# phase 0:            IPU 0            |                       IPU 2
# in0 ---- Slice/Slice -----------------------------.
#            |                         |            |
# w0 ----- MatMul                      |          MatMul ----- w1
#            |                         |            |
#          ReLU                        |           ReLU
#            |                         |            |
#            +------------------------.|.-----------+
#______________________________________X__(inter-phase cross-IPU copy)_________
# phase 1:            IPU 1           /|\                      IPU 3
#            .-----------------------' | '----------.
#            |                         |            |
# w2 ----- MatMul                      |          MatMul ----- w3
#            |                         |            |
#          ReLU                        |           ReLU
#            |                         |            |
#            +------------------------.|.-----------+
#                                      X  (intra-phase cross-IPU copy)
#                                     /|\
#            .-----------------------' | '----------.
#            |                         |            |
# w4 ----- MatMul                      |          MatMul ----- w5
#            |                         |            |
#          ReLU                        |           ReLU
#            |                         |            |
#            +------------------------.|.-----------+
#______________________________________X_______________________________________
# phase 2:            IPU 0           /|\                      IPU 2
# ......                               |
# ......                               |
#______________________________________X__(inter-phase cross-IPU copy)_________
# phase N*2-1:        IPU 1           /|\                      IPU 3
#            .-----------------------' | '----------.
#            |                         |            |
# w2 ----- MatMul                      |          MatMul ----- w3
#            |                         |            |
#          ReLU                        |           ReLU
#            |                         |            |
#            +------------------------.|.-----------+
#                                      X  (intra-phase cross-IPU copy)
#                                     /|\
#            .-----------------------' | '----------.
#            |                         |            |
# w4 ----- MatMul                      |          MatMul ----- w5
#            |                         |            |
#          ReLU                        |           ReLU
#            |                         |            |
#            +------------------------------------ Sum ----- L1Loss
#______________________________________|_______________________________________


class LogChecker(helpers.LogChecker):
    """Log assertions shared by the phased execution tests of this file."""

    @staticmethod
    def _phased_op_line(op, index, tensor_info, ipu, phase):
        """Return the log text expected for one op placed in a phase.

        ``index`` 0 gives the bare ``<op>:0`` name, any other value gives
        ``<op>:0/<index>``.
        """
        suffix = f"/{index}" if index else ""
        return (f"{op}:0{suffix} [{tensor_info}, mode(Phased), "
                f"ipu({ipu}), phase({phase})]")

    def _validate_2x2_parallel_phases(self, repeats):
        """Check the log of the 2x2 parallel phased model.

        Args:
            repeats: how many times the two-phase pattern drawn at the top
                of this file is repeated (``N`` in the tests). Each
                repetition contributes six MatMul/Relu pairs and two phases.
        """
        vector = "float32(10, 1)"
        scalar = "float32()"
        num_phases = 2 * repeats

        self.assert_contains("enablePipelining set to value 0")
        self.assert_contains("executionPhaseSettings.stages set to value 2")
        self.assert_contains(
            f"executionPhaseSettings.phases set to value {num_phases}")
        for kind in ("activation", "weight", "optimizer", "accumulator"):
            self.assert_contains(
                f"location_{kind} set to value useOnChipStorage(False)")

        # The input is split in two on IPU 0 in phase 0.
        for index in range(2):
            self.assert_contains(
                self._phased_op_line("Slice", index, vector, 0, 0))

        # Within one repetition the first MatMul/Relu pair of each column is
        # in the even phase and the next two pairs are in the odd phase.
        # The IPU is derived the same way as in the models under test:
        # (phase % 2) + column * 2.
        for index in range(6 * repeats):
            repetition, position = divmod(index, 6)
            phase = 2 * repetition + (0 if position < 2 else 1)
            ipu = (phase % 2) + (position % 2) * 2
            for op in ("MatMul", "Relu"):
                self.assert_contains(
                    self._phased_op_line(op, index, vector, ipu, phase))

        # The final sum and the loss live on IPU 3 in the last phase.
        last_phase = num_phases - 1
        for op, tensor_info in (("Add", vector), ("Sub", vector),
                                ("L1", scalar), ("IdentityLoss", scalar)):
            self.assert_contains(
                self._phased_op_line(op, 0, tensor_info, 3, last_phase))

    def validate_2x2_parallel_phased_execution(self):
        """Check the log of the 2x2 model with three repetitions."""
        self._validate_2x2_parallel_phases(repeats=3)

    def validate_2x2_parallel_phased_execution_small(self):
        """Check the log of the 2x2 model with a single repetition."""
        self._validate_2x2_parallel_phases(repeats=1)

    def validate_serial_tensor_liveness(self, liveness):
        """Check the log of a serial phased model for a liveness setting.

        Args:
            liveness: one of the four ``poptorch.Liveness`` values handled
                below.

        Raises:
            ValueError: if ``liveness`` is not one of the handled values.
        """
        # 'phases' does not include the bwd pass, so to calculate,
        # sum the number of phases in the fwd pass, plus any phase
        # gap between the end of the fwd and start of the bwd pass
        if liveness == poptorch.Liveness.AlwaysLive:
            # fwd:       bwd:
            # phase 0 -> phase 4
            # phase 1 -> phase 3
            # phase 2 -> phase 2
            phases = 3
            stride = 1
        elif liveness == poptorch.Liveness.OffChipAfterFwd:
            # fwd:       bwd:
            # phase 0 -> phase 8
            # phase 1 -> phase 7
            # phase 2 -> phase 6
            phases = 6
            stride = 1
        elif liveness == poptorch.Liveness.OffChipAfterFwdNoOverlap:
            # fwd:       bwd:
            # phase 0 -> phase 12
            # phase 2 -> phase 10
            # phase 4 -> phase 8
            phases = 8
            stride = 2
        elif liveness == poptorch.Liveness.OffChipAfterEachPhase:
            # fwd:       bwd:
            # phase 0 -> phase 20
            # phase 4 -> phase 16
            # phase 8 -> phase 12
            phases = 12
            stride = 4
        else:
            # Fail with a clear message rather than an unbound local below.
            raise ValueError(f"Unsupported liveness value: {liveness}")

        self.assert_contains('set serial_phases_execution to value true')
        self.assert_contains('executionPhaseSettings.stages set to value 1')

        self.assert_contains(
            'executionPhaseSettings.phases set to value {}'.format(phases))

        for phase in range(3):
            op_label = ':0'
            self.assert_contains(
                'Transpose{} [float32({}, {}), mode(Phased), ipu(0), phase({})]'
                .format(op_label, 8 - phase, 7 - phase, phase * stride))
            # The matmul may be reported as MatMul or Gemm, with either a
            # float32 shape or a failed shape inference.
            self.assert_matches(
                (r'(MatMul|Gemm){} \[(float32\({}{}\)|undefined\(shape '
                 r'inference failed\)), mode\(Phased\), ipu\(0\), phase\({}\)]'
                 ).format(op_label, "1, ", 7 - phase, phase * stride))


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_2x2_parallel_phased_execution_inline(capfd):
    """Compile a 2x2 parallel phased model with IPU IDs set inline.

    Every ``poptorch.Block`` in the model carries its own ``ipu_id``; the
    execution strategy is only given the block IDs "0" .. "2N-1". The
    captured log is then checked by
    ``LogChecker.validate_2x2_parallel_phased_execution``.
    """
    N = 3
    size = 10

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.weights = torch.nn.ParameterList([
                torch.nn.Parameter(torch.rand(size, size), requires_grad=True)
                for _ in range(N * 6)
            ])

        def forward(self, in0, target=None):
            phase = 0
            with poptorch.Block("0", ipu_id=0):
                ins = torch.split(in0, size)
            weight = iter(self.weights)
            for n in range(N * 3):
                out = []
                for ipu in range(2):
                    x = ins[ipu]
                    # Alternate between 0-2 and 1-3
                    ipu_id = (phase % 2) + ipu * 2
                    with poptorch.Block(f"{phase}", ipu_id=ipu_id):
                        x = torch.matmul(next(weight), x)
                        out.append(F.relu(x))
                # Each branch consumes the other branch's output next.
                ins = out[1], out[0]
                # We want 2 matmuls in the same phase
                if n % 3 != 1:
                    phase += 1
            with poptorch.Block(f"{N*2-1}", ipu_id=3):
                res = ins[0] + ins[1]
                if target is None:
                    return res
                return res, torch.nn.L1Loss(reduction="mean")(res, target)

    inputs = torch.rand(size * 2, 1)
    target = torch.rand(size, 1)

    model = Model()

    # Block IDs used by the model: "0" .. "2N-1".
    phases = [f"{n}" for n in range(2 * N)]
    opts = poptorch.Options()
    opts.setExecutionStrategy(poptorch.ParallelPhasedExecution(*phases))
    poptorch_model = poptorch.trainingModel(model, opts)
    poptorch_model.compile(inputs, target)

    testlog = LogChecker(capfd)
    testlog.validate_2x2_parallel_phased_execution()


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_2x2_parallel_phased_execution_opts(capfd):
    """Compile a 2x2 parallel phased model with IPUs set via the strategy.

    The model only names its blocks ("phase<P>_ipu<I>"); the mapping of each
    block to an IPU is done through ``poptorch.Stage(...).ipu(...)`` when the
    execution strategy is built.
    """
    N = 3
    size = 10

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            params = []
            for _ in range(N * 6):
                params.append(
                    torch.nn.Parameter(torch.rand(size, size),
                                       requires_grad=True))
            self.weights = torch.nn.ParameterList(params)

        def forward(self, in0, target=None):
            weight_iter = iter(self.weights)
            with poptorch.Block("phase0_ipu0"):
                halves = torch.split(in0, size)
            phase_idx = 0
            for step in range(N * 3):
                results = []
                for ipu_idx in range(2):
                    with poptorch.Block(f"phase{phase_idx}_ipu{ipu_idx}"):
                        product = torch.matmul(next(weight_iter),
                                               halves[ipu_idx])
                        results.append(F.relu(product))
                # Cross the two branches over for the next step.
                halves = results[1], results[0]
                # We want 2 matmuls in the same phase
                if step % 3 != 1:
                    phase_idx += 1
            with poptorch.Block(f"phase{N*2-1}_ipu1"):
                res = halves[0] + halves[1]
                if target is not None:
                    loss = torch.nn.L1Loss(reduction="mean")(res, target)
                    return res, loss
                return res

    inputs = torch.rand(size * 2, 1)
    target = torch.rand(size, 1)
    model = Model()

    # Alternate between 0-2 and 1-3: even phases use IPUs 0 and 2, odd
    # phases use IPUs 1 and 3.
    phases = []
    for phase_idx in range(2 * N):
        parity = phase_idx % 2
        phases.append([
            poptorch.Stage(f"phase{phase_idx}_ipu0").ipu(parity),
            poptorch.Stage(f"phase{phase_idx}_ipu1").ipu(parity + 2)
        ])

    options = poptorch.Options()
    options.setExecutionStrategy(poptorch.ParallelPhasedExecution(*phases))
    training_model = poptorch.trainingModel(model, options)
    training_model.compile(inputs, target)

    LogChecker(capfd).validate_2x2_parallel_phased_execution()


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_2x2_parallel_phased_execution_small_opts(capfd):
    """Compile a small 2-phase parallel phased model using auto block IDs.

    The model declares six anonymous blocks (after ``useAutoId()``); the
    execution strategy refers to them as "0" .. "5" and groups them into two
    phases, whose IPUs are set with ``strategy.phase(n).ipus(...)``.
    """
    size = 10

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.weights = torch.nn.ParameterList([
                torch.nn.Parameter(torch.rand(size, size), requires_grad=True)
                for n in range(6)
            ])

        def forward(self, in0, target=None):
            # The blocks below are created without explicit IDs; the
            # strategy at the end of the test addresses them as "0" .. "5".
            poptorch.Block.useAutoId()
            weight = iter(self.weights)

            # Phase 0 / ipu 0
            with poptorch.Block():
                in0, in1 = torch.split(in0, size)
                x = torch.matmul(next(weight), in0)
                out0 = F.relu(x)

            # Phase 0 / ipu 2
            with poptorch.Block():
                x = torch.matmul(next(weight), in1)
                out1 = F.relu(x)

            # Cross the two branches over.
            in0, in1 = out1, out0

            # Phase 1 / ipu 1
            with poptorch.Block():
                x = torch.matmul(next(weight), in0)
                out0 = F.relu(x)

            # Phase 1 / ipu 3
            with poptorch.Block():
                x = torch.matmul(next(weight), in1)
                out1 = F.relu(x)

            # Cross the two branches over again.
            in0, in1 = out1, out0

            # Phase 1 / ipu 1 - part 2
            with poptorch.Block():
                x = torch.matmul(next(weight), in0)
                out0 = F.relu(x)

            # Phase 1 / ipu 3 - part 2
            with poptorch.Block():
                x = torch.matmul(next(weight), in1)
                out1 = F.relu(x)
                res = out0 + out1
                if target is None:
                    return res
                return res, torch.nn.L1Loss(reduction="mean")(res, target)

    input = torch.rand(size * 2, 1)
    target = torch.rand(size, 1)
    model = Model()
    # First list: blocks "0" and "1"; second list: blocks "2"+"4" and
    # "3"+"5" (the "part 2" blocks share a stage with their first part).
    strategy = poptorch.ParallelPhasedExecution(
        [poptorch.Stage("0"), poptorch.Stage("1")],
        [poptorch.Stage("2", "4"),
         poptorch.Stage("3", "5")])
    # Alternate between 0-2 and 1-3
    strategy.phase(0).ipus(0, 2)
    strategy.phase(1).ipus(1, 3)

    opts = poptorch.Options()
    opts.setExecutionStrategy(strategy)
    poptorch_model = poptorch.trainingModel(model, opts)
    poptorch_model.compile(input, target)

    testlog = LogChecker(capfd)
    testlog.validate_2x2_parallel_phased_execution_small()


@pytest.mark.parametrize("liveness", list(poptorch.Liveness))
@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_serial_tensor_liveness(capfd, liveness):
    """Compile a 3-block serial phased model for each tensor liveness.

    All three blocks are placed on IPU 0; the captured log is validated by
    ``LogChecker.validate_serial_tensor_liveness``.
    """
    block_ids = ("B1", "B2", "B3")

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.fc1 = torch.nn.Linear(8, 7)
            self.fc2 = torch.nn.Linear(7, 6)
            self.fc3 = torch.nn.Linear(6, 5)

        def forward(self, x):
            # One linear layer per block, applied in order.
            layers = (self.fc1, self.fc2, self.fc3)
            for block_id, layer in zip(block_ids, layers):
                with poptorch.Block(block_id):
                    x = layer(x)
            return x

    strategy = poptorch.SerialPhasedExecution(*block_ids)
    for block_id in block_ids:
        strategy.stage(block_id).ipu(0)
    strategy.setTensorsLiveness(liveness)

    options = poptorch.Options()
    options.setExecutionStrategy(strategy)

    inference_model = poptorch.inferenceModel(Model(), options)
    inference_model.compile(torch.randn(8))

    LogChecker(capfd).validate_serial_tensor_liveness(liveness)


def test_phased_api():
    """Construct SerialPhasedExecution from each supported argument form.

    Nothing is compiled or run: the test passes as long as every
    construction succeeds.
    """
    # Phase objects
    phase_objects = (poptorch.Phase('layer1'), poptorch.Phase('layer2'))
    poptorch.SerialPhasedExecution(*phase_objects)

    # Stage objects
    stage_objects = (poptorch.Stage('layer1'), poptorch.Stage('layer2'))
    poptorch.SerialPhasedExecution(*stage_objects)

    # Lists of Stage objects
    first_stages = [poptorch.Stage('layer1'), poptorch.Stage('layer1.b')]
    second_stages = [poptorch.Stage('layer2'), poptorch.Stage('layer2.b')]
    poptorch.SerialPhasedExecution(first_stages, second_stages)

    # Lists of block IDs
    block_id_lists = (["layer1"], ["layer2"])
    poptorch.SerialPhasedExecution(*block_id_lists)


================================================
FILE: tests/pipelining_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import copy
import io
import json
import re
import subprocess
import tempfile
import torch
import pytest
import helpers
import poptorch


@helpers.overridePoptorchLogLevel("DEBUG")
def test_missing_block():
    """Compiling a model with an op outside any Block must raise an error.

    Also checks that the dispatcher is no longer reported as active once the
    compilation has failed.
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            poptorch.Block.useAutoId()
            with poptorch.Block(ipu_id=0):
                inside = x * 4
            # This multiplication is deliberately outside of any block.
            outside = inside * 4
            return outside

    options = poptorch.Options()
    options.deviceIterations(2)
    strategy = poptorch.PipelinedExecution(poptorch.AutoStage.AutoIncrement)
    options.setExecutionStrategy(strategy)

    inference_model = poptorch.inferenceModel(Model(), options)
    with pytest.raises(poptorch.Error, match="No active Block"):
        inference_model.compile(torch.randn(2, 5))

    still_compiling = poptorch.poptorch_core.isCompilingWithDispatcher()
    assert not still_compiling, (
        "[Internal] Clean up failed: dispatcher still active")


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
@pytest.mark.parametrize("use_scope", [True, False])
def test_api_inline(capfd, use_scope):
    """Two inline blocks on different IPUs must produce a 2-stage pipeline.

    The blocks are declared either with ``with poptorch.Block(...)`` scopes
    or with ``poptorch.Block.start(...)`` calls, depending on ``use_scope``.
    """
    if not use_scope:

        class Model(torch.nn.Module):
            def forward(self, x):
                poptorch.Block.useAutoId()
                poptorch.Block.start(ipu_id=0)
                scaled = x * 4
                poptorch.Block.start(ipu_id=1)
                scaled = scaled * 2
                return scaled
    else:

        class Model(torch.nn.Module):
            def forward(self, x):
                poptorch.Block.useAutoId()
                with poptorch.Block(ipu_id=0):
                    scaled = x * 4
                with poptorch.Block(ipu_id=1):
                    scaled = scaled * 2
                return scaled

    options = poptorch.Options()
    options.deviceIterations(2)

    inference_model = poptorch.inferenceModel(Model(), options)
    inference_model(torch.randn(2, 5))

    log = helpers.LogChecker(capfd)
    log.assert_contains("enablePipelining set to value 1")
    expected_placements = (
        (" Mul:0 ", " mode(Pipelined), ipu(0), stage(0)"),
        (" Mul:0/1 ", " mode(Pipelined), ipu(1), stage(1)"),
    )
    for op_name, placement in expected_placements:
        log.assert_contains(op_name, placement)


@helpers.overridePoptorchLogLevel("DEBUG")
def run_recomputation_checkpoint_test(size, model_cls, exp_num_stash_ckpted):
    """Compare the PopART IR of a model with and without checkpoints.

    ``model_cls(False)`` must compile to an IR with no node named
    "Checkpoint" and exactly one "Stash" node; ``model_cls(True)`` must
    compile to an IR with at least one "Checkpoint" node and
    ``exp_num_stash_ckpted`` "Stash" nodes.

    Args:
        size: Second dimension of the (batch, size, 1) inputs and targets.
        model_cls: Callable taking a single bool (use checkpoints or not) and
            returning the model to train.
        exp_num_stash_ckpted: Expected number of "Stash" nodes when
            checkpoints are used.
    """
    # pylint: disable=protected-access
    dev_its = 2
    grad_accum = 3
    batch = dev_its * grad_accum

    opts = poptorch.Options()
    opts.deviceIterations(dev_its)
    opts.Training.gradientAccumulation(grad_accum)
    opts._Popart.set("autoRecomputation", 3)  # All forward pipeline stages.

    def compile_and_load_ir(model):
        # Compile for training and return the parsed PopART IR.
        training_model = poptorch.trainingModel(model, opts)
        training_model.compile(torch.randn(batch, size, 1),
                               torch.randn(batch, size, 1))
        return json.loads(training_model._debugGetPopartIR())

    def has_checkpoint(ir):
        return any("Checkpoint" in node["name"] for node in ir["maingraph"])

    def count_stashes(ir):
        return sum("Stash" in node["type"] for node in ir["maingraph"])

    plain_ir = compile_and_load_ir(model_cls(False))
    assert not has_checkpoint(plain_ir), (
        "Popart IR shouldn't contain any checkpoint")
    assert count_stashes(plain_ir) == 1, (
        "Only the graph input should be stashed")

    native_ckpted = model_cls(True)
    ckpted_ir = compile_and_load_ir(native_ckpted)
    assert has_checkpoint(ckpted_ir), ("Popart IR should contain a checkpoint")
    assert count_stashes(ckpted_ir) == exp_num_stash_ckpted, (
        "Both the graph input and the checkpoint(s) "
        "should be stashed")


def test_recomputation_checkpoint_tensor():
    """Checkpoint a single tensor between two matmuls (currently skipped)."""
    pytest.skip("TODO(T65559): AssertionError: Popart IR should contain a "
                "checkpoint")
    size = 3

    class Model(torch.nn.Module):
        def __init__(self, checkpoint=False):
            super().__init__()
            self.checkpoint = checkpoint
            # Assigning a Parameter to an attribute registers it on the
            # module under that name.
            self.weight = torch.nn.Parameter(torch.rand(size, size),
                                             requires_grad=True)

        def forward(self, x, target):
            poptorch.Block.useAutoId()
            with poptorch.Block(ipu_id=0):
                hidden = torch.matmul(self.weight, x)
                if self.checkpoint:
                    hidden = poptorch.recomputationCheckpoint(hidden)
                hidden = torch.matmul(self.weight, hidden)

            with poptorch.Block(ipu_id=1):
                out = hidden * 2
                return out, torch.nn.functional.l1_loss(out, target)

    # Expected stashes with checkpointing enabled: 2.
    run_recomputation_checkpoint_test(size, Model, 2)


def test_recomputation_checkpoint_tensor_two_inputs():
    """Checkpoint two tensors passed as separate arguments (skipped)."""
    pytest.skip("TODO(T65559): AssertionError: Popart IR should contain a "
                "checkpoint")
    size = 3

    class Model(torch.nn.Module):
        def __init__(self, checkpoint=False):
            super().__init__()
            self.checkpoint = checkpoint
            # Assigning a Parameter to an attribute registers it on the
            # module under that name.
            self.weight_1 = torch.nn.Parameter(torch.rand(size, size),
                                               requires_grad=True)
            self.weight_2 = torch.nn.Parameter(torch.rand(size, size),
                                               requires_grad=True)

        def forward(self, x, target):
            poptorch.Block.useAutoId()
            with poptorch.Block(ipu_id=0):
                first = torch.matmul(self.weight_1, x)
                second = torch.matmul(self.weight_2, first)

                if self.checkpoint:
                    first, second = poptorch.recomputationCheckpoint(
                        first, second)
                merged = torch.matmul(self.weight_1, first + second)

            with poptorch.Block(ipu_id=1):
                out = merged * 2
                return out, torch.nn.functional.l1_loss(out, target)

    # Expected stashes with checkpointing enabled: 3.
    run_recomputation_checkpoint_test(size, Model, 3)


def test_recomputation_checkpoint_tensor_tuple_inputs():
    """Checkpoint two tensors passed as a single tuple (skipped)."""
    pytest.skip("TODO(T65559): AssertionError: Popart IR should contain a "
                "checkpoint")
    size = 3

    class Model(torch.nn.Module):
        def __init__(self, checkpoint=False):
            super().__init__()
            self.checkpoint = checkpoint
            # Assigning a Parameter to an attribute registers it on the
            # module under that name.
            self.weight_1 = torch.nn.Parameter(torch.rand(size, size),
                                               requires_grad=True)
            self.weight_2 = torch.nn.Parameter(torch.rand(size, size),
                                               requires_grad=True)

        def forward(self, x, target):
            poptorch.Block.useAutoId()
            with poptorch.Block(ipu_id=0):
                first = torch.matmul(self.weight_1, x)
                second = torch.matmul(self.weight_2, first)

                if self.checkpoint:
                    # Same as the two-argument form, but with a tuple.
                    first, second = poptorch.recomputationCheckpoint(
                        (first, second))
                merged = torch.matmul(self.weight_1, first + second)

            with poptorch.Block(ipu_id=1):
                out = merged * 2
                return out, torch.nn.functional.l1_loss(out, target)

    # Expected stashes with checkpointing enabled: 3.
    run_recomputation_checkpoint_test(size, Model, 3)


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_api_wrap(capfd):
    """
    stage "0" ipu(0) stage(0) l1 l2

    Both layers are wrapped with BeginBlock on IPU 0, so pipelining is
    expected to stay disabled.
    """

    class Block(torch.nn.Module):
        def forward(self, x):
            return x * 6

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.l1 = Block()
            self.l2 = Block()

        def forward(self, x):
            return self.l2(self.l1(x))

    model = Model()
    for layer in (model.l1, model.l2):
        poptorch.BeginBlock(layer, ipu_id=0)

    options = poptorch.Options()
    options.deviceIterations(2)

    inference_model = poptorch.inferenceModel(model, options)
    inference_model(torch.randn(2, 5))

    log = helpers.LogChecker(capfd)
    log.assert_contains("enablePipelining set to value 0")
    for op_name in (" l1/Mul:0 ", " l2/Mul:0 "):
        log.assert_contains(op_name, " mode(Pipelined), ipu(0), stage(0)")


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_api_wrap_2stages(capfd):
    """
    stage "0" ipu(0) stage(0) l0
    stage "1" ipu(1) stage(1) l1 / l2

    Only l1 and l2 are wrapped with BeginBlock (both on IPU 1); l0 is left
    unwrapped.
    """

    class Block(torch.nn.Module):
        def forward(self, x):
            return x * 6

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.l0 = Block()
            self.l1 = Block()
            self.l2 = Block()

        def forward(self, x):
            return self.l2(self.l1(self.l0(x)))

    model = Model()
    for layer in (model.l1, model.l2):
        poptorch.BeginBlock(layer, ipu_id=1)

    options = poptorch.Options()
    options.deviceIterations(2)

    inference_model = poptorch.inferenceModel(model, options)
    inference_model(torch.randn(2, 5))

    log = helpers.LogChecker(capfd)
    log.assert_contains("enablePipelining set to value 1")
    expected_placements = (
        (" l0/Mul:0 ", " mode(Pipelined), ipu(0), stage(0)"),
        (" l1/Mul:0 ", " mode(Pipelined), ipu(1), stage(1)"),
        (" l2/Mul:0 ", " mode(Pipelined), ipu(1), stage(1)"),
    )
    for op_name, placement in expected_placements:
        log.assert_contains(op_name, placement)


def test_begin_block_printing():
    """Check that BeginBlock annotations appear in the module's repr.

    The repr returned by ``poptorch.module_repr`` must contain no
    "BeginBlock(" before any layer is wrapped, then one annotation per
    wrapped layer. The ``repr()`` of the ``poptorch.inferenceModel`` wrapper
    must show the annotations too.
    """

    class Block(torch.nn.Module):
        def forward(self, x):
            return x * 6

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.l1 = Block()
            self.l2 = Block()

        def forward(self, x):
            x = self.l1(x)
            x = self.l2(x)
            return x

    m = Model()

    begin_l1 = re.compile(r'\(l1\):\s*BeginBlock\(user_id=None, ipu_id=1\)')
    begin_l2 = re.compile(r'\(l2\):\s*BeginBlock\(user_id=None, ipu_id=1\)')

    # Nothing wrapped yet.
    module_repr = poptorch.module_repr(m)
    assert "BeginBlock(" not in module_repr
    assert not begin_l1.search(module_repr)
    assert not begin_l2.search(module_repr)

    poptorch.BeginBlock(m.l1, ipu_id=1)

    # Only l1 is wrapped.
    module_repr = poptorch.module_repr(m)
    assert begin_l1.search(module_repr)
    assert not begin_l2.search(module_repr)

    poptorch.BeginBlock(m.l2, ipu_id=1)

    # Both layers are wrapped.
    module_repr = poptorch.module_repr(m)
    assert begin_l1.search(module_repr)
    assert begin_l2.search(module_repr)

    opts = poptorch.Options()
    opts.deviceIterations(2)

    # The annotations must also show up in the repr of the wrapped model.
    module_repr = repr(poptorch.inferenceModel(m, opts))
    assert begin_l1.search(module_repr)
    assert begin_l2.search(module_repr)


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_inline_AutoIncrement(capfd):
    """Inline blocks with AutoStage.AutoIncrement get consecutive stages.

    Four blocks are declared on IPUs 0, 1, 2 and 1; each block is expected
    in its own pipeline stage, numbered from 0 in declaration order (the
    same numbering ``test_api_AutoIncrement`` expects for the same strategy).
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            poptorch.Block.useAutoId()
            with poptorch.Block(ipu_id=0):
                x = x * 2
            with poptorch.Block(ipu_id=1):
                x = x * 3
            with poptorch.Block(ipu_id=2):
                x = x * 4
            with poptorch.Block(ipu_id=1):
                x = x * 5
            return x

    m = Model()

    opts = poptorch.Options()
    # The model uses 3 IPUs, so let PopTorch round the number up.
    opts.deviceIterations(4).autoRoundNumIPUs(True)
    opts.setExecutionStrategy(
        poptorch.PipelinedExecution(poptorch.AutoStage.AutoIncrement))

    m = poptorch.inferenceModel(m, opts)
    m.compile(torch.randn(4, 5))

    log = helpers.LogChecker(capfd)
    log.assert_contains("enablePipelining set to value 1")
    # Stage indices start at 0: the first block is stage 0, not stage 1.
    log.assert_contains(" Mul:0 ", " mode(Pipelined), ipu(0), stage(0)")
    log.assert_contains(" Mul:0/1 ", " mode(Pipelined), ipu(1), stage(1)")
    log.assert_contains(" Mul:0/2 ", " mode(Pipelined), ipu(2), stage(2)")
    log.assert_contains(" Mul:0/3 ", " mode(Pipelined), ipu(1), stage(3)")


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_api_AutoIncrement(capfd):
    """BeginBlock-wrapped layers with AutoIncrement get consecutive stages.

    l2, l3 and l4 are wrapped on IPUs 1, 2 and 1; l1 is left unwrapped. Each
    layer is expected in its own stage, numbered 0 to 3.
    """

    class Block(torch.nn.Module):
        def forward(self, x):
            return x * 6

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.l1 = Block()
            self.l2 = Block()
            self.l3 = Block()
            self.l4 = Block()

        def forward(self, x):
            for layer in (self.l1, self.l2, self.l3, self.l4):
                x = layer(x)
            return x

    model = Model()
    # The result of BeginBlock is assigned back to the same attribute.
    for layer_name, ipu in (("l2", 1), ("l3", 2), ("l4", 1)):
        wrapped = poptorch.BeginBlock(getattr(model, layer_name), ipu_id=ipu)
        setattr(model, layer_name, wrapped)

    options = poptorch.Options()
    options.deviceIterations(4).autoRoundNumIPUs(True)
    strategy = poptorch.PipelinedExecution(poptorch.AutoStage.AutoIncrement)
    options.setExecutionStrategy(strategy)

    inference_model = poptorch.inferenceModel(model, options)
    inference_model(torch.randn(4, 5))

    log = helpers.LogChecker(capfd)
    log.assert_contains("enablePipelining set to value 1")
    expected_placements = (
        (" l1/Mul:0 ", " mode(Pipelined), ipu(0), stage(0)"),
        (" l2/Mul:0 ", " mode(Pipelined), ipu(1), stage(1)"),
        (" l3/Mul:0 ", " mode(Pipelined), ipu(2), stage(2)"),
        (" l4/Mul:0 ", " mode(Pipelined), ipu(1), stage(3)"),
    )
    for op_name, placement in expected_placements:
        log.assert_contains(op_name, placement)


@pytest.mark.ipuHardwareRequired
def test_ipu_round_up_error():
    """Running a 3-IPU model without autoRoundNumIPUs must raise an error.

    The expected error message explains that 4 IPUs would have to be
    reserved and points at ``autoRoundNumIPUs(True)``.
    """

    class Block(torch.nn.Module):
        def forward(self, x):
            return x * 6

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.l1 = Block()
            self.l2 = Block()
            self.l3 = Block()

        def forward(self, x):
            for layer in (self.l1, self.l2, self.l3):
                x = layer(x)
            return x

    model = Model()
    # One layer per IPU: l1 -> 0, l2 -> 1, l3 -> 2.
    for ipu, layer in enumerate((model.l1, model.l2, model.l3)):
        poptorch.BeginBlock(layer, ipu_id=ipu)

    options = poptorch.Options()
    strategy = poptorch.PipelinedExecution(poptorch.AutoStage.AutoIncrement)
    options.setExecutionStrategy(strategy)

    inference_model = poptorch.inferenceModel(model, options)

    expected_error = (
        ".+The model specifies the use of 3 IPUs, however PopTorch must "
        "reserve a minimum of 4 in order to allow the model to run, "
        "because PopTorch must reserve a power of 2 or maximum of 64 IPUs per "
        r"process\. Please reconfigure your model to use a different number of "
        r"IPUs or set poptorch\.Options\(\)\.autoRoundNumIPUs\(True\)\.")
    with pytest.raises(poptorch.Error, match=expected_error):
        inference_model(torch.randn(4, 5))


class BlockFnModel(torch.nn.Module):
    """Model whose two steps are methods decorated with BlockFunction.

    ``mult_4`` is assigned to IPU 0 and ``mult_2`` to IPU 1.
    """

    @poptorch.BlockFunction(ipu_id=0)
    def mult_4(self, x):
        return x * 4

    @poptorch.BlockFunction(ipu_id=1)
    def mult_2(self, x):
        return x * 2

    def forward(self, x):
        poptorch.Block.useAutoId()
        quadrupled = self.mult_4(x)
        return self.mult_2(quadrupled)


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_block_function(capfd):
    """BlockFunction-decorated methods must produce a 2-stage pipeline."""
    options = poptorch.Options()
    options.deviceIterations(2)

    inference_model = poptorch.inferenceModel(BlockFnModel(), options)
    inference_model(torch.randn(2, 5))

    log = helpers.LogChecker(capfd)
    log.assert_contains("enablePipelining set to value 1")
    expected_placements = (
        (" Mul:0 ", " mode(Pipelined), ipu(0), stage(0)"),
        (" Mul:0/1 ", " mode(Pipelined), ipu(1), stage(1)"),
    )
    for op_name, placement in expected_placements:
        log.assert_contains(op_name, placement)


def test_block_function_saving():
    """A wrapped model using BlockFunction methods can be torch.save'd."""
    inference_model = poptorch.inferenceModel(BlockFnModel())

    with tempfile.TemporaryFile() as out_file:
        torch.save(inference_model, out_file)


def test_begin_block_functionality():
    """Check that BeginBlock leaves the wrapped modules otherwise usable.

    After wrapping the two sub-modules, the parameter names and state dict
    must be unchanged, the old state dict must load strictly, ``dir()`` must
    work, new buffers and parameters can be registered and the state dict can
    still be saved. The model is never compiled or run.
    """

    class Block(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.relu = torch.nn.ReLU()

            self.l1 = torch.nn.Linear(3, 5)
            self.l2 = torch.nn.Linear(5, 5)
            self.l3 = torch.nn.Linear(5, 3)

        def forward(self, x):
            x = self.relu(self.l1(x))
            x = self.relu(self.l2(x))
            x = self.relu(self.l3(x))
            return x

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.l1 = Block()
            self.l2 = Block()

        def forward(self, x):
            x = self.l1(x)
            with poptorch.Block(ipu_id=2):
                x = self.l2(x)
            return x

    m = Model()

    # Snapshot taken before any BeginBlock call, for comparison below.
    old_all_names = [n for n, _ in m.named_parameters()]
    old_state_dict = m.state_dict()

    m_l1_wrapped = poptorch.BeginBlock(m.l1, ipu_id=1)

    # The return is for backward compatibility
    assert m_l1_wrapped is m.l1

    # l2 has not been wrapped yet, so it must still be a plain Block.
    assert m.l2.__class__ is Block
    poptorch.BeginBlock(m.l2, ipu_id=2)

    new_all_names = [n for n, _ in m.named_parameters()]
    new_state_dict = m.state_dict()

    assert old_all_names == new_all_names

    sorted_state_dict_keys = sorted(old_state_dict.keys())
    assert sorted_state_dict_keys == sorted(new_state_dict.keys())

    for k in sorted_state_dict_keys:
        helpers.assert_allequal(expected=old_state_dict[k],
                                actual=new_state_dict[k])

    # Strict=True is a sufficient test in itself
    m.load_state_dict(old_state_dict, strict=True)

    # Test dir does not raise an exception
    dir(m.l1)

    # Test registering a buffer
    m.l1.register_buffer("a_buff",
                         torch.nn.parameter.Parameter(torch.zeros(2, 2)))

    buffer_names = [b[0] for b in m.named_buffers()]
    assert "l1.a_buff" in buffer_names

    # Test registering a param
    m.l1.register_parameter("a_param",
                            torch.nn.parameter.Parameter(torch.zeros(2, 2)))

    param_names = [p[0] for p in m.named_parameters()]
    assert "l1.a_param" in param_names

    # Test the model can still be saved
    f = io.BytesIO()
    torch.save(m.state_dict(), f)


def run_in_python_and_get_block_details(model_file_path):
    """Load a saved model in a fresh interpreter and return its repr.

    A new ``python3`` process imports poptorch and torch, loads the model
    stored at ``model_file_path`` with ``torch.load`` and prints
    ``poptorch.module_repr`` of it.

    Args:
        model_file_path: Path (str) of the file written by ``torch.save``.

    Returns:
        The standard output of the child process, decoded as UTF-8.

    Raises:
        subprocess.TimeoutExpired: If the child does not finish within 10
            seconds.
    """
    # repr() yields a valid Python string literal, so paths containing
    # quotes or backslashes cannot break the generated script.
    python_script = ("import poptorch\n"
                     "import torch\n"
                     f"with open({model_file_path!r}, \"rb\") as f:\n"
                     "    m = torch.load(f)\n"
                     "print(poptorch.module_repr(m))\n").encode('utf-8')

    # subprocess.run() kills and reaps the child on timeout and closes the
    # pipes, none of which a bare Popen + communicate() does.
    completed = subprocess.run(["python3"],
                               input=python_script,
                               stdout=subprocess.PIPE,
                               timeout=10,
                               check=False)

    return completed.stdout.decode("utf-8")


def test_saving_of_begin_block():
    """A BeginBlock annotation must survive torch.save / torch.load.

    The model is saved and reloaded in a separate interpreter both before
    and after being wrapped; saving must also leave the class of the wrapped
    model unchanged.
    """
    model = torch.nn.Sequential(torch.nn.Conv2d(3, 10, 5), torch.nn.ReLU(),
                                torch.nn.Conv2d(10, 10, 5), torch.nn.ReLU())

    def save_and_get_reloaded_repr(module):
        # Save to a named file so that the child process can open it.
        with tempfile.NamedTemporaryFile() as model_file:
            torch.save(module, model_file)
            return run_in_python_and_get_block_details(model_file.name)

    assert 'Sequential(' in save_and_get_reloaded_repr(model)

    poptorch.BeginBlock(model, user_id=1, ipu_id=2)

    class_before_save = model.__class__
    torch.save(model, io.BytesIO())
    assert model.__class__ == class_before_save

    reloaded_repr = save_and_get_reloaded_repr(model)
    assert 'BeginBlock(user_id=1, ipu_id=2)' in reloaded_repr


def test_begin_block_copy():
    """BeginBlock annotations must be kept by copy.copy and copy.deepcopy."""
    first_block = torch.nn.Sequential(torch.nn.Conv2d(4, 8, 3),
                                      torch.nn.ReLU(),
                                      torch.nn.Conv2d(8, 10, 3),
                                      torch.nn.ReLU())
    second_block = torch.nn.Sequential(torch.nn.Conv2d(10, 5, 5),
                                       torch.nn.ReLU(),
                                       torch.nn.Conv2d(5, 10, 5),
                                       torch.nn.ReLU())

    poptorch.BeginBlock(first_block, user_id=1, ipu_id=1)
    poptorch.BeginBlock(second_block, user_id=2, ipu_id=2)

    def assert_annotated(sequence):
        # Both children must still show their own annotation.
        first_repr = poptorch.module_repr(sequence[0])
        assert "BeginBlock(user_id=1, ipu_id=1)" in first_repr
        second_repr = poptorch.module_repr(sequence[1])
        assert "BeginBlock(user_id=2, ipu_id=2)" in second_repr

    model = torch.nn.Sequential(first_block, second_block)
    assert_annotated(model)

    shallow = copy.copy(model)
    assert_annotated(shallow)

    deep = copy.deepcopy(model)
    assert_annotated(deep)


def model_fn(inputs):
    """Return ``inputs`` plus 1.0 (used as a plain-function model)."""
    incremented = inputs + 1.0
    return incremented


def test_begin_block_with_function():
    """BeginBlock on a plain function keeps its IDs and can be saved."""
    # pylint: disable=protected-access
    # Legacy use: a function instead of a module, positional IDs.
    legacy_block = poptorch.BeginBlock(model_fn, 1, 2)

    assert legacy_block._user_id == 1
    assert legacy_block._ipu_id == 2

    with tempfile.TemporaryFile() as out_file:
        torch.save(legacy_block, out_file)


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_removeBlocks(capfd):
    """removeBlocks must undo BeginBlock and allow wrapping again.

    The model is compiled three times: with blocks (pipelined), after
    ``poptorch.removeBlocks`` (not pipelined) and with the same blocks
    re-applied (pipelined again). Wrapping an already wrapped module must
    raise an error.
    """

    class Block(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.l1 = torch.nn.ReLU()
            self.l2 = torch.nn.ReLU()
            self.l3 = torch.nn.ReLU()

        def forward(self, x):
            for layer in (self.l1, self.l2, self.l3):
                x = layer(x)
            return x

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.b1 = Block()
            self.b2 = Block()

        def forward(self, x):
            return self.b2(self.b1(x))

    relu_ops = (
        " b1/l1/Relu:0 ",
        " b1/l2/Relu:0 ",
        " b1/l3/Relu:0 ",
        " b2/l1/Relu:0 ",
        " b2/l2/Relu:0 ",
        " b2/l3/Relu:0 ",
    )
    # Expected placement of each op of relu_ops when the blocks are set.
    pipelined_placements = (
        " mode(Pipelined), ipu(0), stage(0)",
        " mode(Pipelined), ipu(1), stage(1)",
        " mode(Pipelined), ipu(1), stage(1)",
        " mode(Pipelined), ipu(2), stage(2)",
        " mode(Pipelined), ipu(2), stage(2)",
        " mode(Pipelined), ipu(2), stage(2)",
    )

    def compile_model(module):
        options = poptorch.Options()
        options.deviceIterations(10)
        if poptorch.ipuHardwareIsAvailable():
            options.useOfflineIpuTarget()
        poptorch.inferenceModel(module, options).compile(torch.randn(10, 3))

    def assert_is_not_pipelined(module):
        compile_model(module)
        log = helpers.LogChecker(capfd)
        log.assert_contains("enablePipelining set to value 0")
        for op_name in relu_ops:
            log.assert_contains(op_name,
                                " mode(Pipelined), ipu(0), stage(0)")

    def assert_is_pipelined(module):
        compile_model(module)
        log = helpers.LogChecker(capfd)
        log.assert_contains("enablePipelining set to value 1")
        for op_name, placement in zip(relu_ops, pipelined_placements):
            log.assert_contains(op_name, placement)

    def add_blocks(module):
        poptorch.BeginBlock(module.b1.l2, ipu_id=1)
        poptorch.BeginBlock(module.b2, ipu_id=2)

    model = Model()

    add_blocks(model)
    assert_is_pipelined(model)

    # A module can only be assigned to a block once.
    with pytest.raises(poptorch.Error,
                       match="module has already been assigned to a block"):
        poptorch.BeginBlock(model.b1.l2, ipu_id=1)

    poptorch.removeBlocks(model)
    assert_is_not_pipelined(model)

    # Once removed, the same blocks can be applied again.
    add_blocks(model)
    assert_is_pipelined(model)


================================================
FILE: tests/pooling_and_padding_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import pytest
import torch
import helpers
import poptorch

# Pools
pool_operators = [
    # Max pooling and the matching unpooling layers.
    torch.nn.MaxPool1d,
    torch.nn.MaxPool2d,
    torch.nn.MaxPool3d,
    torch.nn.MaxUnpool1d,
    torch.nn.MaxUnpool2d,
    torch.nn.MaxUnpool3d,
    # Average pooling.
    torch.nn.AvgPool1d,
    torch.nn.AvgPool2d,
    torch.nn.AvgPool3d,
    # Fractional max and power-average pooling.
    torch.nn.FractionalMaxPool2d,
    torch.nn.LPPool1d,
    torch.nn.LPPool2d,
    # Adaptive pooling.
    torch.nn.AdaptiveMaxPool1d,
    torch.nn.AdaptiveMaxPool2d,
    torch.nn.AdaptiveMaxPool3d,
    torch.nn.AdaptiveAvgPool1d,
    torch.nn.AdaptiveAvgPool2d,
    torch.nn.AdaptiveAvgPool3d,
]

# Supported.
pool_1D = [torch.nn.MaxPool1d, torch.nn.AvgPool1d]
pool_2D = [torch.nn.MaxPool2d, torch.nn.AvgPool2d]
pool_3D = [torch.nn.MaxPool3d, torch.nn.AvgPool3d]

# Pairs of (op, number of output dims).
adaptive_avg_pool = [(op, n_dims) for n_dims, op in enumerate(
    (
        torch.nn.AdaptiveAvgPool1d,
        torch.nn.AdaptiveAvgPool2d,
        torch.nn.AdaptiveAvgPool3d,
    ),
    start=1,
)]
# Adaptive max pooling (e.g. torch.nn.AdaptiveMaxPool2d) isn't supported
# because it returns 2 outputs; this should be an easy fix.
# TODO (T22978)

# TODO(T25617): PopART does not support PadGradOp when mode is not "constant".
# Ops without grad implementations in PopART. This must stay a tuple because
# it is passed directly to isinstance().
ops_grad_unsupported = (
    torch.nn.ReflectionPad1d,
    torch.nn.ReflectionPad2d,
    torch.nn.ReplicationPad1d,
    torch.nn.ReplicationPad2d,
    torch.nn.ReplicationPad3d,
)


def execute_and_check_wrapper(op, input, check_shape_only=False):
    """Run ``op`` natively and through PopTorch and compare the outputs.

    ``op`` is wrapped in ``helpers.ModelWithWeights``. Ops that are instances
    of ``ops_grad_unsupported`` are run with ``poptorch.inferenceModel``;
    everything else is run with ``poptorch.trainingModel`` and is also
    checked with ``assert_weights_changed()``.

    When ``check_shape_only`` is true only the output sizes are compared,
    otherwise the values are compared with ``helpers.assert_allclose``.
    """
    model = helpers.ModelWithWeights(op, input.shape)

    # Reference result (run on CPU).
    native_out, _ = model((input, ))

    test_training = not isinstance(op, ops_grad_unsupported)

    # Run on IPU.
    if test_training:
        poptorch_model = poptorch.trainingModel(model)
    else:
        poptorch_model = poptorch.inferenceModel(model)

    poptorch_out, _ = poptorch_model((input, ))

    if check_shape_only:
        # Adaptive pooling's process is essentially an implementation
        # detail, so only the shapes can be compared.
        assert poptorch_out.size() == native_out.size()
    else:
        # Inference test - check outputs
        helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    if test_training:
        # Training test - check weights have changed
        poptorch_model.assert_weights_changed()


@pytest.mark.parametrize("op", pool_2D)
def test_pool2D(op):
    """Check 2D pooling ops against native PyTorch for several window setups."""
    torch.manual_seed(42)

    input = torch.randn(1, 2, 10, 10)

    # (positional args, keyword args) for each pooling configuration, in
    # the order they are exercised.
    configurations = [
        # pool of square window of size=3, stride=2
        ((3, ), {"stride": 2}),
        # pool of square window of size=3, stride=2, ceil_mode=True
        ((3, ), {"stride": 2, "ceil_mode": True}),
        # pool of non-square window
        (((3, 2), ), {"stride": (2, 1)}),
        # pool of square window of size=3, stride=2, padding=1
        ((3, ), {"stride": 2, "padding": 1}),
    ]

    if op is torch.nn.AvgPool2d:
        # pool of square window of size=3, stride=2, padding=1, pool
        # excludes padding
        configurations.append(((3, ), {
            "stride": 2,
            "padding": 1,
            "count_include_pad": False
        }))

    for args, kwargs in configurations:
        execute_and_check_wrapper(op(*args, **kwargs), input)


@pytest.mark.parametrize("op, n_output_dims", adaptive_avg_pool)
def test_adaptive_avg_pool(op, n_output_dims):
    """Check adaptive average pooling where inputs divide evenly by outputs.

    AdaptiveAvgPool1d: [1, 2, 4]       -> [1, 2, 2]
    AdaptiveAvgPool2d: [1, 2, 4, 6]    -> [1, 2, 2, 3]
    AdaptiveAvgPool3d: [1, 2, 4, 6, 8] -> [1, 2, 2, 3, 4]
    """
    torch.manual_seed(42)
    # TODO(T31335): Match PyTorch's implementation so that we can test cases where
    #               input dims are not divisible by corresponding output dims

    dims = range(n_output_dims)
    # Each spatial input dim is exactly twice the requested output dim.
    spatial_shape = [2 * i + 4 for i in dims]
    output_size = [i + 2 for i in dims]

    input = torch.randn([1, 2] + spatial_shape)

    execute_and_check_wrapper(op(output_size), input)


# Padding

one_d_pads = [
    torch.nn.ReflectionPad1d,
    torch.nn.ReplicationPad1d,
    torch.nn.ConstantPad1d,
]


@pytest.mark.parametrize("op", one_d_pads)
def test_1D_pads(op):
    """Check 1D padding ops with symmetric and asymmetric padding."""
    torch.manual_seed(42)

    # One D case
    oneDTensor = torch.randn(1, 2, 4)

    # ConstantPad1d additionally takes the fill value.
    is_constant = op == torch.nn.ConstantPad1d

    # Pad evenly in both directions.
    model = op(2, 4.7) if is_constant else op(3)
    execute_and_check_wrapper(model, oneDTensor)

    # Pad unevenly in both directions.
    model = op((3, 2), 0.12456) if is_constant else op((3, 2))
    execute_and_check_wrapper(model, oneDTensor)


two_d_pads = [
    torch.nn.ReflectionPad2d, torch.nn.ReplicationPad2d,
    torch.nn.ConstantPad2d, torch.nn.ZeroPad2d
]


@pytest.mark.parametrize("op", two_d_pads)
def test_2D_pads(op):
    """Check 2D padding ops with symmetric and asymmetric padding."""
    # Seed the RNG like the other tests in this file so the random input
    # (and therefore any failure) is reproducible.
    torch.manual_seed(42)

    # 2D Case
    twoDTensor = torch.randn(1, 2, 4, 4)

    # Pad evenly in all directions.

    if op == torch.nn.ConstantPad2d:
        # ConstantPad2d additionally takes the fill value.
        model = op(6, 2.3)
    else:
        model = op(2)
    execute_and_check_wrapper(model, twoDTensor)

    # Pad unevenly in all directions.
    if op == torch.nn.ConstantPad2d:
        model = op((3, 2, 1, 5), 4.7)
    else:
        model = op((3, 2, 1, 3))

    execute_and_check_wrapper(model, twoDTensor)


three_d_pads = [torch.nn.ReplicationPad3d, torch.nn.ConstantPad3d]


@pytest.mark.parametrize("op", three_d_pads)
def test_3D_pads(op):
    """Check 3D padding ops with symmetric and asymmetric padding."""
    # Seed the RNG like the other tests in this file so the random input
    # (and therefore any failure) is reproducible.
    torch.manual_seed(42)

    # 3D Case
    threeDTensor = torch.randn(1, 2, 4, 4, 4)

    # Pad evenly in all directions.
    if op == torch.nn.ConstantPad3d:
        # ConstantPad3d additionally takes the fill value.
        model = op(2, 6.4)
    else:
        model = op(3)
    execute_and_check_wrapper(model, threeDTensor)

    # Pad unevenly in all directions.
    if op == torch.nn.ConstantPad3d:
        model = op((3, 2, 1, 5, 3, 4), 7.2)
    else:
        model = op((3, 2, 1, 5, 3, 4))
    execute_and_check_wrapper(model, threeDTensor)


def test_constant_pad_less_dims():
    """Check ``functional.pad`` when fewer dims are padded than the input has."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def forward(self, x):
            # Only pad the last dimension of input
            last_dim_padding = [1, 2]
            return torch.nn.functional.pad(x, last_dim_padding)

    execute_and_check_wrapper(Model(), torch.randn(1, 2, 3, 4))


def test_constant_pad_n_dims():
    """Check ``functional.pad`` when every dim of a 4D input is padded."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def forward(self, x):
            # Pad left/right dims by 1 and 2 respectively, for every dim:
            # [1, 2, 1, 2, 1, 2, 1, 2]
            padding = [1, 2] * 4
            return torch.nn.functional.pad(x, padding)

    execute_and_check_wrapper(Model(), torch.randn(1, 2, 3, 4))


================================================
FILE: tests/popdist_test.py
================================================
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import pytest

import poptorch


# pylint: disable=import-outside-toplevel
def test_blocked_options():
    """Check that blocked setters on popdist Options raise RuntimeError."""
    try:
        import popdist.poptorch
    except ModuleNotFoundError:
        pytest.skip(
            "Unable to import popdist: possibly a Python version mismatch?")
    opts = popdist.poptorch.Options(ipus_per_replica=2)

    # (name reported in the error message, call expected to be rejected).
    # The calls are wrapped in lambdas so that the attribute lookups also
    # happen inside the pytest.raises block.
    blocked_calls = [
        ("useIpuId", lambda: opts.useIpuId(1)),
        ("replicationFactor", lambda: opts.replicationFactor(1)),
        ("Distributed.disable", lambda: opts.Distributed.disable()),
        ("Distributed.setEnvVarNames",
         lambda: opts.Distributed.setEnvVarNames("A", "B")),
        ("Distributed.configureProcessId",
         lambda: opts.Distributed.configureProcessId(1)),
    ]

    for name, blocked_call in blocked_calls:
        pattern = rf"Cannot call `{name}` with popdist\.poptorch\.Options"
        with pytest.raises(RuntimeError, match=pattern):
            blocked_call()


# pylint: disable=import-outside-toplevel
def test_getters():
    """Check the Distributed getters of a freshly built popdist Options."""
    try:
        import popdist.poptorch
    except ModuleNotFoundError:
        pytest.skip(
            "Unable to import popdist: possibly a Python version mismatch?")

    distributed = popdist.poptorch.Options(ipus_per_replica=2).Distributed

    assert distributed.processId == 0
    assert distributed.numProcesses == 1


# pylint: disable=protected-access,import-outside-toplevel
@pytest.mark.ipuHardwareRequired
def test_to_dict():
    """Check that ``toDict()`` leaves the frozen state of Options unchanged."""
    try:
        import popdist.poptorch
    except ModuleNotFoundError:
        pytest.skip(
            "Unable to import popdist: possibly a Python version mismatch?")
    options = popdist.poptorch.Options(ipus_per_replica=2)
    options.outputMode(poptorch.enums.OutputMode.All)
    options.toDict()

    # Should not be frozen here
    options.checkIsFrozen()

    options._freeze()

    # Should unfreeze and freeze again
    options.toDict()

    frozen_error = r"Can't modify frozen Options"
    with pytest.raises(AttributeError, match=frozen_error):
        options.checkIsFrozen()


================================================
FILE: tests/poplar_executor_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import datetime
import unittest.mock
import os
import re
import tempfile
import glob
import warnings

import pytest
import torch
import torch.multiprocessing as mp
import helpers
import poptorch


@pytest.mark.ipuHardwareRequired
@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_ExecutableCaching(capfd):
    """Check that ``enableExecutableCaching`` stores an executable on disk."""
    class Model(torch.nn.Module):
        def forward(self, x):
            return x * 6

    def assert_engine_caching_logged():
        # A fresh LogChecker is built each time, as in a sequential check of
        # the output captured since the previous read.
        checker = helpers.LogChecker(capfd)
        checker.assert_contains("set enableEngineCaching to value true")

    with tempfile.TemporaryDirectory() as cache:
        opts = poptorch.Options()
        opts.enableExecutableCaching(cache)

        # First compilation: expected to populate the cache directory.
        first = poptorch.inferenceModel(Model(), opts)
        first.compile(torch.rand(2, 3))
        first.destroy()
        assert_engine_caching_logged()
        assert len(os.listdir(cache)) == 1, "No executable saved in the cache"

        # Second compilation with the same options.
        second = poptorch.inferenceModel(Model(), opts)
        second.compile(torch.rand(2, 3))
        assert_engine_caching_logged()


@pytest.mark.ipuHardwareRequired
@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
def test_ExecutableCaching_env(capfd):
    """Check that POPTORCH_CACHE_DIR enables executable caching.

    The environment variable is set through ``unittest.mock.patch.dict`` so
    that it is restored when the test ends: otherwise every later test in
    the same process would inherit a cache path pointing at a temporary
    directory which has already been deleted.
    """
    class Model(torch.nn.Module):
        def forward(self, x):
            return x * 6

    with tempfile.TemporaryDirectory() as cache, \
            unittest.mock.patch.dict(os.environ,
                                     {"POPTORCH_CACHE_DIR": cache}):
        opts = poptorch.Options()
        m = poptorch.inferenceModel(Model(), opts)
        m.compile(torch.rand(2, 3))
        m.destroy()
        log = helpers.LogChecker(capfd)
        log.assert_contains("set enableEngineCaching to value true")
        assert len(os.listdir(cache)) == 1, "No executable saved in the cache"

        n = poptorch.inferenceModel(Model(), opts)
        n.compile(torch.rand(2, 3))
        log = helpers.LogChecker(capfd)
        log.assert_contains("set enableEngineCaching to value true")


class Network(torch.nn.Module):
    """Minimal module returning the element-wise sum of its two inputs."""
    def forward(self, x, y):
        total = x + y
        return total


def _create_model_and_export(opts, filename):
    """Compile ``Network`` for inference, export it and check the file exists.

    ``opts`` are the PopTorch options used to wrap the model and ``filename``
    is the path handed to ``compileAndExport``.
    """
    inference_model = poptorch.inferenceModel(Network(), opts)

    # Example inputs used for the compilation.
    lhs, rhs = torch.ones(2), torch.zeros(2)
    inference_model.compileAndExport(filename, lhs, rhs)

    assert os.path.exists(filename)


@unittest.mock.patch.dict("os.environ", helpers.disableAllModels())
def test_offline_ipu_compileAndExport_file(filename=None):
    """Export a model compiled for an offline IPU target to a file.

    The incoming ``filename`` value is never read: the executable is always
    written inside a fresh temporary directory.
    """
    # Force-disable the IPU model
    opts = poptorch.Options().useOfflineIpuTarget()

    with tempfile.TemporaryDirectory() as workdir:
        filename = os.path.join(workdir, "model.poptorch")
        _create_model_and_export(opts, filename)


@pytest.mark.ipuHardwareRequired
def test_precompile_then_load():
    """Export for the local hardware version, reload and run the model."""
    hw_version = poptorch.ipuHardwareVersion()
    opts = poptorch.Options().useOfflineIpuTarget(hw_version)

    with tempfile.TemporaryDirectory() as workdir:
        exported = os.path.join(workdir, "model.poptorch")
        _create_model_and_export(opts, exported)

        poptorch_model = poptorch.load(exported)

        x = torch.tensor([1., 2.])
        y = torch.tensor([3., 4.])

        # Check the user model was restored
        native_result = poptorch_model.model(x, y)
        helpers.assert_allclose(actual=native_result,
                                expected=torch.tensor([4., 6.]))

        # Check the loaded executable gives the same answer.
        ipu_result = poptorch_model(x, y)
        helpers.assert_allclose(actual=ipu_result,
                                expected=torch.tensor([4., 6.]))


@unittest.mock.patch.dict("os.environ", helpers.disableAllModels())
def test_offline_ipu_compileAndExport_dir():
    """Export an offline-compiled executable into a directory.

    When ``compileAndExport`` is given a directory, exactly one file is
    expected to be created inside it.
    """
    class Network(torch.nn.Module):
        def forward(self, x, y):
            return x + y

    model = Network()
    # Force-disable the IPU model
    opts = poptorch.Options().useOfflineIpuTarget()

    # The model is wrapped once only: an earlier duplicate call whose result
    # was discarded has been removed.
    inference_model = poptorch.inferenceModel(model, opts)
    x = torch.ones(2)
    y = torch.zeros(2)

    with tempfile.TemporaryDirectory() as tmp:
        assert os.path.isdir(tmp)
        # Model is local to the function: it cannot be serialised so don't
        # export it.
        inference_model.compileAndExport(tmp, x, y, export_model=False)
        files = glob.glob(f"{tmp}/*")
        assert len(files) == 1, "Expected exactly 1 file"


def test_inference_attributes():
    """Check user attributes and methods are reachable on the wrapped model."""
    class Model(torch.nn.Module):
        def __init__(self, attr):
            super().__init__()
            self.attr = attr

        def getAttr(self):
            return self.attr

        def forward(self, x, y):
            return x + y + 5

    poptorch_model = poptorch.inferenceModel(Model("MyAttr"))

    # Run the model once before inspecting the attributes.
    poptorch_model(torch.tensor([1.]), torch.tensor([2.]))

    assert poptorch_model.getAttr() == poptorch_model.attr
    assert poptorch_model.attr == "MyAttr"


def test_training_attributes():
    """Check user attributes and methods are reachable on a training model."""
    def custom_loss(output, target):
        # Mean squared error with a scale
        diff = output - target
        return poptorch.identity_loss(diff * diff * 5, reduction="mean")

    class Model(torch.nn.Module):
        def __init__(self, attr):
            super().__init__()
            self.bias = torch.nn.Parameter(torch.zeros(()))
            self.attr = attr

        def getAttr(self):
            return self.attr

        def forward(self, x, target):
            shifted = x + 1
            out = poptorch.ipu_print_tensor(shifted) + self.bias
            return out, custom_loss(out, target)

    poptorch_model = poptorch.trainingModel(Model("MyAttr"))

    sample = torch.tensor([1.0, 2.0, 3.0])
    target = torch.tensor([30.0, 40.0, 50.0])

    # Run the model once before inspecting the attributes.
    poptorch_model(sample, target)

    assert poptorch_model.getAttr() == poptorch_model.attr
    assert poptorch_model.attr == "MyAttr"


@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("use_half", [False])
def test_explicit_destroy(use_half):
    """Check a destroyed training model rejects weight copies.

    A training and an inference wrapper of the same model are both pinned
    to IPU 1; the inference model is only run once the training model has
    been destroyed.
    """
    class ExampleModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bias = torch.nn.Parameter(torch.zeros(()))

        def forward(self, x):
            # It is important to make sure the result of the print is used.
            printed = poptorch.ipu_print_tensor(x + 1)
            return printed + self.bias

    def custom_loss(output, target):
        # Mean squared error with a scale
        diff = output - target
        return poptorch.identity_loss(diff * diff * 5, reduction="mean")

    class ExampleModelWithCustomLoss(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.model = ExampleModel()

        def forward(self, input, target=None):
            out = self.model(input)
            if target is None:
                return out
            return out, custom_loss(out, target)

    opts = poptorch.Options()
    # Both models will use the same IPU device.
    opts.useIpuId(1)

    model = ExampleModelWithCustomLoss()
    sample = torch.tensor([1.0, 2.0, 3.0])
    target = torch.tensor([30.0, 40.0, 50.0])
    if use_half:
        model.half()
        sample = sample.half()
        target = target.half()
    training_model = poptorch.trainingModel(model, opts)
    inference_model = poptorch.inferenceModel(model, opts)

    training_model(input=sample, target=target)
    training_model.destroy()

    # Both weight copy directions must fail once the model is destroyed.
    error_msg = r"Model has not been compiled or has been destroyed."
    for copy_method in ("copyWeightsToHost", "copyWeightsToDevice"):
        with pytest.raises(poptorch.Error, match=error_msg):
            getattr(training_model, copy_method)()

    inference_model(sample)


def _compile_model_offline(cache, pid, num_processes):
    """Compile a small training model for an offline target.

    Used as the target of the processes spawned by
    ``test_distributed_compile``: ``cache`` is the executable cache
    directory, ``pid`` and ``num_processes`` are forwarded to
    ``Distributed.configureProcessId``.
    """
    class ModelWithLoss(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(10, 10)
            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, data, target):
            out = self.linear(data)
            return out, self.loss(out, target)

    poptorch.setLogLevel("DEBUG")  # Force debug logging in worker process

    opts = poptorch.Options().useOfflineIpuTarget()
    opts.enableExecutableCaching(cache)
    # Disable compilation bar to avoid issues with capfd
    opts.showCompilationProgressBar(False)
    opts.deviceIterations(10)
    opts.Distributed.configureProcessId(pid, num_processes)

    poptorch_model = poptorch.trainingModel(ModelWithLoss(), options=opts)

    # 10 Batches of 10.
    data = torch.randn(10, 10)
    # 10 batches of 1
    label = torch.randint(0, 10, [1]).expand([10])

    poptorch_model.compile(data, label)


# Force-disable the IPU model
@unittest.mock.patch.dict("os.environ", helpers.disableAllModels())
@helpers.printCapfdOnExit
def test_distributed_compile(capfd):
    """Check that processes sharing a cache only compile the model once.

    Six processes are spawned against the same cache directory. Going through
    the captured log, the first lock/release pair is expected to span more
    than one second and every following pair less than one second.
    """
    num_processes = 6
    with tempfile.TemporaryDirectory() as tmp:
        cache = os.path.join(tmp, "poptorch_cache")

        ctx = mp.get_context('spawn')
        processes = []
        for pid in range(num_processes):
            worker = ctx.Process(target=_compile_model_offline,
                                 args=(cache, pid, num_processes))
            processes.append(worker)

        for worker in processes:
            worker.start()
        for worker in processes:
            worker.join()

    def getTimestamp(line):
        # Log lines are expected to start with "[HH:MM:SS.ffffff]".
        m = re.match(r"\[([\d:.]+)\]", line)
        return datetime.datetime.strptime(m.group(1), "%H:%M:%S.%f")

    one_second = datetime.timedelta(seconds=1)
    log = helpers.LogChecker(capfd).createIterator()

    # One lock / release pair is expected per process.
    for index, _ in enumerate(processes):
        start = getTimestamp(log.findNext("cache file locked"))
        end = getTimestamp(log.findNext("released the cache lock"))
        elapsed = end - start

        if index == 0:
            # Only the first pair includes the compilation.
            assert elapsed > one_second, (
                "Expected the first process model compilation to take "
                f"more than 1 second but it took {elapsed}")
        else:
            assert elapsed < one_second, (
                "Expected processes to load the executable from the "
                f"cache in under 1 second but it took {elapsed}")


def test_cpu_output():
    """Check the only warnings raised for CPU constant outputs are expected.

    Every warning recorded while compiling must mention that an output
    was expected on the IPU but is on the CPU.
    """
    const1 = torch.tensor([1, 2])
    const2 = torch.tensor([3, 4])

    class Model(torch.nn.Module):
        def forward(self):
            nested = ([const1, const2], [const1, const2])
            return (const1 + const2, nested, const2)

    with warnings.catch_warnings(record=True) as filtered_warnings:
        poptorch.inferenceModel(Model()).compile()

    # De-duplicate the recorded messages.
    pop_warns = {str(w.message) for w in filtered_warnings}

    expected_warning = "Output expected to be on the IPU but is on cpu"

    for message in pop_warns:
        assert expected_warning in message, (
            "Compilation generated unexpected "
            f"warning.\nActual warning: {message}")


@pytest.mark.ipuHardwareRequired
def test_get_cycles_error_msgs():
    """Check the errors raised by ``cycleCount()`` before it is available."""
    class Model(torch.nn.Module):
        def forward(self, x, y):
            return x + y

    def assert_cycle_count_fails(model, message):
        with pytest.raises(poptorch.Error, match=message):
            model.cycleCount()

    # Cycle count logging has not been enabled.
    disabled_msg = (r"Cycle count logging is disabled. Please set option "
                    r"logCycleCount to True to enable.")
    assert_cycle_count_fails(poptorch.inferenceModel(Model()), disabled_msg)

    opts = poptorch.Options()
    opts.logCycleCount(True)

    inference_model = poptorch.inferenceModel(Model(), options=opts)

    not_run_msg = (r"Please run the model at least once before obtaining "
                   r"cycle count.")

    # Logging enabled but the model is neither compiled nor run.
    assert_cycle_count_fails(inference_model, not_run_msg)

    # Compiling alone is not enough either.
    inference_model.compile(torch.Tensor([1.0]), torch.Tensor([2.0]))
    assert_cycle_count_fails(inference_model, not_run_msg)

    inference_model(torch.Tensor([3.0]), torch.Tensor([4.0]))
    assert inference_model.cycleCount() > 0


@pytest.mark.skipif(poptorch.ipuHardwareIsAvailable(),
                    reason="Test error message when no hardware")
def test_get_cycles_no_hw():
    """Check running with ``logCycleCount`` enabled fails without hardware."""
    class Model(torch.nn.Module):
        def forward(self, x, y):
            return x + y

    # Only the model built with cycle count logging enabled is needed: an
    # earlier default-options model which was overwritten before being used
    # has been removed.
    opts = poptorch.Options()
    opts.logCycleCount(True)

    inference_model = poptorch.inferenceModel(Model(), options=opts)

    error_msg = (
        r"Cycle count logging is only supported on actual IPU hardware.")
    with pytest.raises(poptorch.Error, match=error_msg):
        inference_model(torch.Tensor([3.0]), torch.Tensor([4.0]))


def test_get_compilation_time():
    """Check ``compilationTime()`` errors and its value once compiled.

    Two models are built: one with the compilation progress bar disabled
    and one with it enabled. Both must refuse to report a compilation time
    before being compiled; afterwards only the one with the progress bar
    enabled can report it.
    """
    class Model(torch.nn.Module):
        def forward(self, x, y):
            return x + y

    no_compilation_time_opts = poptorch.Options()
    no_compilation_time_opts.showCompilationProgressBar(False)
    no_compilation_time_model = poptorch.inferenceModel(
        Model(), options=no_compilation_time_opts)

    compilation_time_opts = poptorch.Options()
    compilation_time_opts.showCompilationProgressBar(True)
    compilation_time_model = poptorch.inferenceModel(
        Model(), options=compilation_time_opts)

    error_msg = (
        r"Please compile the model before obtaining compilation time.")

    with pytest.raises(poptorch.Error, match=error_msg):
        no_compilation_time_model.compilationTime()

    with pytest.raises(poptorch.Error, match=error_msg):
        compilation_time_model.compilationTime()

    error_msg = (
        r"Please set showCompilationProgressBar option to obtain compilation "
        r"time.")

    # Run the model outside of the pytest.raises block: only the
    # compilationTime() query is expected to fail, and it must actually be
    # reached.
    no_compilation_time_model(torch.Tensor([3.0]), torch.Tensor([4.0]))
    with pytest.raises(poptorch.Error, match=error_msg):
        no_compilation_time_model.compilationTime()

    compilation_time_model(torch.Tensor([3.0]), torch.Tensor([4.0]))
    compilation_time = compilation_time_model.compilationTime()

    assert compilation_time > datetime.timedelta(seconds=1)


@pytest.mark.parametrize("rewrap_executor", [True, False])
def test_rewrap_model(rewrap_executor):
    """Check a model can be destroyed and wrapped again part way through.

    A reference bias is recorded after one run of 10 device iterations. The
    weights are then reset and the same amount of work is done as two runs
    of 5 device iterations, with a destroy / re-wrap in between, and the
    final bias must match the reference.

    ``rewrap_executor`` selects what is destroyed and re-wrapped between the
    two halves: the PopTorch wrapper itself (True) or the user model (False).
    """
    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = torch.nn.Linear(1, 1)
            self.loss = torch.nn.L1Loss()

        def forward(self, x):
            y = self.fc(x)
            loss = self.loss(y, x + 1)

            return loss

    model = Model()

    # Normal running
    torch.nn.init.ones_(model.fc.weight)
    torch.nn.init.zeros_(model.fc.bias)

    opts = poptorch.Options()
    opts.deviceIterations(10)
    poptorch_model = poptorch.trainingModel(model, options=opts)

    poptorch_model(torch.ones([10]))

    # Reference value after the single 10 iteration run.
    # NOTE(review): the "1000" in the name does not match the 10 device
    # iterations configured above - presumably historical.
    bias_after_1000 = float(model.fc.bias)

    # Try rewrapping model half way
    torch.nn.init.ones_(model.fc.weight)
    torch.nn.init.zeros_(model.fc.bias)

    # Sanity check: after the reset the bias must differ from the reference.
    with pytest.raises(AssertionError):
        helpers.assert_allclose(actual=model.fc.bias, expected=bias_after_1000)

    # Unwrap the user model so it can be wrapped again with new options.
    model.destroy()

    opts = poptorch.Options()
    opts.deviceIterations(5)
    poptorch_model = poptorch.trainingModel(model, options=opts)

    # First half: 5 iterations.
    poptorch_model(torch.ones([5]))

    # Wrapping again without destroying first must be rejected.
    err_msg = (r"Model has already been wrapped in 'poptorch.trainingModel'."
               r" Call model.destroy\(\) on the model to unwrap before "
               "wrapping again.")
    with pytest.raises(RuntimeError, match=err_msg):
        poptorch_model = poptorch.trainingModel(model, options=opts)

    # re-wrap for test
    if rewrap_executor:
        # Destroy and re-wrap the PopTorch wrapper itself.
        poptorch_model.destroy()
        poptorch_model = poptorch.trainingModel(poptorch_model, options=opts)
    else:
        # Destroy and re-wrap the underlying user model.
        model.destroy()
        poptorch_model = poptorch.trainingModel(model, options=opts)

    # Second half: 5 more iterations, then compare with the reference.
    poptorch_model(torch.ones([5]))
    helpers.assert_allclose(actual=float(model.fc.bias),
                            expected=bias_after_1000)


================================================
FILE: tests/precompilation_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
import os
import re
import marshal
import subprocess
import sys
import json
import pathlib
import tempfile
import unittest.mock

import pytest
import torch
import helpers
import poptorch


class ExampleModelWithLoss(torch.nn.Module):
    """Linear classifier which also returns its loss when in training mode."""
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(10, 10)
        self.loss = torch.nn.CrossEntropyLoss()

    def forward(self, x, target=None):
        """Return ``fc(x)``, paired with the loss when ``self.training``."""
        logits = self.fc(x)
        if not self.training:
            return logits
        return logits, self.loss(logits, target)


def _createExampleModel(training, offline_target=False):
    """Build a seeded ``ExampleModelWithLoss`` wrapped by PopTorch.

    ``training`` selects between a training wrapper (model in train mode)
    and an inference wrapper (model in eval mode). When ``offline_target``
    is set the options target the local hardware version offline.
    """
    torch.manual_seed(42)
    model = ExampleModelWithLoss()

    opts = poptorch.Options()
    if offline_target:
        opts.useOfflineIpuTarget(poptorch.ipuHardwareVersion())

    if not training:
        model.eval()
        return poptorch.inferenceModel(model, opts)

    model.train()
    return poptorch.trainingModel(model, opts)


def _compileAndExport(filename, export_model=True, training=True):
    """Compile the example model offline and export it to ``filename``.

    Returns the ``(input, target)`` tensors that were generated; ``target``
    is only passed to the compilation when ``training`` is true.
    """
    poptorch_model = _createExampleModel(training, True)

    input = torch.randn(1, 10)
    target = torch.randint(0, 10, [1])

    # The inference model takes no target.
    example_args = (input, target) if training else (input, )
    poptorch_model.compileAndExport(filename,
                                    *example_args,
                                    export_model=export_model)
    poptorch_model.destroy()
    return input, target


@pytest.mark.ipuHardwareRequired
def test_export_then_load_live_model():
    """Save a model which has already run, then reload it and run again."""
    with tempfile.TemporaryDirectory() as workdir:
        saved_path = os.path.join(workdir, "model.poptorch")
        live_model = _createExampleModel(training=False)

        sample = torch.randn(1, 10)
        # Running the model will trigger the executable compilation
        live_model(sample)
        # Save the executable and destroy the model
        live_model.save(saved_path)
        live_model.destroy()

        # Reload the model from file and run it.
        reloaded_model = poptorch.load(saved_path)
        reloaded_model(sample)


@pytest.mark.ipuHardwareRequired
def test_export_then_load():
    """Export a precompiled training model, then load and run it."""
    with tempfile.TemporaryDirectory() as workdir:
        exported = os.path.join(workdir, "model.poptorch")
        sample, target = _compileAndExport(exported)

        loaded_model = poptorch.load(exported)
        loaded_model(sample, target)


@pytest.mark.ipuHardwareRequired
def test_export_then_load_setIpu():
    """Load an exported model while editing its options to pick the IPU."""
    def setIpuDevice(opts):
        opts.useIpuId(1)  # always use IPU 1

    with tempfile.TemporaryDirectory() as workdir:
        exported = os.path.join(workdir, "model.poptorch")
        sample, target = _compileAndExport(exported)

        loaded_model = poptorch.load(exported, edit_opts_fn=setIpuDevice)
        loaded_model(sample, target)


@pytest.mark.ipuHardwareRequired
def test_export_no_python_then_load():
    """Load an executable exported without the Python model into a new wrapper."""
    with tempfile.TemporaryDirectory() as workdir:
        exe_path = os.path.join(workdir, "model.poptorch")
        sample, target = _compileAndExport(exe_path, export_model=False)

        # load_exe_start
        # The Python model was not exported, so it is rebuilt here.
        poptorch_model = poptorch.trainingModel(ExampleModelWithLoss(),
                                                poptorch.Options())
        poptorch_model.loadExecutable(exe_path)

        poptorch_model(sample, target)


@pytest.mark.ipuHardwareRequired
def test_export_train_validate_no_python():
    """Train and validate using executables exported without the model.

    The training and validation wrappers share one Python model, so the
    validation wrapper is expected to give the trained prediction.
    """
    def prediction(output):
        return torch.argmax(output, dim=1)

    with tempfile.TemporaryDirectory() as workdir:
        train_exe = os.path.join(workdir, "train.poptorch")
        valid_exe = os.path.join(workdir, "valid.poptorch")
        sample, target = _compileAndExport(train_exe, export_model=False)
        _compileAndExport(valid_exe, export_model=False, training=False)

        model = ExampleModelWithLoss()
        options = poptorch.Options()
        training_model = poptorch.trainingModel(model, options=options)
        training_model.loadExecutable(train_exe)

        model.eval()
        validation_model = poptorch.inferenceModel(model, options)
        validation_model.loadExecutable(valid_exe)

        # Make sure the first run doesn't already pass the test.
        out, original_loss = training_model(sample, target)
        assert prediction(out) != target

        out = validation_model(sample)
        assert prediction(out) != target

        for _ in range(500):
            out, loss = training_model(sample, target)

        # Check we have trained the model
        assert loss < original_loss
        assert loss < 0.05
        assert prediction(out) == target

        # Check validation model has the weights
        out = validation_model(sample)
        assert prediction(out) == target


@pytest.mark.ipuHardwareRequired
def test_export_train_validate():
    """Train a loaded model and validate it with a second loaded executable.

    The validation wrapper is built from the loaded training model and is
    expected to give the trained prediction.
    """
    with tempfile.TemporaryDirectory() as workdir:
        train_path = os.path.join(workdir, "train.poptorch")
        valid_path = os.path.join(workdir, "valid.poptorch")
        sample, target = _compileAndExport(train_path)
        _compileAndExport(valid_path, training=False)

        training_model = poptorch.load(train_path)
        validation_model = poptorch.inferenceModel(training_model,
                                                   poptorch.Options())
        validation_model.model.eval()
        validation_model.loadExecutable(valid_path)

        # Make sure the first run doesn't already pass the test.
        train_out, original_loss = training_model(sample, target)
        assert torch.argmax(train_out, dim=1) != target

        valid_out = validation_model(sample)
        assert torch.argmax(valid_out, dim=1) != target

        for _ in range(500):
            train_out, loss = training_model(sample, target)

        # Check we have trained the model
        assert loss < original_loss
        assert loss < 0.05
        assert torch.argmax(train_out, dim=1) == target

        # Check validation model has the weights
        valid_out = validation_model(sample)
        assert torch.argmax(valid_out, dim=1) == target


@pytest.mark.ipuHardwareRequired
def test_export_train_save_validate():
    """Train a loaded model, then export and reload a validation model.

    The validation model is compiled and exported only after the training
    has happened, and the reloaded model is expected to give the trained
    prediction.
    """
    with tempfile.TemporaryDirectory() as workdir:
        train_path = os.path.join(workdir, "train.poptorch")
        valid_path = os.path.join(workdir, "valid.poptorch")
        sample, target = _compileAndExport(train_path)

        training_model = poptorch.load(train_path)

        offline_opts = poptorch.Options()
        offline_opts.useOfflineIpuTarget(poptorch.ipuHardwareVersion())
        validation_model = poptorch.inferenceModel(training_model,
                                                   offline_opts)
        validation_model.model.eval()

        # Make sure the first run doesn't already pass the test.
        out, original_loss = training_model(sample, target)
        assert torch.argmax(out, dim=1) != target

        # Now train the model
        for _ in range(500):
            out, loss = training_model(sample, target)

        # Check we have trained the model
        assert loss < original_loss
        assert loss < 0.05
        assert torch.argmax(out, dim=1) == target

        # Export the validation model after training, then reload it.
        validation_model.compileAndExport(valid_path, sample)
        validation_model = poptorch.load(valid_path)

        # Check validation model has the weights
        out = validation_model(sample)
        assert torch.argmax(out, dim=1) == target


@pytest.mark.ipuHardwareRequired
def test_export_train_save_train():
    """Save trained weights, reload the executable and restore the weights.

    After training, the state dict is written with ``torch.save``, the
    model is destroyed, the exported executable is loaded again and the
    saved state dict is applied to it. A single further step must still
    show the trained behaviour.
    """

    def predicted_class(logits):
        return torch.argmax(logits, dim=1)

    with tempfile.TemporaryDirectory() as export_dir:
        train_path = os.path.join(export_dir, "train.poptorch")
        weights_path = os.path.join(export_dir, "weights.poptorch")
        sample, label = _compileAndExport(train_path)

        training_model = poptorch.load(train_path)

        # The untrained model should not already predict the target.
        out, original_loss = training_model(sample, label)
        assert predicted_class(out) != label

        # Training phase
        for _ in range(500):
            out, loss = training_model(sample, label)

        # The training run must have converged
        assert loss < original_loss
        assert loss < 0.05
        assert predicted_class(out) == label

        # Persist the weights and drop the compiled model.
        torch.save(training_model.model.state_dict(), weights_path)
        training_model.destroy()

        # Start again from the exported executable plus the saved weights.
        training_model = poptorch.load(train_path)
        training_model.load_state_dict(torch.load(weights_path))

        # The restored model must behave like the trained one
        out, loss = training_model(sample, label)
        assert loss < original_loss
        assert loss < 0.05
        assert predicted_class(out) == label


@pytest.mark.ipuHardwareRequired
def test_export_train_save_validate_load_weights():
    """Move trained weights into a separately exported inference model.

    Both executables are exported before training. Once trained, the whole
    wrapped model is saved with ``torch.save``; the inference executable is
    then loaded and given the state dict of the saved model.
    """

    def predicted_class(logits):
        return torch.argmax(logits, dim=1)

    with tempfile.TemporaryDirectory() as export_dir:
        train_path = os.path.join(export_dir, "train.poptorch")
        valid_path = os.path.join(export_dir, "valid.poptorch")
        weights_path = os.path.join(export_dir, "weights.poptorch")
        _compileAndExport(valid_path, training=False)
        sample, label = _compileAndExport(train_path)

        training_model = poptorch.load(train_path)

        # The untrained model should not already predict the target.
        out, original_loss = training_model(sample, label)
        assert predicted_class(out) != label

        # Training phase
        for _ in range(500):
            out, loss = training_model(sample, label)

        # The training run must have converged
        assert loss < original_loss
        assert loss < 0.05
        assert predicted_class(out) == label

        # Save the full model object (not just its state dict).
        torch.save(training_model.model, weights_path)
        training_model.destroy()

        validation_model = poptorch.load(valid_path)
        trained_state = torch.load(weights_path).state_dict()
        validation_model.load_state_dict(trained_state)

        # The validation model must now carry the trained weights
        out = validation_model(sample)
        assert predicted_class(out) == label


def process_to_generate_profiling_data():
    """A function executed as a script running in a separate process.
    We need to do this because profiling data is only written to disk
    when a process exits.

    The function builds a small LSTM-based model and compiles it for
    training against an offline IPU target; it does not run the model.

    NOTE: test_pva_annotations locates the layer call sites of this function
    by searching this file's text and comparing line numbers, so be careful
    when adding code or comments that repeat those call expressions.
    """
    # The imports live inside the function because its code object is what
    # gets shipped to the child process by test_pva_annotations.
    # pylint: disable=import-outside-toplevel
    # pylint: disable=reimported
    import poptorch
    import torch

    # Inner block: LSTM followed by a log-softmax over dimension 1.
    class Block(torch.nn.Module):
        def __init__(self, num_hidden):
            super().__init__()
            self.softmax = torch.nn.LogSoftmax(1)
            self.lstm = torch.nn.LSTM(3, num_hidden)

        def forward(self, x):
            x, _ = self.lstm(x)
            x = self.softmax(x)
            return x

    # Outer model: the block, a ReLU, and a sum-of-squares identity loss.
    class Model(torch.nn.Module):
        def __init__(self, num_hidden):
            super().__init__()
            self.relu = torch.nn.ReLU()
            self.block0 = Block(num_hidden)

        def forward(self, x):
            x = self.block0(x)
            x = self.relu(x)
            loss = poptorch.identity_loss(x**2, reduction='sum')
            return x, loss

    input = torch.randn(1, 1, 3)
    model = Model(3)

    # Learning rate of zero: presumably the weight values are irrelevant as
    # the model is only compiled here -- TODO confirm.
    optimizer = poptorch.optim.SGD(model.parameters(), lr=0.0)

    opts = poptorch.Options()
    opts.useOfflineIpuTarget()
    training_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    # Compile only; the model is never executed.
    training_model.compile(input)


@unittest.mock.patch.dict(
    "os.environ", {
        **helpers.disableAllModels(), "POPLAR_ENGINE_OPTIONS":
        json.dumps({
            "autoReport.directory": ".",
            "autoReport.all": "true",
            "autoReport.outputDebugInfo": "true",
            "autoReport.outputExecutionProfile": "false"
        })
    })
@pytest.mark.mlirSupportRequired
# pylint: disable=import-outside-toplevel
def test_pva_annotations():
    """Check the debug annotations attached to the ops of a profiled model.

    The model is compiled in a child process (profiling data is only written
    to disk when a process exits) with the auto-report options patched into
    ``POPLAR_ENGINE_OPTIONS``. The generated report is then opened with
    ``pva`` and, for every named operation, the op name, the "poptorch"
    layer and the source file / line number it is attributed to are checked.

    The test is skipped if ``pva`` raises a ``RuntimeError`` on import.
    """
    try:
        import pva
    except RuntimeError:
        pytest.skip(
            "Unable to import pva: possibly a Python version mismatch?")

    def findPoptorchLayer(op):
        # Walk up the chain of first parents until the debug context whose
        # layer is "poptorch" is found.
        layer = json.loads(op.layer)["layer"]
        if layer == "poptorch":
            return op
        assert op.parents, "Can't find 'poptorch' layer"
        return findPoptorchLayer(op.parents[0])

    # Remember where we started: the temporary directory is deleted when the
    # "with" block exits, so the process must not be left inside it
    # (otherwise later tests would run from a directory which no longer
    # exists).
    original_cwd = os.getcwd()
    with tempfile.TemporaryDirectory() as tmp:
        # The report directory is "." so the child process (which inherits
        # the working directory) writes its report into the temporary one.
        os.chdir(tmp)
        try:
            subprocess.check_output(
                [
                    sys.executable,
                    "-u",  # needed to ensure messages are sent to stdout immediately
                    "-c",
                    f"""
import os, marshal, types;code = marshal.loads({marshal.dumps(process_to_generate_profiling_data.__code__)})
fn = types.FunctionType(code, globals(), "generate_profiling_data")
fn()
        """
                ],
                universal_newlines=True,
                env=os.environ)

            debug = pathlib.Path("debug.cbor").resolve(strict=True)
            profile = pathlib.Path("training",
                                   "profile.pop").resolve(strict=True)

            # Read this file and find where the layers were called from to
            # make sure the line numbers are correct inside the profiling
            # information.
            with open(__file__, "r") as this_file:
                it = helpers.LogIterator(this_file.read().split("\n"))
            lines = []
            it.findNext(
                re.escape("def process_to_generate_profiling_data():"))
            for e in [
                    "self.lstm(", "self.softmax(", "self.relu(",
                    "identity_loss("
            ]:
                it.findNext(re.escape(e))
                lines.append(it.lineNumber())

            report = pva.openReport(str(profile), str(debug))
            op_analysis = pva.OperationAnalysis(report)
            for op in op_analysis.operations:
                if not op.name or op.name == "Call":
                    continue
                if op.replacedDebugContext:
                    ctx = op.replacedDebugContext[0]
                else:
                    ctx = op.debugContext
                pop_op = findPoptorchLayer(ctx)
                data = json.loads(pop_op.json)
                op_file = pop_op.location.fileName
                op_line = pop_op.location.lineNumber
                print(f"Name {op.name} {op_file}:{op_line} Debug {data}")

                # All the ops should be associated to this file
                assert os.path.realpath(op_file) == os.path.realpath(__file__)

                assert op.name == data["op_name"]

                # The identity loss is not a layer in the model therefore it
                # won't have a prefix.
                if data["op_type"] in ["Pow", "Identityloss"]:
                    assert data["op_name"] == data["op_type"]
                else:
                    # All the other ops are stored in the model therefore
                    # they'll have prefix "foo/op_type" where "foo" is the
                    # name of the attribute in the model.
                    assert data["op_name"].endswith("/" + data["op_type"])
                assert data["layer"] == "poptorch"

                if data["op_name"].startswith("block0/lstm"):
                    assert op_line == lines[0]
                elif data["op_name"].startswith("block0/softmax"):
                    assert op_line == lines[1]
                elif data["op_name"].startswith("relu/"):
                    assert op_line == lines[2]
                elif data["op_name"] == data[
                        "op_type"]:  # identity_loss(x**2)
                    assert op_line == lines[3]
                else:
                    raise ValueError("Unexpected op " + data["op_name"])
        finally:
            # Always leave the temporary directory before it is removed.
            os.chdir(original_cwd)


================================================
FILE: tests/pyg_torch_scatter_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.

# Tests for PyG torch_scatter ops integration with PopTorch
from functools import partial
import torch
import pytest
import helpers
import poptorch

# torch_scatter is only imported when the tests are actually being run;
# otherwise same-named no-op placeholders keep the module importable (the
# names are referenced by the parametrize decorators below).
if not helpers.is_running_tests:

    def scatter():
        pass

    def scatter_add():
        pass

    def scatter_max():
        pass

    def scatter_min():
        pass

    def scatter_mul():
        pass

    def scatter_std():
        pass

    def scatter_softmax():
        pass

    def scatter_log_softmax():
        pass

    def scatter_logsumexp():
        pass

else:
    from torch_scatter import (scatter, scatter_add, scatter_max, scatter_min,
                               scatter_mul, scatter_std, scatter_softmax,
                               scatter_log_softmax, scatter_logsumexp)


def torch_scatter_harness(func, src, index, out=None):
    """Run a torch_scatter op natively and on the IPU and compare results.

    ``func`` is called with ``src``, ``index`` and a ``dim_size`` derived
    from the largest index. When ``out`` is given, the native call works on
    a clone of it while the IPU call receives ``out`` itself, and both the
    returned values and the two output tensors are compared.
    """
    dim_size = int(index.max()) + 1

    class Model(torch.nn.Module):
        def forward(self, src, index, out=None):
            if out is not None:
                return func(src, index, out=out, dim_size=dim_size)
            return func(src, index, dim_size=dim_size)

    poptorch_model = poptorch.inferenceModel(Model())

    if out is None:
        cpu_result = func(src, index, dim_size=dim_size)
        ipu_result = poptorch_model(src, index)
        helpers.assert_allclose(actual=ipu_result, expected=cpu_result)
    else:
        # The native op gets its own copy so "out" is only touched by the
        # IPU run.
        cpu_out_buffer = out.clone()
        cpu_result = func(src,
                          index,
                          out=cpu_out_buffer,
                          dim_size=dim_size)
        ipu_result = poptorch_model(src, index, out=out)
        helpers.assert_allclose(actual=ipu_result, expected=cpu_result)
        helpers.assert_allclose(actual=out, expected=cpu_out_buffer)

    poptorch_model.destroy()


@pytest.mark.parametrize("reduce", ['sum', 'mean', 'max', 'min', 'mul'])
def test_scatter(reduce):
    """Check the generic scatter op for each supported reduction."""
    values = torch.tensor([1, 3, 2, 4, 5, 6]).float()
    targets = torch.tensor([0, 1, 0, 1, 1, 3]).long()
    torch_scatter_harness(partial(scatter, reduce=reduce), values, targets)


@pytest.mark.parametrize(
    "func",
    [scatter_log_softmax, scatter_logsumexp, scatter_softmax, scatter_std])
def test_composites(func):
    """Check the composite scatter ops against their native results."""
    values = torch.tensor([1, 3, 2, 4, 5, 6]).float()
    targets = torch.tensor([0, 1, 0, 1, 5, 3]).long()
    torch_scatter_harness(func, values, targets)


@pytest.mark.parametrize("func", [scatter_max, scatter_min, scatter_mul])
def test_scatter_inplace(func):
    """Check scatter ops which are given an explicit ``out`` tensor."""
    values = torch.tensor([1, 3, 2, 4, 5, 6]).float()
    targets = torch.tensor([0, 1, 4, 2, 3, 5]).long()
    destination = torch.tensor([10, 1, 11, 1, 23, 1]).float()
    torch_scatter_harness(func, values, targets, destination)


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("TRACE")
def test_scatter_add_zeros_optimized(capfd):
    """Check the trace log reports the zeros output of scatter_add removed."""
    values = torch.tensor([1, 3, 2, 4, 5, 6]).float()
    targets = torch.tensor([0, 1, 0, 1, 1, 3]).long()
    torch_scatter_harness(scatter_add, values, targets)

    log = helpers.LogChecker(capfd).createIterator()
    log.findNext("Removing zeros output to scatter_add")


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("TRACE")
def test_scatter_add_nd_expand_removed(capfd):
    """Check the trace log reports the index expansion node being removed
    when scatter_add is applied along dim 1 of a 3D source."""
    torch.manual_seed(0)
    values = torch.randn(10, 6, 16)
    targets = torch.tensor([0, 1, 0, 1, 1, 3]).long()
    torch_scatter_harness(partial(scatter_add, dim=1), values, targets)

    log = helpers.LogChecker(capfd).createIterator()
    log.findNext("Removing index expansion node:")


@pytest.mark.parametrize("shape", [(5, ), (2, 5), (2, 5, 5)])
@pytest.mark.parametrize("func", [scatter_max, scatter_min, scatter_mul])
def test_scatter_overloads(shape, func):
    """Check scatter max / min / mul on 1D, 2D and 3D random inputs."""
    torch.manual_seed(0)
    values = torch.rand(shape)
    # Indices are drawn from {0, 1, 2} with the same shape as the values.
    targets = torch.randint(3, shape)

    torch_scatter_harness(func, values, targets)


================================================
FILE: tests/random_sampling_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import torch
import pytest
import helpers
import poptorch


def int_mean(x):
    """Return the mean of ``x`` after converting it to ``torch.float``."""
    as_float = x.to(torch.float)
    return as_float.mean()


def int_var(x):
    """Return the variance of ``x`` after converting it to ``torch.float``."""
    as_float = x.to(torch.float)
    return as_float.var()


# Random Number Generation Harness
# Checks that the IPU generated data with roughly the same summary
# statistics as the CPU version.
def rng_harness(rng_op, input, stat_funs, expected_dtype=torch.float):
    """Run ``rng_op`` on the IPU and natively, then compare statistics.

    The IPU output must have ``expected_dtype`` and the same size as the
    native output. Each function in ``stat_funs`` is applied to both outputs
    and the two values must agree within ``atol=1e-2`` / ``rtol=0.1``. For
    ``torch.half`` the model, input and IPU output are promoted to float
    before the native run and the comparison.
    """

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.rng_op = rng_op

        def forward(self, x):
            torch.manual_seed(42)
            # Work on a fresh tensor so the input is not modified in place
            fresh = x + 0
            return self.rng_op(fresh)

    cpu_model = Model()

    # IPU run: only the type of the result is checked at this point
    ipu_opts = poptorch.Options().randomSeed(8)
    ipu_model = poptorch.inferenceModel(cpu_model, ipu_opts)
    ipu_result = ipu_model(input)
    assert ipu_result.dtype == expected_dtype

    if expected_dtype is torch.half:
        # Promote the CPU model, its input and the IPU result to float so
        # the summary statistics can be compared
        cpu_model = cpu_model.float()
        input = input.float()
        ipu_result = ipu_result.float()

    cpu_result = cpu_model(input)
    assert cpu_result.size() == ipu_result.size()

    # The PRNG depends on the HW implementation, so only the consistency of
    # the distribution statistics is checked
    print("Checking summary statistics for generated random numbers:")
    for stat in stat_funs:
        ipu_stat = stat(ipu_result)
        cpu_stat = stat(cpu_result)
        print("  {} = poptorch {}, native {}".format(stat.__name__, ipu_stat,
                                                     cpu_stat),
              flush=True)
        helpers.assert_allclose(expected=cpu_stat,
                                actual=ipu_stat,
                                atol=1e-2,
                                rtol=0.1)


# torch.rand
@pytest.mark.ipuHardwareRequired
def test_rand():
    """Compare min / max / mean / variance of torch.rand samples."""

    def sample_rand(x):
        return torch.rand(x.size())

    shape_like = torch.empty(size=(3, 5, 100))
    rng_harness(sample_rand, shape_like,
                [torch.min, torch.max, torch.mean, torch.var])


# torch.distributions.Uniform
@pytest.mark.ipuHardwareRequired
def test_distributions_uniform():
    """Compare statistics of samples from Uniform(0, 10)."""

    def sample_uniform(x):
        dist = torch.distributions.Uniform(0.0, 10.0)
        return dist.sample(x.size())

    shape_like = torch.empty(10, 10, 1000)
    rng_harness(sample_uniform, shape_like,
                [torch.min, torch.max, torch.mean, torch.var])


# torch.uniform_
@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("dt", [torch.float, torch.half])
def test_uniform_(dt):
    """Compare statistics of in-place uniform_ for float and half."""

    def fill_uniform(x):
        return x.uniform_()

    buffer = torch.empty(size=(3, 4, 1000), dtype=dt)
    rng_harness(fill_uniform,
                buffer, [torch.min, torch.max, torch.mean, torch.var],
                expected_dtype=dt)


# torch.normal
@pytest.mark.ipuHardwareRequired
def test_normal():
    """Compare mean / variance of torch.normal with scalar mean and std."""

    def sample_normal(x):
        return torch.normal(mean=0.0, std=1.0, size=x.size())

    shape_like = torch.empty(6, 10, 1000)
    rng_harness(sample_normal, shape_like, [torch.mean, torch.var])


# torch.normal_
@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("dt", [torch.float, torch.half])
def test_normal_(dt):
    """Compare mean / variance of in-place normal_ for float and half."""

    def fill_normal(x):
        return x.normal_(mean=1.0, std=2.0)

    buffer = torch.empty(size=(3, 5, 1000), dtype=dt)
    rng_harness(fill_normal,
                buffer, [torch.mean, torch.var],
                expected_dtype=dt)


# torch.normal with buffers and params
@pytest.mark.ipuHardwareRequired
def test_normal_buffers():
    """Check torch.normal with a buffer as mean and a parameter as std.

    Only the dtype and the size of the IPU output are checked.
    """

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer("mean", torch.Tensor([1.0, 2.0, 3.0]))
            self.register_parameter(
                "std", torch.nn.Parameter(torch.Tensor([0.5, 1.0, 1.5])))

        def forward(self, x):
            torch.manual_seed(42)
            from_buffer = torch.normal(self.mean, 0.5)
            from_param = torch.normal(1.0, self.std)
            return from_buffer + from_param + x

    cpu_model = Model()

    # IPU run: the result must be a float tensor
    ipu_opts = poptorch.Options().randomSeed(8)
    ipu_model = poptorch.inferenceModel(cpu_model, ipu_opts)
    ipu_result = ipu_model(torch.tensor([0.0, 0.0, 0.0]))
    assert ipu_result.dtype == torch.float

    # Native run: only the sizes are compared
    cpu_result = cpu_model(torch.tensor([0.0, 0.0, 0.0]))
    assert cpu_result.size() == ipu_result.size()


# torch.distributions.Normal
# The sample method uses torch.normal(Tensor mean, Tensor std)
@pytest.mark.ipuHardwareRequired
def test_distributions_normal():
    """Compare per-component mean / std of torch.distributions.Normal."""

    def sample_normal(x):
        loc = torch.tensor([234.0, 100.0])
        dist = torch.distributions.Normal(loc=loc, scale=torch.sqrt(loc))
        return dist.sample(x.size())

    def mean(x):
        return torch.mean(x, dim=[0, 1])

    def std(x):
        return torch.std(x, dim=[0, 1])

    # The harness prints each statistic's __name__, so give them
    # descriptive ones.
    mean.__name__ = "torch.mean(x, dim=[0, 1])"
    std.__name__ = "torch.std(x, dim=[0, 1])"

    shape_like = torch.empty(10000, 5)
    rng_harness(sample_normal, shape_like, [mean, std])


# torch.randn
@pytest.mark.ipuHardwareRequired
def test_randn():
    """Compare mean / variance of torch.randn samples."""

    def sample_randn(x):
        return torch.randn(x.size())

    shape_like = torch.empty(3, 5, 10000)
    rng_harness(sample_randn, shape_like, [torch.mean, torch.var])


# torch.random_
@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("input", [
    torch.empty(3, 5, 10000, dtype=torch.float),
    torch.empty(3, 5, 10000, dtype=torch.int),
])
def test_random(input):
    """Compare statistics of in-place random_(5, 100) on float and int
    tensors; the output is expected to keep the input's dtype."""

    def fill_random(x):
        return x.random_(5, 100)

    rng_harness(fill_random, input,
                [torch.min, torch.max, int_mean, int_var], input.dtype)


# torch.randint
@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("dtype", [None, torch.int32, torch.half, torch.float])
def test_randint(dtype):
    """Compare statistics of torch.randint(5, 100) for several dtypes."""

    def sample_randint(x):
        return torch.randint(5, 100, x.size(), dtype=dtype)

    # Without an explicit dtype the IPU result is expected to be int32.
    expected_dtype = dtype if dtype is not None else torch.int32

    shape_like = torch.empty(3, 5, 10000)
    rng_harness(sample_randint, shape_like,
                [torch.min, torch.max, int_mean, int_var], expected_dtype)


# torch.normal(Tensor mean, float std)
@pytest.mark.ipuHardwareRequired
def test_normal_tensor_mean():
    """Compare mean / std of torch.normal with a tensor mean, float std."""

    def sample_normal(x):
        return torch.normal(mean=x, std=3.0)

    means = torch.full(size=(10000, 2), fill_value=4.0)
    rng_harness(sample_normal, means, [torch.mean, torch.std])


# torch.normal(float mean, Tensor std)
@pytest.mark.ipuHardwareRequired
def test_normal_tensor_std():
    """Compare mean / std of torch.normal with a float mean, tensor std."""

    def sample_normal(x):
        return torch.normal(mean=3.0, std=x)

    stds = torch.full(size=(10000, 2), fill_value=9.0)
    rng_harness(sample_normal, stds, [torch.mean, torch.std])


# torch.bernoulli - test with both float and half types
@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("t", [torch.float, torch.half])
def test_bernoulli(t):
    """Compare min / max / mean of torch.bernoulli with p=0.5."""
    probabilities = torch.full(size=(3, 5, 100), dtype=t, fill_value=0.5)
    rng_harness(torch.bernoulli,
                probabilities, [torch.min, torch.max, torch.mean],
                expected_dtype=t)


# torch.bernoulli - check expected output for probability limits.
@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("p", [0.0, 1.0])
def test_bernoulli_limits(p):
    """With probability 0 or 1 every sample is expected to equal ``p``."""

    def func(x):
        return torch.all(x == p)

    func.__name__ = f"torch.all(x == {p})"

    probabilities = torch.full(size=(3, 5, 1000), fill_value=p)
    rng_harness(torch.bernoulli, probabilities, [func])


# torch.bernoulli_
@pytest.mark.ipuHardwareRequired
def test_bernoulli_():
    """Compare min / max / mean of in-place bernoulli_ with p=0.3."""

    def fill_bernoulli(x):
        return x.bernoulli_(p=0.3)

    buffer = torch.empty(3, 5, 100)
    rng_harness(fill_bernoulli, buffer, [torch.min, torch.max, torch.mean])


# torch.distributions.Bernoulli
@pytest.mark.ipuHardwareRequired
def test_distributions_bernoulli():
    """Compare min / max / mean of samples from Bernoulli(0.5)."""

    def sample_bernoulli(x):
        dist = torch.distributions.Bernoulli(0.5)
        return dist.sample(x.size())

    shape_like = torch.empty(10, 10, 1000)
    rng_harness(sample_bernoulli, shape_like,
                [torch.min, torch.max, torch.mean])


# torch.exponential_
@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("lambd", [0.5, 1.0])
def test_exponential_(lambd):
    """Compare the mean of in-place exponential_ for two rates."""

    def fill_exponential(x):
        return x.exponential_(lambd=lambd)

    buffer = torch.empty(3, 5, 100)
    rng_harness(fill_exponential, buffer, [torch.mean])


# torch.distributions.Exponential
@pytest.mark.ipuHardwareRequired
def test_distributions_exponential():
    """Compare the mean of samples from Exponential(0.5)."""

    def sample_exponential(x):
        dist = torch.distributions.Exponential(0.5)
        return dist.sample(x.size())

    shape_like = torch.empty(10, 10, 1000)
    rng_harness(sample_exponential, shape_like, [torch.mean])


@pytest.mark.ipuHardwareRequired
def test_randperm():
    """Check torch.randperm returns an int32 tensor with as many elements
    as the native result."""

    def permute(x):
        length = x.size(dim=0)
        return torch.randperm(length) + 0

    rng_harness(permute, torch.arange(100), [torch.numel], torch.int32)


@pytest.mark.ipuHardwareRequired
def test_random_seed_repeatability():
    """Two models compiled with the same random seed must generate exactly
    the same numbers."""

    class Model(torch.nn.Module):
        def forward(self, x):
            # Work on a fresh tensor so the input is not modified in place
            fresh = x + 0
            return fresh.normal_()

    model = Model()
    seeded_opts = poptorch.Options().randomSeed(42)

    # First compilation / run with the seed
    model_a = poptorch.inferenceModel(model, seeded_opts)
    run_a = model_a(torch.empty((2, 2)))

    # A second model using the very same options must reproduce the output
    model_b = poptorch.inferenceModel(model, seeded_opts)
    run_b = model_b(torch.empty((2, 2)))

    helpers.assert_allequal(expected=run_a, actual=run_b)


================================================
FILE: tests/reduce_ops_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import torch
import pytest
import numpy as np
import helpers
import poptorch


# Reduce Ops Harness
# Checks that the IPU reduce ops match the CPU version.
def reduce_harness(func, input, **kwargs):
    """Run the reduce op ``func`` on the IPU and natively and compare.

    Keyword arguments whose value is ``None`` are not forwarded to ``func``
    (to avoid named tensor errors). Floating point results are compared
    with ``assert_allclose``, all others with ``assert_allequal``; the dtype
    is only checked when a ``dtype`` keyword was supplied.
    """
    op_kwargs = {}
    for key, value in kwargs.items():
        if value is not None:
            op_kwargs[key] = value

    class Model(torch.nn.Module):
        def forward(self, x):
            # Work on a fresh tensor so the input is not modified in place
            fresh = x + 0
            return func(fresh, **op_kwargs)

    cpu_model = Model()

    ipu_model = poptorch.inferenceModel(cpu_model)
    ipu_result = ipu_model(input)
    cpu_result = cpu_model(input)

    if torch.is_floating_point(cpu_result):
        compare = helpers.assert_allclose
    else:
        compare = helpers.assert_allequal

    compare(expected=cpu_result,
            actual=ipu_result,
            check_dtype="dtype" in kwargs)


# torch.all, torch.any
@pytest.mark.parametrize("dim", [None, 0, -1])
@pytest.mark.parametrize("func", [torch.all, torch.any])
def test_any_all(func, dim):
    """Check torch.all / torch.any over everything or a single dim."""
    # Integer values in {0, 1, 2}
    values = torch.randint(low=0, high=3, size=(32, 128))
    reduce_harness(func, values, dim=dim)


@pytest.mark.parametrize("dim", [None, 0, -1])
@pytest.mark.parametrize("func", [torch.sum, torch.mean])
def test_sum_mean(func, dim):
    """Check torch.sum / torch.mean over everything or a single dim."""
    values = torch.rand(32, 128)
    reduce_harness(func, values, dim=dim)


@pytest.mark.parametrize("dim", (None, 0, -1, [1, 2]))
def test_count_nonzero(dim):
    """Check torch.count_nonzero for no dim, one dim and a list of dims."""
    torch.manual_seed(42)
    values = torch.randint(10, (2, 3, 4, 5))
    reduce_harness(torch.count_nonzero, values, dim=dim)


@pytest.mark.parametrize("dim", (None, 0, -1, [0, 1]))
@pytest.mark.parametrize("keepdim", [True, False])
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
def test_nansum(dim, keepdim, dtype):
    """Check torch.nansum on a tensor in which some entries are NaN."""
    np.random.seed(0)
    # The values are drawn from [0, 1) so that, in the torch.float16 case,
    # the sums stay within the float16 range.
    shape = (10, 10)
    nan_mask = np.random.randint(0, 2, size=shape).astype(bool)
    values = np.random.rand(*shape).astype(np.float32)
    values[nan_mask] = np.nan

    reduce_harness(torch.nansum,
                   torch.from_numpy(values),
                   dim=dim,
                   keepdim=keepdim,
                   dtype=dtype)


================================================
FILE: tests/replicated_graph_test.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import torch
import pytest
import numpy as np
import helpers
import poptorch


@pytest.mark.ipuHardwareRequired
def test_weight_update_replicas(process_id=0, num_processes=1):
    """Compare one replicated training step on the IPU with a CPU reference.

    The model computes ``alpha * (input @ B) + beta * C`` with an L1 loss.
    The CPU reference accumulates gradients over as many passes as there are
    replicas in total before taking a single SGD step; the outputs and the
    updated ``B`` / ``C`` weights must match the IPU run within 1e-3.

    :param process_id: id of this process, forwarded to
        ``opts.Distributed.configureProcessId``.
    :param num_processes: total number of processes, forwarded to
        ``opts.Distributed.configureProcessId``.
    """
    localReplicationFactor = 2

    opts = poptorch.Options()
    opts.replicationFactor(localReplicationFactor)
    opts.Distributed.configureProcessId(process_id, num_processes)

    # Total number of replicas across all the processes
    replicationFactor = localReplicationFactor * opts.Distributed.numProcesses

    np.random.seed(42)

    A = np.random.rand(2, 4).astype(np.float32)
    B = np.ones((4, 6)).astype(np.float32)
    C = np.random.rand(2, 6).astype(np.float32)

    alpha = np.random.random(1).astype(np.float32)[0]
    beta = np.random.random(1).astype(np.float32)[0]

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()

            self.b = torch.tensor(B, requires_grad=True)
            self.c = torch.tensor(C, requires_grad=True)

            # Create the weight tensors for pytorch
            self.B = torch.nn.Parameter(self.b, requires_grad=True)
            self.C = torch.nn.Parameter(self.c, requires_grad=True)

            self.matmul = torch.matmul

            self.loss = torch.nn.L1Loss(reduction="mean")

        def forward(self, input, target):
            # Perform the GEMM operation
            x = alpha * self.matmul(input, self.B) + beta * self.C
            loss = self.loss(x, target)
            return x, loss

    def reference():
        # CPU reference: returns [outputs, B, C] after a single SGD step.
        module = Model()
        module.train()

        optimizer = torch.optim.SGD(module.parameters(),
                                    lr=0.01,
                                    weight_decay=0.0,
                                    momentum=0.0)

        a = torch.tensor(A, requires_grad=True)
        optimizer.zero_grad()

        outputs = ()

        # graph with gradient accumulation i.e. only update the weights after x passes
        for _ in range(replicationFactor):
            target = torch.zeros(C.shape)
            out, loss = module(a, target)
            outputs = outputs + (out, )
            loss.backward()

        # Update the weights
        optimizer.step()

        # Only keep the output slice corresponding to this process
        outputs = outputs[opts.Distributed.processId *
                          localReplicationFactor:][:localReplicationFactor]
        return [torch.cat(outputs), module.B.data, module.C.data]

    model = Model()
    poptorch_model = poptorch.trainingModel(model,
                                            options=opts,
                                            optimizer=torch.optim.SGD(
                                                model.parameters(),
                                                lr=0.01,
                                                weight_decay=0.0,
                                                momentum=0.0))

    ref_out = reference()
    # The IPU input is A repeated once per local replica.
    ipu_A = np.concatenate([A for _ in range(localReplicationFactor)])

    target = torch.zeros(2 * localReplicationFactor, 6)
    output, _ = poptorch_model(torch.tensor(ipu_A, requires_grad=True), target)
    out = [output, model.B.data, model.C.data]
    for idx, ref in enumerate(ref_out):
        print("Validating output %d" % idx)
        helpers.assert_allclose(actual=out[idx],
                                expected=ref,
                                rtol=1e-03,
                                atol=1e-03)


@pytest.mark.ipuHardwareRequired
def test_too_many_ipus():
    """Requesting 128 replicas is expected to fail with a "Too many IPUs
    requested" error when the model is first run."""
    replica_count = 128

    opts = poptorch.Options()
    opts.replicationFactor(replica_count)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(128, 4)
            self.loss = torch.nn.L1Loss(reduction="mean")

        def forward(self, input, target):
            prediction = self.layer(input)
            return prediction, self.loss(prediction, target)

    model = Model()
    sgd = torch.optim.SGD(model.parameters(),
                          lr=0.01,
                          weight_decay=0.0,
                          momentum=0.0)
    poptorch_model = poptorch.trainingModel(model,
                                            options=opts,
                                            optimizer=sgd)

    np.random.seed(42)
    features = np.random.rand(512, 128).astype(np.float32)
    labels = np.ones((128, 4)).astype(np.float32)

    features_tensor = torch.tensor(features, requires_grad=True)
    labels_tensor = torch.tensor(labels)

    expected_error = (
        r"Too many IPUs requested \(128\)\. Experiments that need .*")
    with pytest.raises(poptorch.Error, match=expected_error):
        poptorch_model(features_tensor, labels_tensor)


class ModelWithLoss(torch.nn.Module):
    """Single weight model: returns ``X @ W`` and the mean of its square
    wrapped in a PopTorch identity loss."""

    def __init__(self, W_init):
        super().__init__()
        # W_init becomes the only trainable parameter of the model.
        self.W = torch.nn.Parameter(W_init)

    def forward(self, X):
        product = torch.matmul(X, self.W)
        loss = poptorch.identity_loss(product**2, reduction="mean")
        return product, loss


@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("orthogonalInput", [True, False])
def test_per_replica_variables(orthogonalInput):
    """Train with the weight and the input sharded across replica groups.

    The same problem (minimise ``mean((X @ W)**2)`` with SGD) is solved on
    the CPU with the full tensors and on the IPU with ``W`` split in 4 and
    ``X`` split in 2; the losses and the final weights must match.

    :param orthogonalInput: if True the input uses the Orthogonal group type
        and the weight the Consecutive one, otherwise the other way round.
    """
    # Split the weight tensor into 4, and the input data tensor into 2.
    tensor_shards = 4
    data_shards = 2

    # Set up the problem
    random = np.random.RandomState(seed=100)
    prob_X = random.normal(size=(24, 40)).astype(np.float32)
    prob_W_init = random.normal(size=(40, 56)).astype(
        np.float32) * (5 * 8)**-0.5
    prob_steps = 4

    # Run on the CPU
    X = torch.tensor(prob_X)
    W = torch.nn.Parameter(torch.tensor(prob_W_init))
    optim = torch.optim.SGD([W], lr=0.01)

    cpu_losses = []
    for _ in range(prob_steps):
        optim.zero_grad()
        v = (X @ W)**2
        loss = torch.mean(v)
        loss.backward()
        optim.step()
        cpu_losses.append(loss.detach())
    cpu_losses = np.array(cpu_losses)
    cpu_W_final = W.detach().numpy()

    # Run on 8 IPUs
    # Split the columns of W into tensor_shards chunks and move the shard
    # index to the front: (40, 56) -> (40, 4, 14) -> (4, 40, 14).
    W_init = torch.tensor(
        prob_W_init.reshape(prob_W_init.shape[0], tensor_shards,
                            prob_W_init.shape[1] // tensor_shards).transpose(
                                1, 0, 2)).contiguous()
    m = ModelWithLoss(W_init)
    optim = torch.optim.SGD(m.parameters(), lr=0.01)

    inputGroupType = poptorch.enums.CommGroupType.Consecutive
    weightGroupType = poptorch.enums.CommGroupType.Orthogonal
    if orthogonalInput:
        # Swap the group types used for the input and for the weight
        inputGroupType, weightGroupType = weightGroupType, inputGroupType
    pt_opts = poptorch.Options()
    pt_opts.replicationFactor(data_shards * tensor_shards)
    pt_opts.inputReplicaGrouping(tensor_shards, inputGroupType)
    pt_opts.outputMode(poptorch.OutputMode.All)
    pt_m = poptorch.trainingModel(m, optimizer=optim, options=pt_opts)

    pt_m.W.replicaGrouping(weightGroupType, data_shards,
                           poptorch.enums.VariableRetrievalMode.OnePerGroup)
    pt_losses = []
    if data_shards > 1:
        # Split the rows of X into data_shards chunks: (24, 40) -> (2, 12, 40)
        X = X.reshape(data_shards, X.shape[0] // data_shards, *X.shape[1:])
    for _ in range(prob_steps):
        _, loss = pt_m(X)
        # We divide by the number of replicas because the mean is being
        # taken only over a part of the tensor on each replica, so we need to
        # divide by the number of replicas to get the correct mean.
        pt_losses.append(
            torch.sum(loss.detach()) / (data_shards * tensor_shards))
    pt_losses = np.array(pt_losses)
    # Undo the sharding layout of W before comparing with the CPU weights
    pt_W_final = m.W.detach().numpy().transpose(1, 0, 2) \
                  .reshape(prob_W_init.shape)
    np.testing.assert_allclose(cpu_losses, pt_losses, atol=1e-6)
    np.testing.assert_allclose(cpu_W_final, pt_W_final, atol=1e-6)


@pytest.mark.ipuHardwareRequired
def test_per_replica_variables_no_grouping():
    """Check the optimizer state shape when a weight uses NoGrouping.

    After a single training step every entry of the optimizer's
    "ipu_state" must have a leading dimension equal to the replication
    factor requested in the options.
    """
    num_replicas = 4
    num_columns = num_replicas + 2

    model = ModelWithLoss(torch.randn(num_replicas, num_columns))
    optimizer = poptorch.optim.AdamW(model.parameters(), 0.1)

    # NOTE(review): the replication factor is set through attribute
    # assignment here whereas other tests call
    # options.replicationFactor(n) - confirm both are equivalent.
    options = poptorch.Options()
    options.replication_factor = num_replicas

    training_model = poptorch.trainingModel(model,
                                            options,
                                            optimizer=optimizer)
    training_model.W.replicaGrouping(
        poptorch.CommGroupType.NoGrouping, 0,
        poptorch.VariableRetrievalMode.OnePerGroup)

    # One step is enough to populate the optimizer state.
    training_model(torch.randn(num_replicas, num_columns))

    ipu_state = optimizer.state_dict()["ipu_state"]
    for state_tensor in ipu_state.values():
        assert state_tensor.shape[0] == num_replicas

    training_model.destroy()


================================================
FILE: tests/requires_grad_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import torch
import helpers
import poptorch


def test_requires_grad_false_simple():
    """Parameters created with requires_grad=False must not be updated.

    The model sums two trainable and two frozen parameters; after training
    steps on the IPU only the trainable ones may differ from their initial
    values.
    """
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self, a, b, c, d):
            super().__init__()
            # 'a' and 'b' are trainable, 'c' and 'd' are frozen.
            self.a = torch.nn.Parameter(a)
            self.b = torch.nn.Parameter(b)
            self.c = torch.nn.Parameter(c, requires_grad=False)
            self.d = torch.nn.Parameter(d, requires_grad=False)
            self.loss = torch.nn.MSELoss()

        def forward(self, target):
            trainable_sum = self.a + self.b
            frozen_sum = self.c + self.d
            return self.loss(trainable_sum + frozen_sum, target)

    # Drawn in this order: a, b (end up with requires_grad=True),
    # c, d (end up with requires_grad=False), then the target.
    a, b, c, d, target = (torch.randn(5) for _ in range(5))

    # The model gets clones so the originals keep the initial values.
    model = Model(*(t.clone() for t in (a, b, c, d)))
    native_out = model(target)

    poptorch_model = poptorch.trainingModel(model)
    first_out = poptorch_model(target)
    helpers.assert_allclose(actual=first_out, expected=native_out)

    for _ in range(100):
        step_out = poptorch_model(target)
        assert not torch.allclose(step_out, native_out)
        # 'a' and 'b' are updated
        for name, initial in (("a", a), ("b", b)):
            assert not torch.allclose(
                getattr(poptorch_model, name).data, initial)
        # 'c' and 'd' are not updated
        for name, initial in (("c", c), ("d", d)):
            helpers.assert_allclose(actual=getattr(poptorch_model, name).data,
                                    expected=initial)


def test_requires_grad_false_on_single_input():
    """One operand of an add is frozen: only the other one may be trained."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self, a, b):
            super().__init__()
            # 'a' is trainable, 'b' is frozen.
            self.a = torch.nn.Parameter(a)
            self.b = torch.nn.Parameter(b, requires_grad=False)
            self.loss = torch.nn.MSELoss()

        def forward(self, target):
            return self.loss(self.a + self.b, target)

    # Drawn in this order: a (ends up with requires_grad=True),
    # b (ends up with requires_grad=False), then the target.
    a, b, target = (torch.randn(5) for _ in range(3))

    # The model gets clones so the originals keep the initial values.
    model = Model(a.clone(), b.clone())
    native_out = model(target)

    poptorch_model = poptorch.trainingModel(model)
    first_out = poptorch_model(target)
    helpers.assert_allclose(actual=first_out, expected=native_out)

    for _ in range(100):
        step_out = poptorch_model(target)
        assert not torch.allclose(step_out, native_out)
        # 'a' is updated
        assert not torch.allclose(poptorch_model.a.data, a)
        # 'b' is not updated
        helpers.assert_allclose(actual=poptorch_model.b.data, expected=b)


================================================
FILE: tests/rnn_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import pytest
import torch
import torch.nn as nn
import helpers
import poptorch


@pytest.mark.parametrize("nonlinearity", ['tanh', 'relu'])
@pytest.mark.parametrize("batch_first", [True, False])
def test_rnn(nonlinearity, batch_first):
    """Run a single-layer nn.RNN step by step on CPU and IPU.

    For every batch the outputs and hidden states of both runs must match
    and the IPU weights must have changed. The CPU hidden state is fed
    back as the initial hidden state of the next batch.
    """
    torch.manual_seed(42)
    num_batches, sequence_length, batch_size = 10, 5, 8
    input_size, hidden_size, num_layers = 4, 3, 1

    input_shape = ((batch_size, sequence_length, input_size) if batch_first
                   else (sequence_length, batch_size, input_size))

    batches = [torch.randn(input_shape) for _ in range(num_batches)]
    hidden = torch.randn((num_layers, batch_size, hidden_size))

    rnn = nn.RNN(
        input_size,
        hidden_size,
        num_layers,
        nonlinearity=nonlinearity,
        batch_first=batch_first,
    )
    model = helpers.ModelWithWeights(rnn, batches[0].shape, lambda x: x[0])
    ipu_model = poptorch.trainingModel(model)

    for batch in batches:
        step_inputs = (batch, hidden)
        (out_cpu, h_cpu), _ = model(step_inputs)
        (out_ipu, h_ipu), _ = ipu_model(step_inputs)
        helpers.assert_allclose(actual=out_ipu, expected=out_cpu)
        helpers.assert_allclose(actual=h_ipu, expected=h_cpu)
        ipu_model.assert_weights_changed()
        hidden = h_cpu


================================================
FILE: tests/sharding_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
import torch
import helpers
import poptorch


def test_sharded_execution():
    """Run four multiplications, one poptorch.Block each, with
    ShardedExecution and compare the inference result with the CPU."""

    class Model(torch.nn.Module):
        def forward(self, x):
            # Block "k" is assigned ipu_id=k and multiplies by the k-th
            # factor.
            for ipu, factor in enumerate((2, 3, 4, 5)):
                with poptorch.Block(str(ipu), ipu_id=ipu):
                    x = x * factor
            return x

    native = Model()

    # One stage per block, named after the block it contains.
    stages = [poptorch.Stage(str(k)) for k in range(4)]
    opts = poptorch.Options()
    opts.setExecutionStrategy(poptorch.ShardedExecution(*stages))
    ipu = poptorch.inferenceModel(native, opts)

    torch.manual_seed(42)
    inp = torch.randn(3, 7)

    expected = native(inp)
    actual = ipu(inp)
    helpers.assert_allclose(actual=actual, expected=expected)


================================================
FILE: tests/slice_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import copy

import pytest
import torch

import helpers
import poptorch


def slice_test_harness(tensor_x, tensor_y, start_fn, end_fn, step):
    """Compare `x[start:end:step] + y` between the CPU and the IPU.

    :param tensor_x: Tensor to slice along its first dimension.
    :param tensor_y: Tensor added to the slice.
    :param start_fn: Callable returning the slice start from the tensor.
    :param end_fn: Callable returning the slice end from the tensor.
    :param step: Slice step.
    """

    def op(x, y):
        return x[start_fn(x):end_fn(x):step] + y

    model = helpers.ModelWithWeights(op, tensor_x.shape)
    inputs = (tensor_x, tensor_y)

    # CPU reference first, then the same model trained on the IPU.
    native_out, _ = model(inputs)
    poptorch_model = poptorch.trainingModel(model, options=poptorch.Options())
    poptorch_out, _ = poptorch_model(inputs)

    # Inference test - check outputs
    helpers.assert_allclose(expected=native_out, actual=poptorch_out)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()


@pytest.mark.parametrize("step", [1, 2, 3])
def test_slice_idx_size_of(step):
    """Slice bounds are derived from the size of the sliced tensor."""
    values = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    offset = torch.tensor([3.0])

    # From the middle of the tensor up to (excluding) the last element.
    slice_test_harness(values,
                       offset,
                       start_fn=lambda tensor_in: tensor_in.shape[0] // 2,
                       end_fn=lambda tensor_in: tensor_in.shape[0] - 1,
                       step=step)


def dynamic_slice_harness(tensor_in,
                          extra_in,
                          start_fn,
                          end_fn,
                          step,
                          test_training=False):
    """Compare poptorch.dynamic_slice along dim 0 between CPU and IPU.

    :param tensor_in: Tensor to slice.
    :param extra_in: Tensor passed to start_fn to compute the slice start.
    :param start_fn: Callable computing the slice start.
    :param end_fn: Callable computing the slice end; only used to derive
        the (static) slice size as ``end_fn(1) - start_fn(1)``.
    :param step: Step forwarded to poptorch.dynamic_slice.
    :param test_training: Use a training model (and check the weights
        changed) instead of an inference model.
    """
    # TODO(T62094) PopART doesn't currently support dynamic slices in training.
    # Once it works, switch back test_training to True by default.
    options = poptorch.Options()
    if not test_training:
        model = torch.nn.Module()
        end_probe = end_fn(torch.tensor([1], dtype=torch.int))
        start_probe = start_fn(torch.tensor([1], dtype=torch.int))
        size = (end_probe - start_probe).item()

        def forward(t, e):
            return poptorch.dynamic_slice(t, 0, start_fn(e), size, step)

        model.forward = forward

        # Run on CPU.
        native_out = model(tensor_in, extra_in)

        # Run on IPU.
        poptorch_model = poptorch.inferenceModel(model, options)
        # Make sure the model is compiled using different tensor values
        # otherwise there is no way to tell if the values are compiled
        # in the executable or truly dynamic.
        compile_tensor = torch.randn_like(tensor_in)  # Use a random input
        compile_extra = extra_in + torch.tensor([20])  # Offset extra_in
        poptorch_model.compile(compile_tensor, compile_extra)
        poptorch_out = poptorch_model(tensor_in, extra_in)
    else:
        size = end_fn(1) - start_fn(1)

        def op(t, e):
            return poptorch.dynamic_slice(t, 0, start_fn(e), size, step)

        model = helpers.ModelWithWeights(op, tensor_in.shape)
        inputs = (tensor_in, extra_in)

        # Run on CPU.
        native_out, _ = model(inputs)

        # Run on IPU.
        poptorch_model = poptorch.trainingModel(model, options)
        poptorch_out, _ = poptorch_model(inputs)

        # Training test - check weights changed
        poptorch_model.assert_weights_changed()

    helpers.assert_allclose(expected=native_out, actual=poptorch_out)


@pytest.mark.parametrize("step", [1, 2, 3])
def test_dynamic_slice_one_dim_add(step):
    """Slice [extra_in, extra_in + 4) of a 1D tensor: the end is obtained
    by adding to the start."""
    data = torch.tensor([2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    start_index = torch.tensor([1])

    dynamic_slice_harness(data,
                          start_index,
                          start_fn=lambda extra_in: extra_in,
                          end_fn=lambda extra_in: extra_in + 4,
                          step=step)


@pytest.mark.parametrize("step", [1, 2, 3])
def test_dynamic_slice_one_dim_subtract(step):
    """Slice [extra_in - 4, extra_in) of a 1D tensor: the start is obtained
    by subtracting from the end."""
    data = torch.tensor([2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    end_index = torch.tensor([5])

    dynamic_slice_harness(data,
                          end_index,
                          start_fn=lambda extra_in: extra_in - 4,
                          end_fn=lambda extra_in: extra_in,
                          step=step)


@pytest.mark.parametrize("step", [1, 2, 3])
def test_dynamic_slice_one_dim_mix_up(step):
    """Slice bounds are computed through a chain of additions and
    subtractions applied to extra_in."""

    def start_fn(extra_in):
        # Same chain of operations as three separate statements:
        # +3, then -10, then +3.
        return ((extra_in + 3) - 10) + 3

    def end_fn(extra_in):
        # -6, then +4.
        return (extra_in - 6) + 4

    data = torch.tensor([2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    dynamic_slice_harness(data, torch.tensor([5]), start_fn, end_fn, step)


@pytest.mark.parametrize("step", [1, 2, 3])
def test_dynamic_slice_two_dims(step):
    """Take a one-row dynamic slice out of a 2D tensor, with the index
    explicitly cast to int32."""
    rows = torch.tensor([[2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
                         [8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]])
    row_index = torch.tensor([0])

    dynamic_slice_harness(
        rows,
        row_index,
        start_fn=lambda extra_in: extra_in.to(torch.int32),
        end_fn=lambda extra_in: extra_in.to(torch.int32) + 1,
        step=step)


@pytest.mark.parametrize("step", [1, 2, 3])
def test_dynamic_slice_two_dims_twice_sliced(step):
    """Slice both dimensions of a 2D tensor using tensors as slice bounds."""
    start_dim_one = torch.tensor([1])
    start_dim_two = torch.tensor([0])

    def op(t):
        # The bounds are computed inside the op, at call time.
        row_slice = slice(start_dim_one, start_dim_one + 2, step)
        col_slice = slice(start_dim_two, start_dim_two + 4, step)
        return t[row_slice, col_slice]

    ascending = [2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
    descending = [8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
    tensor_in = torch.tensor([ascending, descending, ascending, descending])

    model = helpers.ModelWithWeights(op, tensor_in.shape)
    inputs = (tensor_in, )

    # Run on CPU.
    native_out, _ = model(inputs)

    # Run on IPU.
    poptorch_model = poptorch.trainingModel(model, options=poptorch.Options())
    poptorch_out, _ = poptorch_model(inputs)

    # Inference test - check outputs
    helpers.assert_allclose(expected=native_out, actual=poptorch_out)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()


def test_dynamic_slice_one_dim_equal():
    """A dynamic slice whose start equals its end must raise an error."""

    def same_index(extra_in):
        return extra_in

    error_msg = r"The start and end of a slice must be different."
    data = torch.tensor([2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    index = torch.tensor([5])

    with pytest.raises(poptorch.Error, match=error_msg):
        # Set test_training=False because we expect inference to fail
        dynamic_slice_harness(data,
                              index,
                              start_fn=same_index,
                              end_fn=same_index,
                              step=1,
                              test_training=False)


def test_dynamic_slice_one_dim_less_than():
    """A dynamic slice whose end is before its start must raise an error."""
    error_msg = (r"Taking a slice of a tensor with the end less than the "
                 r"start is not supported.")
    data = torch.tensor([2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    index = torch.tensor([5])

    with pytest.raises(poptorch.Error, match=error_msg):
        # Set test_training=False because we expect inference to fail
        dynamic_slice_harness(data,
                              index,
                              start_fn=lambda extra_in: extra_in,
                              end_fn=lambda extra_in: extra_in - 2,
                              step=2,
                              test_training=False)


def test_dynamic_slice_one_dim_add_non_factor():
    """A slice size (7) which is not a factor of the sliced dimension (8)
    must raise an error."""
    error_msg = (r"The size of the slice \(7\) must be a factor of the "
                 r"slicing dimension \(8\)\.")
    data = torch.tensor([2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    index = torch.tensor([1])

    with pytest.raises(poptorch.Error, match=error_msg):
        # Set test_training=False because we expect inference to fail
        dynamic_slice_harness(data,
                              index,
                              start_fn=lambda extra_in: extra_in,
                              end_fn=lambda extra_in: extra_in + 7,
                              step=1,
                              test_training=False)


@pytest.mark.parametrize("dim", [0, 1, 2])
@pytest.mark.parametrize("use_half", [True, False])
def test_unbind(dim, use_half):
    """Compare torch.unbind along `dim` between the CPU and the IPU.

    With use_half the IPU copy of the model and its input are converted to
    half precision and a float constant is added to the first output to
    exercise implicit casting.
    """

    def op(x):
        unbound = torch.unbind(x, dim)
        if use_half:
            # Test correct implicit casting
            return unbound[0] + 2.0, unbound[1]
        return unbound

    x = torch.randn(2, 3, 4)

    model = helpers.ModelWithWeights(op, x.shape, out_fn=lambda x: x[0])

    # Unfortunately not all forms of matmul are supported for torch.half on the
    # CPU (including 1-dim input, 2-dim weights), so we can only run the IPU
    # model with halves.
    ipu_module = copy.deepcopy(model)
    if use_half:
        ipu_module.half()
        # Refresh the reference weights so they match the half weights.
        # pylint: disable=protected-access
        ipu_module._weights_before = ipu_module.lin.weight.detach().clone()

    poptorch_model = poptorch.trainingModel(ipu_module,
                                            options=poptorch.Options())

    ipu_input = x.half() if use_half else x
    native_out, _ = model((x, ))
    poptorch_out, _ = poptorch_model((ipu_input, ))

    # Check the unbound dim length is the same
    assert len(native_out) == len(poptorch_out)

    # Inference test - check outputs
    for tensor_native, tensor_pop in zip(native_out, poptorch_out):
        expected = tensor_native.half() if use_half else tensor_native
        helpers.assert_allclose(expected=expected,
                                actual=tensor_pop,
                                atol=0.01,
                                rtol=0.01)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()


def test_scalarslice():
    """A full slice `[:]` of a single-element tensor gives the same result
    on the IPU and on the CPU."""

    class Model(torch.nn.Module):
        def forward(self, x):
            halved = x / 2
            return halved[:]

    model = Model()
    poptorch_model = poptorch.inferenceModel(model, poptorch.Options())

    input_tensor = torch.tensor([2])
    ipu_result = poptorch_model(input_tensor)
    cpu_result = model(input_tensor)
    assert ipu_result == cpu_result


def test_select_negative_dim():
    """Tensor.select with a negative dim gives identical results on the IPU
    and on the CPU."""

    class Model(torch.nn.Module):
        def forward(self, x):
            return x.select(-1, 1)

    model = Model()
    poptorch_model = poptorch.inferenceModel(model, poptorch.Options())

    input_tensor = torch.rand((2, 4))
    ipu_result = poptorch_model(input_tensor)
    cpu_result = model(input_tensor)
    helpers.assert_allequal(actual=ipu_result, expected=cpu_result)


def test_slice_negative_dim():
    """Tensor.narrow with a negative dim gives identical results on the IPU
    and on the CPU."""

    class Model(torch.nn.Module):
        def forward(self, x):
            # Exercises a negative dim in the JIT dispatcher.
            # NOTE(review): the previous comment said this lowers to
            # aten::select, which looks copied from the select test;
            # narrow is expected to lower to aten::slice - confirm.
            return x.narrow(-1, 0, 2)

    model = Model()
    poptorch_model = poptorch.inferenceModel(model, poptorch.Options())

    input_tensor = torch.rand((2, 4))
    ipu_result = poptorch_model(input_tensor)
    cpu_result = model(input_tensor)
    helpers.assert_allequal(actual=ipu_result, expected=cpu_result)


def dynamic_update_harness(tensor_in,
                           src_in,
                           extra_in,
                           start_fn,
                           end_fn,
                           dim=0,
                           test_training=False):
    """Compare poptorch.dynamic_update between the IPU and the CPU.

    :param tensor_in: Tensor to update.
    :param src_in: Values written into tensor_in.
    :param extra_in: Tensor passed to start_fn to compute the update start.
    :param start_fn: Callable computing the start of the updated region.
    :param end_fn: Callable computing the end of the region; only used to
        derive the (static) size as ``end_fn(1) - start_fn(1)``.
    :param dim: Dimension forwarded to poptorch.dynamic_update.
    :param test_training: Use a training model (and check the weights
        changed) instead of an inference model.
    """
    # TODO(T62094) PopART doesn't currently support dynamic slices in training.
    # Once it works, switch back test_training to True by default.
    options = poptorch.Options()
    if not test_training:
        model = torch.nn.Module()
        end_probe = end_fn(torch.tensor([1], dtype=torch.int))
        start_probe = start_fn(torch.tensor([1], dtype=torch.int))
        size = (end_probe - start_probe).item()

        def forward(t, s, e):
            return poptorch.dynamic_update(t, s, dim, start_fn(e), size)

        model.forward = forward

        # Run on IPU.
        poptorch_model = poptorch.inferenceModel(model, options)
        # Make sure the model is compiled using different tensor values
        # otherwise there is no way to tell if the values are compiled
        # in the executable or truly dynamic.
        compile_tensor = torch.randn_like(tensor_in)  # Use a random input
        compile_src = torch.randn_like(src_in)  # Use random source values
        compile_extra = extra_in + torch.tensor([20])  # Offset extra_in
        poptorch_model.compile(compile_tensor, compile_src, compile_extra)
        poptorch_out = poptorch_model(tensor_in, src_in, extra_in)

        # Run on CPU.
        native_out = model(tensor_in, src_in, extra_in)
    else:
        size = end_fn(1) - start_fn(1)

        def op(t, s, e):
            return poptorch.dynamic_update(t, s, dim, start_fn(e), size)

        model = helpers.ModelWithWeights(op, tensor_in.shape)
        inputs = (tensor_in, src_in, extra_in)

        # Run on IPU.
        poptorch_model = poptorch.trainingModel(model, options)
        poptorch_out, _ = poptorch_model(inputs)

        # Run on CPU.
        native_out, _ = model(inputs)

        # Training test - check weights changed
        poptorch_model.assert_weights_changed()

    helpers.assert_allclose(expected=native_out, actual=poptorch_out)


def test_dynamic_update_single_update():
    """Overwrite a single element of a 1D tensor."""
    dest = torch.tensor([2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    src = torch.tensor([-1.0])
    index = torch.tensor([1])

    dynamic_update_harness(dest,
                           src,
                           index,
                           start_fn=lambda extra_in: extra_in,
                           end_fn=lambda extra_in: extra_in + 1)


def test_dynamic_update_one_dim_add():
    """Overwrite four elements of a 1D tensor: the end is obtained by
    adding to the start."""
    dest = torch.tensor([2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    src = torch.tensor([-1.0, -1.0, -1.0, -1.0])
    index = torch.tensor([1])

    dynamic_update_harness(dest,
                           src,
                           index,
                           start_fn=lambda extra_in: extra_in,
                           end_fn=lambda extra_in: extra_in + 4)


def test_dynamic_update_one_dim_subtract():
    """Overwrite four elements of a 1D tensor: the start is obtained by
    subtracting from the end."""
    dest = torch.tensor([2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    src = torch.tensor([-1.0, -1.0, -1.0, -1.0])
    index = torch.tensor([5])

    dynamic_update_harness(dest,
                           src,
                           index,
                           start_fn=lambda extra_in: extra_in - 4,
                           end_fn=lambda extra_in: extra_in)


def test_dynamic_update_one_dim_equal():
    """A dynamic update whose start equals its end must raise an error."""

    def same_index(extra_in):
        return extra_in

    error_msg = r"The start and end of a slice must be different"
    dest = torch.tensor([2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    src = torch.tensor([-1.0])
    index = torch.tensor([1])

    with pytest.raises(poptorch.Error, match=error_msg):
        dynamic_update_harness(dest,
                               src,
                               index,
                               start_fn=same_index,
                               end_fn=same_index)


def test_dynamic_update_one_dim_add_non_factor():
    """Update 2 elements of a 3-element tensor: the update size is not a
    factor of the updated dimension."""
    dest = torch.tensor([2.0, 2.0, 3.0])
    src = torch.tensor([-1.0, -1.0])
    index = torch.tensor([1])

    # Unlike test_dynamic_slice_one_dim_add_non_factor no error is asserted
    # here: the harness compares the inference result with the CPU.
    dynamic_update_harness(dest,
                           src,
                           index,
                           start_fn=lambda extra_in: extra_in,
                           end_fn=lambda extra_in: extra_in + 2,
                           test_training=False)


def test_dynamic_update_one_dim_less_than():
    """A dynamic update whose end is before its start must raise an
    error."""
    error_msg = (r"Taking a slice of a tensor with the end less than the "
                 r"start is not supported.")
    dest = torch.tensor([2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    src = torch.tensor([7.0, 8.0])
    index = torch.tensor([5])

    with pytest.raises(poptorch.Error, match=error_msg):
        # Set test_training=False because we expect inference to fail
        dynamic_update_harness(dest,
                               src,
                               index,
                               start_fn=lambda extra_in: extra_in,
                               end_fn=lambda extra_in: extra_in - 2,
                               test_training=False)


def test_dynamic_update_two_dims():
    """Overwrite one full row of a 2D tensor, with the index explicitly
    cast to int32."""
    dest = torch.tensor([[2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
                         [8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]])
    src = torch.tensor([[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]])
    row_index = torch.tensor([0])

    dynamic_update_harness(
        dest,
        src,
        row_index,
        start_fn=lambda extra_in: extra_in.to(torch.int32),
        end_fn=lambda extra_in: extra_in.to(torch.int32) + 1)


def test_dynamic_update_wrong_dim():
    """Updating a 2D tensor from a 1D source must raise an error."""
    error_msg = (r"input and src tensors must have same dimensionality. "
                 r"\(2\) vs \(1\)")
    dest = torch.tensor([[2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
                         [8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]])
    # One dimension fewer than 'dest'.
    src = torch.tensor([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0])
    row_index = torch.tensor([0])

    with pytest.raises(poptorch.Error, match=error_msg):
        dynamic_update_harness(
            dest,
            src,
            row_index,
            start_fn=lambda extra_in: extra_in.to(torch.int32),
            end_fn=lambda extra_in: extra_in.to(torch.int32) + 1)


def test_dynamic_update_two_dims_dim1():
    """Overwrite one column of a 2D tensor (update along dim 1)."""
    dest = torch.tensor([[2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
                         [8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]])
    src = torch.tensor([[-1.0], [-1.0]])
    column_index = torch.tensor([4])

    dynamic_update_harness(
        dest,
        src,
        column_index,
        start_fn=lambda extra_in: extra_in.to(torch.int32),
        end_fn=lambda extra_in: extra_in.to(torch.int32) + 1,
        dim=1)


def test_dynamic_update_3_dims_dim0():
    """Overwrite a region of size 2 along dim 0 of a 3D tensor."""
    dest = torch.ones(3, 4, 5)
    src = torch.ones(2, 4, 5) * -1.0
    index = torch.tensor([1])

    dynamic_update_harness(
        dest,
        src,
        index,
        start_fn=lambda extra_in: extra_in.to(torch.int32),
        end_fn=lambda extra_in: extra_in.to(torch.int32) + 2,
        dim=0)


def test_dynamic_update_3_dims_dim1():
    """Overwrite a region of size 2 along dim 1 of a 3D tensor."""
    dest = torch.ones(3, 4, 5)
    src = torch.ones(3, 2, 5) * -1.0
    index = torch.tensor([1])

    dynamic_update_harness(
        dest,
        src,
        index,
        start_fn=lambda extra_in: extra_in.to(torch.int32),
        end_fn=lambda extra_in: extra_in.to(torch.int32) + 2,
        dim=1)


def test_dynamic_update_3_dims_dim2():
    """Overwrite a region of size 3 along dim 2 of a 3D tensor."""
    dest = torch.ones(3, 4, 5)
    src = torch.ones(3, 4, 3) * -1.0
    index = torch.tensor([2])

    dynamic_update_harness(
        dest,
        src,
        index,
        start_fn=lambda extra_in: extra_in.to(torch.int32),
        end_fn=lambda extra_in: extra_in.to(torch.int32) + 3,
        dim=2)


def test_dynamic_update_wrong_dtype():
    """Compiling a dynamic_update whose source is an integer tensor while
    the updated tensor is float must raise an error."""
    dest = torch.tensor([2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    int_src = torch.tensor([-1])
    idx = torch.tensor([1])

    def forward(t, s, e):  # pylint: disable=unused-argument
        # The captured 'idx' is used as the offset: 'e' is not read.
        return poptorch.dynamic_update(t, s, 0, idx, 1)

    model = torch.nn.Module()
    model.forward = forward

    # Run on IPU.
    poptorch_model = poptorch.inferenceModel(model, poptorch.Options())

    error_msg = (r"input and src tensor must have same dtype\."
                 r" \(torch\.float32 vs torch.int32\)")

    with pytest.raises(poptorch.Error, match=error_msg):
        poptorch_model.compile(
            torch.randn_like(dest),  # Use a random input
            int_src,
            idx + torch.tensor([20])  # Offset extra_in
        )


================================================
FILE: tests/tensor_ops_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import copy
from functools import partial
import re
import pytest
import torch
import helpers
import poptorch

# Tensors

# Creation ops (we don't support many of these)
# torch.numel, torch.tensor, torch.sparse_coo_tensor, torch.as_tensor, torch.as_strided, torch.from_numpy, torch.zeros,
# torch.zeros_like, torch.ones, torch.ones_like, torch.arange, torch.range, torch.linspace, torch.logspace, torch.eye,
# torch.empty, torch.empty_like, torch.empty_strided, torch.full, torch.full_like, torch.quantize_per_tensor, torch.quantize_per_channel,

# Indexing, Slicing, Joining, Mutating Ops
# torch.cat, torch.chunk, torch.gather, torch.index_select, torch.masked_select, torch.narrow, torch.nonzero, torch.reshape, torch.split,
# torch.squeeze, torch.stack, torch.t, torch.take, torch.transpose, torch.unbind, torch.unsqueeze, torch.where, torch._C.Generator,
# torch._C.Generator.device,


def zeros_and_ones_harness(model, dtype, is_like):
    """Run a zeros/ones creation model on the CPU and the IPU and compare.

    :param model: Module returning an iterable of tensors.
    :param dtype: One of float16, float32, int32 or bool. Floating point
        types are tested in training, int32 and bool in inference only.
    :param is_like: Pass an extra (3, 5, 1) tensor as second input, for
        models using the ``*_like`` creation ops.
    """
    assert dtype in [torch.float16, torch.float32, torch.int32, torch.bool]
    torch.manual_seed(42)

    # Calculating with ints/bools does not produce meaningful gradients
    test_training = dtype not in (torch.int32, torch.bool)
    is_half = dtype is torch.float16

    inputs = (torch.tensor([1], dtype=dtype), )
    if is_like:
        inputs += (torch.empty(3, 5, 1), )

    options = poptorch.Options()
    if test_training:

        def first_output(out):
            return out[0]

        model = helpers.ModelWithWeights(model,
                                         inputs[0].shape,
                                         out_fn=first_output)
        # We need to copy the model to use the original weights for native comparison
        cpu_model = copy.deepcopy(model)
        # Run on IPU.
        poptorch_model = poptorch.trainingModel(model, options)
        poptorch_out, _ = poptorch_model(inputs)
        if is_half:
            # Promote CPU model and input
            cpu_model = cpu_model.float()
            inputs = tuple(tensor.float() for tensor in inputs)
            # promote IPU result to allow comparison
            poptorch_out = [pop.float() for pop in poptorch_out]
        native_out, _ = cpu_model(inputs)
    else:
        native_out = model(*inputs)
        poptorch_model = poptorch.inferenceModel(model, options)
        poptorch_out = poptorch_model(*inputs)

    # Inference test - check outputs (looser tolerances for halves)
    rtol, atol = (0.001, 1e-4) if is_half else (0.0001, 1e-5)
    for native, pop in zip(native_out, poptorch_out):
        helpers.assert_allclose(expected=native,
                                actual=pop,
                                rtol=rtol,
                                atol=atol)

    if test_training:
        # Training test - check weights changed
        poptorch_model.assert_weights_changed()


# dtypes used to parametrize the zeros / ones tests below: the same list as
# the one accepted by the assertion at the top of zeros_and_ones_harness.
zeros_and_ones_dtypes = [torch.float16, torch.float32, torch.int32, torch.bool]


@pytest.mark.parametrize("dtype", zeros_and_ones_dtypes)
def test_zeros_and_ones(dtype):
    """torch.zeros / torch.ones with an explicit dtype."""

    class Model(torch.nn.Module):
        def forward(self, z):
            zeros = torch.zeros(3, 5, 1, dtype=dtype)
            ones = torch.ones(3, 5, 1, dtype=dtype)

            product_out = (zeros * ones) + z
            sum_out = (ones + zeros) + z
            return product_out, sum_out

    zeros_and_ones_harness(Model(), dtype, is_like=False)


@pytest.mark.parametrize("dtype", zeros_and_ones_dtypes)
def test_new_zeros_and_new_ones(dtype):
    """Tensor.new_zeros / Tensor.new_ones called on the model input."""

    class Model(torch.nn.Module):
        def forward(self, z):
            zeros = z.new_zeros(3, 5, 1)
            ones = z.new_ones(3, 5, 1)

            product_out = (zeros * ones) + z
            sum_out = (ones + zeros) + z
            return product_out, sum_out

    zeros_and_ones_harness(Model(), dtype, is_like=False)


@pytest.mark.parametrize("dtype", zeros_and_ones_dtypes)
def test_zeros_like_and_ones_like(dtype):
    """torch.zeros_like / torch.ones_like of a second input, with an
    explicit dtype."""

    class Model(torch.nn.Module):
        def forward(self, z, t):
            zeros = torch.zeros_like(t, dtype=dtype)
            ones = torch.ones_like(t, dtype=dtype)

            product_out = (zeros * ones) + z
            sum_out = (ones + zeros) + z
            return product_out, sum_out

    zeros_and_ones_harness(Model(), dtype, is_like=True)


def fuzzy_compare_exceptions(e_cpu, e_ipu):
    """Fuzzily check that the IPU error message contains the CPU one.

    Only the first line of the CPU message is considered (the following
    lines are usually a stacktrace). Both messages are split on
    whitespace: every CPU word must appear in the IPU message, and its
    last position there must not be before its position in the CPU line.

    Prints a confirmation when the messages match.

    :raises ValueError: if the messages don't match; chained from e_ipu.
    """
    # Last position of every word of the IPU message.
    last_ipu_position = {}
    for position, word in enumerate(str(e_ipu).split()):
        last_ipu_position[word] = position

    cpu_msg = str(e_cpu).split("\n")[0]
    for position, word in enumerate(cpu_msg.split()):
        # Missing words get -1 which is always before 'position'.
        if last_ipu_position.get(word, -1) < position:
            raise ValueError("CPU and IPU error messages did not match: "
                             f"'{cpu_msg}' not in '{e_ipu}'") from e_ipu

    print(f"CPU and IPU error messages did match: '{cpu_msg}' in '{e_ipu}'")


def op_harness(op,
               *inputs,
               test_training=True,
               assert_fn=None,
               out_fn=None,
               native_out=None,
               fuzzy_errors=False,
               allow_native_errors=True):
    """The op harness allows to test the native torch API against poptorch.

    This function wraps an operation into a model and allows training and
    inference comparisons between py and poptorch.
    This function returns without errors when tensors are almost equal
    or the IPU and CPU implementation provide the same error messages.

    Args:
        op: Callable under test. In training mode it is wrapped in
            ``helpers.ModelWithWeights``; in inference mode it is installed
            as the ``forward`` of a bare ``torch.nn.Module``.
        *inputs: Inputs forwarded to the model. In training mode the shape
            of ``inputs[0]`` is passed to ``helpers.ModelWithWeights``.
        test_training: If True use ``poptorch.trainingModel`` and check that
            the weights changed, otherwise use ``poptorch.inferenceModel``.
        assert_fn: Optional ``assert_fn(native_out, poptorch_out)`` used to
            compare the results. Defaults to ``helpers.assert_allclose``
            (applied pairwise when the native output is a tuple).
        out_fn: Forwarded to ``helpers.ModelWithWeights`` (training only).
        native_out: Optional precomputed reference result. When given the
            CPU run is skipped. A tuple ``("error", exception)`` marks an
            expected exception.
        fuzzy_errors: If True compare error messages using
            ``fuzzy_compare_exceptions`` instead of an exact match.
        allow_native_errors: If False, an exception raised by the CPU run is
            re-raised instead of being compared to the IPU one.

    Raises:
        RuntimeError: If the reference is an error but the IPU run returned
            without raising.
    """

    def exception_catcher(model, *inputs, can_raise_exception=True):
        # Run the model and return (output, raised). When an exception is
        # caught the output is the tuple ("error", exception).
        __tracebackhide__ = True  # pylint: disable=W0612
        op_raises_exception = False
        try:
            if test_training:
                # Training models return a pair; only the first element is
                # kept.
                native_out, _ = model(*inputs)
            else:
                native_out = model(*inputs)
        except Exception as e:  # pylint: disable=W0703
            if not can_raise_exception:
                raise
            native_out = ("error", e)
            op_raises_exception = True
            assert not poptorch.poptorch_core.isCompilingWithDispatcher(), (
                "[Internal] Clean up failed: dispatcher still active")
        return native_out, op_raises_exception

    if assert_fn is None:

        # Default comparison: element-wise allclose for tuples, plain
        # allclose otherwise.
        def assert_fn(native_out, poptorch_out):
            if isinstance(native_out, tuple):
                for native, pop in zip(native_out, poptorch_out):
                    helpers.assert_allclose(expected=native, actual=pop)
            else:
                helpers.assert_allclose(expected=native_out,
                                        actual=poptorch_out)

    op_raises_exception = False
    options = poptorch.Options()
    if test_training:
        # Set a fixed seed for the weights of the model
        torch.manual_seed(42)
        model = helpers.ModelWithWeights(op, inputs[0].shape, out_fn=out_fn)

        # Run on CPU.
        # Note: the inputs are passed as a single tuple argument here.
        if native_out is None:
            native_out, op_raises_exception = exception_catcher(model, inputs)

            # native_out could be an alias of the input and so modified by
            # the poptorch_model, except if its an error
            if op_raises_exception:
                if not allow_native_errors:
                    raise native_out[1]
            elif isinstance(native_out, tuple):
                # pylint: disable=E1101
                native_out = tuple(n.clone().detach() for n in native_out)
            else:
                native_out = native_out.clone().detach()
        else:
            # A user-provided reference of the form ("error", exc) means an
            # exception is expected.
            op_raises_exception = isinstance(
                native_out, tuple) and native_out[0] == "error"

        # Run on IPU.
        # The IPU run may only raise if the reference raised too.
        poptorch_model = poptorch.trainingModel(model, options=options)
        poptorch_out, ipu_raises = exception_catcher(
            poptorch_model, inputs, can_raise_exception=op_raises_exception)

        # Training test - check weights changed if no error was thrown
        # NOTE(review): the "Weights changed despite errors" assertion below
        # can only fail when op_raises_exception is true, and in that case
        # the except clause swallows it, so it never propagates.
        try:
            poptorch_model.assert_weights_changed()
            assert not op_raises_exception, (
                "Weights changed despite errors being "
                "thrown in IPU evaluation.")
        except AssertionError:
            if not op_raises_exception:
                raise
    else:
        model = torch.nn.Module()
        model.forward = op

        # Run on CPU.
        if native_out is None:
            native_out, op_raises_exception = exception_catcher(model, *inputs)
            if op_raises_exception and not allow_native_errors:
                raise native_out[1]
        else:
            # A user-provided reference of the form ("error", exc) means an
            # exception is expected.
            op_raises_exception = isinstance(
                native_out, tuple) and native_out[0] == "error"

        poptorch_model = poptorch.inferenceModel(model, options)
        # Run on IPU.
        poptorch_out, ipu_raises = exception_catcher(
            poptorch_model, *inputs, can_raise_exception=op_raises_exception)

    # Compare outputs
    if not ipu_raises and op_raises_exception:
        # The reference failed but the IPU run succeeded.
        _, cpu_error = native_out
        raise RuntimeError("The torch and poptorch API do not match, "
                           "poptorch returned without error while torch failed"
                           f" with {cpu_error}") from cpu_error
    if fuzzy_errors and op_raises_exception:
        fuzzy_compare_exceptions(native_out[1], poptorch_out[1])
    elif op_raises_exception:
        # Exact comparison: the IPU error must have the same type and exactly
        # the same message as the CPU error.
        _, cpu_error = native_out
        _, ipu_error = poptorch_out
        with pytest.raises(type(cpu_error),
                           match="^" + re.escape(f"{cpu_error}") + "$"):
            raise ipu_error
    else:
        assert_fn(native_out, poptorch_out)


class TestOpHarness:
    """Test the exception matching functionality of the op_harness function."""
    # Regex expected in the AssertionError raised when the exact error
    # comparison fails.
    exact_error_check = "Regex pattern.*does not match"
    # Message expected in the ValueError raised by fuzzy_compare_exceptions.
    fuzzy_error_check = "CPU and IPU error messages did not match"
    # Replaced per test instance by the `training` fixture below.
    op_harness = op_harness

    @pytest.fixture(autouse=True, params=[True, False])
    def training(self, request, monkeypatch):
        """Run every test with test_training set to True and to False."""
        monkeypatch.setattr(self, "op_harness",
                            partial(op_harness, test_training=request.param))

    def test_fuzzy_error_mismatch(self):
        """Fuzzy mode rejects an IPU error lacking the reference's word."""
        x = torch.randn(2, 3)

        def op(x):
            raise ValueError("Hi")

        with pytest.raises(ValueError, match=self.fuzzy_error_check):
            self.op_harness(op,
                            x,
                            native_out=("error", ValueError("Hey")),
                            fuzzy_errors=True)

    def test_error_mismatch(self):
        """Exact mode rejects an IPU error with a different message."""
        x = torch.randn(2, 3)

        def op(x):
            raise ValueError("Hi")

        with pytest.raises(AssertionError, match=self.exact_error_check):
            self.op_harness(op, x, native_out=("error", ValueError("Hey")))

    def test_exact_match(self):
        """The same error raised on CPU and IPU is accepted."""
        x = torch.randn(2, 3)

        def op(x):
            raise ValueError("Hi")

        self.op_harness(op, x)

    def test_fuzzy_match(self):
        """Fuzzy mode accepts an IPU error containing the reference's word."""
        x = torch.randn(2, 3)

        def op(x):
            raise ValueError("Hi Hey")

        self.op_harness(op,
                        x,
                        native_out=("error", ValueError("Hey")),
                        fuzzy_errors=True)

    def test_fuzzy_mismatch(self):
        """Fuzzy mode rejects an IPU error missing one of the reference's words."""
        x = torch.randn(2, 3)

        def op(x):
            raise ValueError("Hi")

        with pytest.raises(ValueError, match=self.fuzzy_error_check):
            self.op_harness(op,
                            x,
                            native_out=("error", ValueError("Hey Hi")),
                            fuzzy_errors=True)

    def test_reject_fuzzy_match_without_fuzzy_option(self):
        """Without fuzzy_errors a partial message match is rejected."""
        x = torch.randn(2, 3)

        def op(x):
            raise ValueError("Hi Hey")

        with pytest.raises(AssertionError, match=self.exact_error_check):
            self.op_harness(op, x, native_out=("error", ValueError("Hey")))

    def test_reject_exception_if_not_native(self):
        """An IPU error propagates when the reference is not an error."""
        x = torch.randn(2, 3)
        error = ValueError("Hi Hey")

        def op(x):
            raise error

        with pytest.raises(type(error), match=f"{error}"):
            # Note: (1) is the integer 1, not a tuple.
            self.op_harness(op, x, native_out=(1))

    def test_no_ipu_exception_with_native_exception(self):
        """A RuntimeError is raised when only the reference is an error."""
        x = torch.randn(2, 3)
        error = ValueError("Hi Hey")

        def op(x):
            return torch.roll(x, 1)

        with pytest.raises(RuntimeError, match=f"{error}"):
            self.op_harness(op, x, native_out=("error", error))


# Note: Many of the following operations don't depend on the values of the tensors
# but we still need to fix the random seed for any op with randomly generated values
# so that it's guaranteed that weights change after one training step


@pytest.mark.parametrize("dim", [0, 1])
@pytest.mark.parametrize(
    "dtypes", [
        [torch.float] * 3,
        [torch.int] * 3,
        [torch.int, torch.float],
        [torch.float, torch.int],
    ],
    ids=["all_floats", "all_ints", "int,float", "float,int"])
def test_cat(dim, dtypes):
    """Check torch.cat along `dim` for several dtype combinations."""
    torch.manual_seed(42)
    # The first tensor always stays torch.float32: it has to be a valid
    # input to the Linear layer used in op_harness, so its type cannot be
    # parametrized.
    first_input = torch.randn(2, 3)
    tensors = []
    for dtype in dtypes:
        tensors.append(torch.randn(2, 3).to(dtype=dtype))

    def op(*xs):
        return torch.cat(xs, dim=dim)

    op_harness(op, first_input, *tensors, allow_native_errors=False)


@pytest.mark.parametrize("dim", [0, 1])
def test_cat_transpose(dim):
    """Check torch.cat of an int and a float tensor followed by transpose.

    This combination of ops without ImplicitCasting causes the code
    to crash out.
    """
    torch.manual_seed(42)
    floatTensor = torch.randn(2, 3).to(dtype=torch.float)
    intTensor = torch.randn(2, 3).to(dtype=torch.int)

    def op(floatTensor, intTensor):
        joined = torch.cat((intTensor, floatTensor), dim=dim)
        return joined.transpose(1, 0)

    op_harness(op, floatTensor, intTensor, allow_native_errors=False)


@pytest.mark.parametrize("dim_size", [11, 12, 13])
def test_chunk(dim_size):
    """Check torch.chunk into 6 chunks of a 1D tensor of size `dim_size`."""
    torch.manual_seed(42)
    data = torch.randn(dim_size)

    def op(x):
        return torch.chunk(x, 6)

    def first_chunk(x):
        return x[0]

    op_harness(op, data, out_fn=first_chunk)


def test_cat_chunk_slice():
    """Check cat -> chunk (2 chunks) -> slice in inference mode."""

    def forward(x, mems):
        start = 8
        joined = torch.cat([mems, x], 0)
        first_half, _ = torch.chunk(joined, 2, dim=2)
        return first_half[start:]

    mems = torch.randn(1600, 1, 10, 10, 5)
    x = torch.randn(8, 1, 10, 10, 5)

    op_harness(forward, x, mems, test_training=False)


def test_cat_chunk_slice_multiple_slices():
    """Check cat -> chunk (5 chunks) -> slice of the middle chunk."""

    def forward(x, mems):
        start = 8
        joined = torch.cat([mems, x], 0)
        _, _, middle, _, _ = torch.chunk(joined, 5, dim=2)
        return middle[start:]

    mems = torch.randn(1600, 1, 10, 10, 5)
    x = torch.randn(8, 1, 10, 10, 5)

    op_harness(forward, x, mems, test_training=False)


def fast_gather_last_dim(data, idx):
    """Gather along the last dimension of `data` using `idx`.

    When running on the IPU the "FastGatherLastDim" custom op is used,
    otherwise torch.gather is used along dimension -1.
    """
    assert poptorch.ipuHardwareIsAvailable(), \
           "Hardware IPU needed to compile this FastGatherLastDim custom op"
    if not poptorch.isRunningOnIpu():
        return torch.gather(data, -1, idx)

    # Example output for the custom op: shaped like idx, typed like data.
    example_output = torch.zeros(idx.shape).type_as(data)
    example_output.requires_grad_()
    outputs = poptorch.custom_op([data, idx],
                                 "FastGatherLastDim",
                                 "poptorch.custom_ops",
                                 1,
                                 example_outputs=[example_output],
                                 attributes={})
    return outputs[0]


@pytest.mark.ipuHardwareRequired
def test_fastgather_3dim():
    """Check fast_gather_last_dim on 3D inputs, including invalid indices."""
    torch.manual_seed(42)
    shape = (9, 11, 6)
    data = torch.randn(shape)

    # Index with the same shape as the input
    same_shape_idx = torch.randint(0, 6, shape)
    op_harness(fast_gather_last_dim, data, same_shape_idx)

    # Gather index last dim smaller than input last dim
    narrower_idx = torch.randint(0, 6, (9, 11, 3))
    op_harness(fast_gather_last_dim, data, narrower_idx)

    # Gather index different shape should fail
    bad_shape_idx = torch.randint(0, 6, (9, 1, 6))
    with pytest.raises(poptorch.poptorch_core.Error):
        op_harness(fast_gather_last_dim, data, bad_shape_idx)

    # Gather index different rank should fail
    bad_rank_idx = torch.randint(0, 6, (11, 6))
    with pytest.raises(poptorch.poptorch_core.Error):
        op_harness(fast_gather_last_dim, data, bad_rank_idx)


@pytest.mark.parametrize("dim", [0, 1, 2, -1, -2])
@pytest.mark.parametrize("larger_index", [True, False])
def test_gather_3dim(dim, larger_index):
    """Check torch.gather on a 3D tensor with two different index shapes."""
    torch.manual_seed(42)
    shape = (9, 11, 6)
    data = torch.randn(shape)

    def op(x, y):
        return torch.gather(x, dim, y)

    # Index with the same shape as the input
    op_harness(op, data, torch.randint(0, 6, shape))

    # Smaller index, optionally made larger than the input along `dim`
    index_dims = [7, 9, 5]
    if larger_index:
        index_dims[dim] = shape[dim] + 1
    op_harness(op, data, torch.randint(0, 6, tuple(index_dims)))


@pytest.mark.parametrize("dim", [0, 1, 2, 3])
@pytest.mark.parametrize("larger_index", [True, False])
def test_gather_4dim(dim, larger_index):
    """Check torch.gather on a 4D tensor with two different index shapes."""
    torch.manual_seed(42)
    shape = (5, 8, 6, 7)
    data = torch.randn(shape)

    def op(x, y):
        return torch.gather(x, dim, y)

    # Index with the same shape as the input
    op_harness(op, data, torch.randint(0, 5, shape))

    # Smaller index, optionally made larger than the input along `dim`
    index_dims = [4, 5, 2, 6]
    if larger_index:
        index_dims[dim] = shape[dim] + 1
    op_harness(op, data, torch.randint(0, 5, tuple(index_dims)))


@pytest.mark.parametrize("dim", [0, 1, 2, 3, 4])
@pytest.mark.parametrize("larger_index", [True, False])
def test_gather_5dim(dim, larger_index):
    """Check torch.gather on a 5D tensor with two different index shapes."""
    torch.manual_seed(42)
    shape = (3, 3, 3, 3, 3)
    data = torch.randn(shape)

    def op(x, y):
        return torch.gather(x, dim, y)

    # Index with the same shape as the input
    op_harness(op, data, torch.randint(0, 3, shape))

    # Smaller index, optionally made larger than the input along `dim`
    index_dims = [2, 2, 2, 2, 2]
    if larger_index:
        index_dims[dim] = shape[dim] + 1
    op_harness(op, data, torch.randint(0, 3, tuple(index_dims)))


@pytest.mark.parametrize("dim", range(-3, 3))
@pytest.mark.parametrize("reduce", [None, "add", "multiply"])
def test_scatter(dim, reduce):
    """Check Tensor.scatter with a tensor source for each dim / reduce."""
    torch.manual_seed(42)
    side = 3
    shape = (side, ) * 3

    data = torch.randn(shape)
    index = torch.randint(side, shape)
    source = torch.randn(shape)

    def op(inp, idx, src):
        return inp.scatter(dim, idx, src, reduce)

    op_harness(op, data, index, source)


@pytest.mark.parametrize("reduce", [None, 'add'])
@pytest.mark.parametrize("value", [1, 1.1])
def test_scatter_value_inplace(reduce, value):
    """Check in-place scatter_ of a scalar value into a zero tensor."""
    torch.manual_seed(42)
    shape = (6, 6)

    data = torch.randn(shape).to(torch.float32)
    index = torch.randint(6, (1, 6)).squeeze()

    def op(inp, idx, reduce, value):
        out = torch.zeros((idx.size(0), 6), dtype=inp.dtype)
        column_idx = idx.unsqueeze(1)
        if reduce is not None:
            out.scatter_(1, column_idx, value, reduce=reduce)
        else:
            out.scatter_(1, column_idx, value)
        return out.mul_(inp)

    op_harness(op, data, index, reduce, value)


@pytest.mark.parametrize("reduce", [None, 'add'])
@pytest.mark.parametrize("value", [1, 1.1])
def test_scatter_value(reduce, value):
    """Check out-of-place torch.scatter of a scalar value into zeros."""
    torch.manual_seed(42)
    shape = (6, 6)

    data = torch.randn(shape).to(torch.float32)
    index = torch.randint(6, (1, 6)).squeeze()

    def op(inp, idx, reduce, value):
        out = torch.zeros((idx.size(0), 6), dtype=inp.dtype)
        column_idx = idx.unsqueeze(1)
        if reduce is not None:
            out = torch.scatter(out, 1, column_idx, value, reduce=reduce)
        else:
            out = torch.scatter(out, 1, column_idx, value)
        return out.mul_(inp)

    op_harness(op, data, index, reduce, value)


@pytest.mark.parametrize("dim", range(-3, 3))
@pytest.mark.parametrize("reduce", [None, "add", "multiply"])
def test_scatter_(dim, reduce):
    """Check in-place Tensor.scatter_ with a tensor source."""
    torch.manual_seed(42)
    side = 3
    shape = (side, ) * 3

    data = torch.randn(shape)
    index = torch.randint(side, shape)
    source = torch.randn(shape)

    def op(inp, idx, src):
        return inp.scatter_(dim, idx, src, reduce)

    op_harness(op, data, index, source)


@pytest.mark.parametrize("dim", range(-3, 3))
@pytest.mark.parametrize("reduce", [None, "add", "multiply"])
def test_scatter_scalar(dim, reduce):
    """Check Tensor.scatter with the scalar source 5.0."""
    torch.manual_seed(42)
    side = 3
    shape = (side, ) * 3

    data = torch.randn(shape)
    index = torch.randint(side, shape)
    source = 5.0

    def op(inp, idx):
        return inp.scatter(dim, idx, source, reduce)

    op_harness(op, data, index)


@pytest.mark.parametrize("reduce", [None, "add", "multiply"])
def test_scatter_different_src_index_shapes(reduce):
    """Check scatter_ when source (2x5) and index (1x4) shapes differ."""
    index = torch.tensor([[0, 1, 2, 0]]).long()
    data = torch.zeros(3, 5, dtype=torch.float32)
    dim = 0

    def op(inp, idx):
        src = torch.arange(1, 11, dtype=torch.float32).reshape((2, 5))
        return inp.scatter_(dim, idx, src, reduce)

    op_harness(op, data, index, test_training=False)


def test_reshape():
    """Check torch.reshape of a 4 element tensor to (1, 1, 2, 2)."""

    def op(x):
        return torch.reshape(x, (1, 1, 2, 2))

    op_harness(op, torch.arange(4.))


def test_constExpr_reshape():
    """Check unsqueeze / expand / reshape applied to a constant mask."""
    a, b, c = 2, 3, 4

    class Model(torch.nn.Module):
        def forward(self, input):
            # A constant is used so that this code runs in the
            # ConstExpr pass
            ones = torch.ones(b, a, device=input.device)
            mask = ones.to(torch.bool).unsqueeze(1)
            # On CPU the expand will be implemented by setting the
            # stride to 0
            expanded = mask.expand([-1, c, -1])
            flat_mask = expanded.reshape([b * c, a])
            return flat_mask * input[0]

    input = torch.randn(1).to(torch.bool)
    out_native = Model()(input)

    ipu_model = poptorch.inferenceModel(Model(), poptorch.Options())
    out_ipu = ipu_model(input)
    helpers.assert_allequal(actual=out_ipu, expected=out_native)


@pytest.mark.parametrize("split_size_or_sections",
                         (1, 5, 6, 20, [10, 10], [19, 1]))
def test_split(split_size_or_sections):
    """Check torch.split with both split sizes and section lists."""
    torch.manual_seed(42)
    data = torch.randn(20, 10)

    def op(x):
        return torch.split(x, split_size_or_sections)

    def first_piece(x):
        return x[0]

    op_harness(op, data, out_fn=first_piece)


def test_split_singleton():
    """Check the first piece of torch.split with size 1 along dim 1."""
    torch.manual_seed(42)
    data = torch.randn(1, 4, 3, 1)

    def op(x):
        pieces = torch.split(x, 1, 1)
        return pieces[0]

    op_harness(op, data)


@pytest.mark.parametrize("inplace", [True, False])
def test_squeeze(inplace):
    """Check torch.squeeze and its in-place variant squeeze_."""
    torch.manual_seed(42)
    data = torch.randn(1, 1, 5, 1, 10, 1)

    def f(t):
        if not inplace:
            return torch.squeeze(t)
        t.squeeze_()
        return t

    op_harness(f, data)


def test_t():
    """Check torch.t on a 20x10 tensor."""
    torch.manual_seed(42)
    matrix = torch.randn(20, 10)

    op_harness(torch.t, matrix)


def test_transpose():
    """Check torch.transpose swapping dimensions 3 and 0."""
    torch.manual_seed(42)
    data = torch.randn(3, 2, 5, 2)

    def op(x):
        return torch.transpose(x, 3, 0)

    op_harness(op, data)


def test_transpose_negative_dims():
    """Check torch.transpose with a negative dimension (inference only)."""
    torch.manual_seed(42)
    lhs = torch.randn(3, 2, 5, 2)
    rhs = torch.randn(2, 2, 5, 3)

    def op(x, y):
        return torch.transpose(x, -1, 0) + y

    op_harness(op, lhs, rhs, test_training=False)


def test_numpy_T():
    """Check the Tensor.T attribute on a 4D and on a 1D tensor."""
    torch.manual_seed(42)

    def op(x):
        return x.T

    # Each tensor is generated right before its harness run.
    for shape in ((3, 2, 5, 4), (5, )):
        op_harness(op, torch.randn(shape))


def test_unsqueeze():
    """Check torch.unsqueeze at dimension 1."""
    torch.manual_seed(42)
    data = torch.randn(3, 2, 5, 2)

    def op(x):
        return torch.unsqueeze(x, 1)

    op_harness(op, data)


def test_broadcast_to():
    """Check torch.broadcast_to from shape (3, 1) to (3, 4)."""
    torch.manual_seed(42)
    data = torch.randn(3, 1)

    def op(x):
        return torch.broadcast_to(x, (3, 4))

    op_harness(op, data)


@pytest.mark.parametrize(
    "shape",
    [
        (2, 4, 4),  # standard
        (2, 2, 2, 4, 4),  # more dimensions
        (2, 4, -1),  # negative dimension
        (2, 2, -1, 2, 4),  # negative & extra dimensions
    ])
def test_expand(shape):
    """Check Tensor.expand of a (2, 1, 4) tensor to `shape`."""
    torch.manual_seed(42)
    data = torch.randn(2, 1, 4)

    def op(x):
        return x.expand(shape)

    op_harness(op, data)


@pytest.mark.parametrize("shape", [(5), (1, 2, 3)])
def test_expand_scalar(shape):
    """Check Tensor.expand of a 0-dim tensor (inference only)."""
    torch.manual_seed(42)
    scalar = torch.randn(())

    def op(x):
        return x.expand(shape)

    op_harness(op, scalar, test_training=False)


def test_expand_as():
    """Check Tensor.expand_as from shape (3, 1) to the shape of a (3, 4)."""
    torch.manual_seed(42)
    data = torch.randn(3, 1)
    like = torch.randn(3, 4)

    def op(x, y):
        return x.expand_as(y)

    op_harness(op, data, like)


def test_flatten():
    """Check torch.flatten on a (3, 1) tensor."""
    torch.manual_seed(42)
    data = torch.randn(3, 1)

    op_harness(torch.flatten, data)


def test_view():
    """Check Tensor.view from shape (30, 5) to (15, 2, 5)."""
    torch.manual_seed(42)
    data = torch.randn(30, 5)

    def op(x):
        return x.view((15, 2, 5))

    op_harness(op, data)


@pytest.mark.parametrize("input_shapes", [(1, ), (2, ), (2, 2), (2, 3, 4)])
def test_size(input_shapes):
    """Check Tensor.size by using it as the argument of a view."""
    ones = torch.ones(*input_shapes)

    # The size is fed to another operation to workaround a pruning error
    def op(x):
        return x.view(x.size())

    op_harness(op, ones)


# Shapes and dtypes shared by the parametrized tests below.
input_shapes = [
    (1, 4, 5),
    (2, ),
    (2, 2),
    (2, 3, 4, 1, 3, 4),
]
dtypes = [
    torch.float,
    torch.float16,
    torch.int32,
]


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
@pytest.mark.parametrize("input_shapes", input_shapes)
@pytest.mark.parametrize("t", dtypes)
def test_fill(capfd, input_shapes, t):
    """Check fill_, full_like, full, new_full and ones_like.

    The outputs are compared for exact equality (including dtype) and the
    captured log must not contain any match for "expand".
    """
    float_test_num = 1.9375

    def op(x):
        value = 42 if x.dtype == torch.int32 else float_test_num
        x = x + 0  # Ensure x is not modified in place
        # Zero is added to every result to avoid pruning the whole graph
        filled = x.fill_(value) + 0
        full_like = torch.full_like(x, value) + 0
        full = torch.full(input_shapes, value, dtype=x.dtype) + 0
        new_full = x.new_full(input_shapes, value, dtype=x.dtype) + 0
        ones_like = torch.ones_like(x) + 0
        return filled, full_like, full, new_full, ones_like

    x = torch.ones(*input_shapes, dtype=t)

    # For float16 a precomputed reference of three tensors is used; since
    # assert_fn zips both results, only the first three outputs are
    # compared in that case.
    native_out = None
    if t == torch.float16:
        native_out = tuple(
            torch.full(input_shapes, float_test_num) for _ in range(3))

    def assert_fn(native_out, poptorch_out):
        for expected, actual in zip(native_out, poptorch_out):
            if t == torch.float16:
                actual = actual.float()

            assert expected.dtype == actual.dtype
            helpers.assert_allequal(expected=expected, actual=actual)

    # Fill is non-differentiable so set test_training=False
    op_harness(op,
               x,
               test_training=False,
               assert_fn=assert_fn,
               native_out=native_out)
    log = helpers.LogChecker(capfd)
    log.assert_no_matches("expand")


def test_triu_in_constexpr():
    """Check both triu variants applied to a constant tensor.

    triu is unsupported but the RHS should be reduced to a constant before
    the op reaches PopART canonicalisation.
    """

    def triu_inplace(x):
        # dispatches to aten::triu
        upper = torch.ones(3, 3).triu_()
        return x + upper

    def triu_out(x):
        # dispatches to aten::triu.out
        upper = torch.triu(torch.ones(3, 3))
        return x + upper

    x = torch.ones(3, 3)
    for fn in (triu_inplace, triu_out):
        op_harness(fn, x, test_training=False)


@pytest.mark.parametrize("input_shapes", [
    ((10, 10), (10, 10), (10, 10)),
    ((10, 1, 10), (10, 10), (10, 10, 1)),
    ((), (), ()),
    ((10, 1, 10), (), (10, 10, 1)),
    ((), (10, 10), ()),
])
def test_where_broadcast(input_shapes):
    """Check torch.where with broadcastable condition / x / y shapes."""
    torch.manual_seed(42)

    cond_shape, x_shape, y_shape = input_shapes

    class Model(torch.nn.Module):
        def forward(self, cond, x, y):
            return torch.where(cond, x, y)

    cond = torch.empty(cond_shape).bernoulli_().to(torch.bool)
    x = torch.randn(x_shape)
    y = torch.randn(y_shape)

    cpu_mod = Model()
    ipu_mod = poptorch.inferenceModel(cpu_mod)

    # The IPU result is computed first, then the CPU reference.
    ipu_out = ipu_mod(cond, x, y)
    cpu_out = cpu_mod(cond, x, y)
    torch.testing.assert_close(actual=ipu_out, expected=cpu_out)


@pytest.mark.parametrize("input_shapes", input_shapes)
@pytest.mark.parametrize("value", [0.666, -4.32, float("Inf"), float("-Inf")])
def test_masked_fill(input_shapes, value):
    """Check masked_fill and torch.where with a scalar fill value."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def forward(self, x):
            filled = x.masked_fill(x > 0.5, value)
            selected = torch.where(x > 0.5, x, torch.tensor(value))
            return filled, selected

    def first_output(x):
        return x[0]

    data = torch.randn(*input_shapes)
    op_harness(Model(), data, out_fn=first_output)


@pytest.mark.parametrize("input_shapes", [(1, ), (2, ), (3, 4), (1, 3, 4)])
@pytest.mark.parametrize("dim", [0, 1, 2])
def test_stack(input_shapes, dim):
    """Check torch.stack of three tensors along `dim`."""
    torch.manual_seed(42)

    # Skip the combinations where dim exceeds the rank of the inputs
    if dim > len(input_shapes):
        pytest.skip()

    def op(*xs):
        return torch.stack(xs, dim=dim)

    tensors = []
    for _ in range(3):
        tensors.append(torch.randn(*input_shapes))

    op_harness(op, *tensors)


@pytest.mark.parametrize("input_shapes", [(1, ), (2, ), (2, 3), (1, 3, 4)])
@pytest.mark.parametrize("dims",
                         [[1], [3], [2, 1], [2, 3], [1, 1, 1], [3, 2, 4]])
def test_repeat(input_shapes, dims):
    """Check Tensor.repeat for the valid shape / dims combinations."""
    if len(dims) < len(input_shapes):
        pytest.skip(
            "Number of dimensions of repeat dims can not be smaller than number"
            " of dimensions of tensor.")

    torch.manual_seed(42)

    def op(x):
        return x.repeat(dims)

    data = torch.randn(*input_shapes)

    op_harness(op, data)


def test_repeat_training_input():
    """Check Tensor.repeat applied directly to a training model input."""

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # Dummy weights for training
            self.lin = torch.nn.Linear(2, 1)

        def forward(self, x):
            repeated = x.repeat(5, 2, 2)
            loss = poptorch.identity_loss(repeated**2, reduction="sum")
            return repeated, loss

    torch.manual_seed(42)

    data = torch.randn((10, 1, 1))

    cpu_model = Model()
    ipu_model = poptorch.trainingModel(cpu_model, options=poptorch.Options())

    expected, _ = cpu_model(data)
    actual, _ = ipu_model(data)

    helpers.assert_allclose(expected=expected, actual=actual)


@pytest.mark.parametrize("input_shapes", [(1, ), (2, ), (2, 3), (1, 3, 4)])
@pytest.mark.parametrize("dtype", [torch.float, torch.int])
def test_clone_one(input_shapes, dtype):
    """Check Tensor.clone for float and integer inputs.

    The result must match the native output both in value and in dtype.
    Training is only tested for the float case.
    """
    torch.manual_seed(42)

    op = lambda x: x.clone()

    # Cast the input to the parametrized dtype: without the cast the
    # integer case would clone a float32 tensor, so the dtype parameter
    # would only toggle training and never exercise an integer clone.
    x = torch.randn(*input_shapes).to(dtype)

    def assert_fn(native_out, poptorch_out):
        for pop, native in zip(poptorch_out, native_out):
            assert native.dtype == pop.dtype
            helpers.assert_allclose(expected=native, actual=pop)

    # Calculating with integers does not produce meaningful gradients
    test_training = dtype is torch.float
    op_harness(op, x, test_training=test_training, assert_fn=assert_fn)


def test_clone_two():
    """Check that a clone is unaffected by later in-place updates."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def forward(self, x, y, z):
            x += y
            snapshot = x.clone()
            x += y
            snapshot += z

            return x, snapshot

    shape = [2, 3]
    dummy_x = torch.randn(shape)
    dummy_y = torch.randn(shape)
    dummy_z = torch.randn(shape)

    model = Model()

    # Fresh copies are passed to each run as the model updates x in place
    expected = model(dummy_x.clone(), dummy_y.clone(), dummy_z.clone())

    ipu_model = poptorch.inferenceModel(model, poptorch.Options())
    actual = ipu_model(dummy_x.clone(), dummy_y.clone(), dummy_z.clone())

    helpers.assert_allclose(expected=expected, actual=actual)


@pytest.mark.parametrize("input_shapes", [(1, ), (2, ), (2, 3), (1, 3, 4)])
@pytest.mark.parametrize("dtype", [torch.float, torch.half, torch.int])
def test_copy_(input_shapes, dtype):
    """Check Tensor.copy_ from a float tensor into a tensor of `dtype`."""
    torch.manual_seed(42)

    def op(x, y):
        return y.copy_(x)

    source = torch.randn(*input_shapes)
    destination = torch.empty_like(source, dtype=dtype)

    def assert_fn(native_out, poptorch_out):
        for actual, expected in zip(poptorch_out, native_out):
            helpers.assert_allclose(expected=expected,
                                    actual=actual,
                                    check_dtype=True)

    # Calculating with integers does not produce meaningful gradients
    op_harness(op,
               source,
               destination,
               test_training=dtype is torch.float,
               assert_fn=assert_fn)


@pytest.mark.parametrize("shifts,dims", [(1, 0), (-1, 0), (10, 1), (-10, 1),
                                         (0, 2), ((1, 1), (0, 1)),
                                         ((1, -1), (1, 2)), ((-3, -4), (0, 2)),
                                         ((1, 2, 3), (0, 1, 2)),
                                         ((-1, -2, -3), (0, 1, 2)), (5, None),
                                         (-3, None), (1, -1), (1, -3), (1, -4),
                                         (1, 3)])
def test_roll(shifts, dims):
    """Check Tensor.roll; error messages are compared fuzzily."""
    torch.manual_seed(0)

    def op(x):
        return x.roll(shifts, dims)

    data = torch.randn((2, 3, 4))
    op_harness(op, data, fuzzy_errors=True)


@pytest.mark.parametrize("dims", [0, 1, -1])
def test_flip(dims):
    """Check Tensor.flip along a single dimension."""
    torch.manual_seed(0)

    def op(x):
        return x.flip(dims)

    data = torch.randn((2, 3))
    op_harness(op, data)


@pytest.mark.parametrize("with_clone", [True, False])
@pytest.mark.parametrize("with_detach", [True, False])
def test_detach_and_clone(with_clone, with_detach):
    """Check that detach stops the first layer from being trained.

    After 100 training steps the first layer's parameters must be unchanged
    when its output was detached and must all have changed otherwise.
    """
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.first_layer = torch.nn.Linear(10, 10)
            self.second_layer = torch.nn.Linear(10, 10)
            self.loss = torch.nn.MSELoss()

        def forward(self, x, target):
            hidden = self.first_layer(x)
            if with_clone:
                hidden = hidden.clone()
            if with_detach:
                hidden = hidden.detach()

            prediction = self.second_layer(hidden)
            return prediction, self.loss(prediction, target)

    model = Model()
    options = poptorch.Options()
    sgd = torch.optim.SGD(model.parameters(), lr=0.01)
    poptorch_model = poptorch.trainingModel(model,
                                            options=options,
                                            optimizer=sgd)

    target = torch.ones(10)
    data = torch.randn(10)

    # Snapshot of the first layer before training
    initial_bias = model.first_layer.bias.clone().data
    initial_weight = model.first_layer.weight.clone().data

    for _ in range(100):
        _, _ = poptorch_model(data, target)

    if not with_detach:
        assert (initial_bias != model.first_layer.bias).all()
        assert (initial_weight != model.first_layer.weight).all()
    else:
        assert (initial_bias == model.first_layer.bias).all()
        assert (initial_weight == model.first_layer.weight).all()


def test_torch_inference_mode():
    """Compile an inference model inside a torch.inference_mode() block."""

    class SimpleModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.fc1 = torch.nn.Linear(10, 10)
            self.loss = torch.nn.MSELoss()

        def forward(self, x):
            hidden = self.fc1(x)
            rows = torch.arange(4, device=hidden.device)
            selected = hidden[rows, :]
            return self.loss(selected, selected)

    cpu_model = SimpleModel()
    ipu_model = poptorch.inferenceModel(cpu_model.train(),
                                        options=poptorch.Options())

    sample = torch.rand(4, 10)

    with torch.inference_mode():
        ipu_model.compile(x=sample)


@helpers.printCapfdOnExit
def test_requires_grad_true(capfd):
    """Check the message logged for an input with requires_grad=True."""
    linear = torch.nn.Linear(1, 1)
    poptorch_model = poptorch.inferenceModel(linear, poptorch.Options())

    grad_input = torch.tensor([0.0], requires_grad=True)
    poptorch_model(grad_input)

    expected_message = (
        "Input tensor has requires_grad=True set. "
        "This tensor will be detached because backward pass via "
        "inputs is not supported.")
    helpers.LogChecker(capfd).assert_contains(expected_message)


@pytest.mark.parametrize("args", [(5, ), (1, ), (5, 10),
                                  (5, 10, 2), (10, 1, -1), (10, 1, -2),
                                  (1, 5, 10), (2.5, ), (2, 10.), (2, 10, 3.4)])
def test_arange(args):
    """Check the values of torch.arange(*args) added to a scalar input."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def forward(self, a):
            sequence = torch.arange(*args)
            return sequence + a

    cpu_model = Model()
    ipu_model = poptorch.inferenceModel(cpu_model, poptorch.Options())

    scalar = torch.randn(())

    expected = cpu_model(scalar)
    actual = ipu_model(scalar)

    helpers.assert_allclose(actual=actual, expected=expected)


@pytest.mark.parametrize("args", [(5, ), (5, 10), (5, 10, 2), (2.5, ),
                                  (2, 10.), (2, 10, 3.4)])
def test_arange_types(args):
    """Check the dtype produced by torch.arange(*args) on the IPU.

    The expected dtype is the CPU one, with int64 mapped to int32 and
    float64 mapped to float32.
    """
    torch.manual_seed(42)

    should_be_float = False
    for arg in args:
        if isinstance(arg, float):
            should_be_float = True
            break

    class Model(torch.nn.Module):
        def forward(self, a):
            res = torch.arange(*args)
            assert res.is_floating_point() == should_be_float
            return torch.index_select(res, 0, a)  # So the graph's not empty

    cpu_model = Model()
    ipu_model = poptorch.inferenceModel(cpu_model, poptorch.Options())

    index = torch.tensor([0])

    cpu_res = cpu_model(index)
    ipu_res = ipu_model(index)

    narrowed = {torch.int64: torch.int32, torch.float64: torch.float32}
    exp_dtype = narrowed.get(cpu_res.dtype, cpu_res.dtype)

    # NOTE: this may depend on torch.get_default_dtype()
    assert ipu_res.dtype == exp_dtype


@pytest.mark.parametrize("input_shape,dim,size,step",
                         [((7, ), 0, 2, 1), ((7, ), 0, 2, 2),
                          ((10, ), 0, 2, 2), ((10, ), 0, 2, 1),
                          ((5, 5), 0, 2, 2), ((5, 5), 1, 2, 2),
                          ((3, 2, 1), 0, 2, 2), ((10, 10, 10), 1, 5, 2)])
def test_unfold(input_shape, dim, size, step):
    """Run ``Tensor.unfold(dim, size, step)`` through ``op_harness``."""
    torch.manual_seed(0)

    def op(x):
        return x.unfold(dim, size, step)

    op_harness(op, torch.randn(input_shape))


@pytest.mark.parametrize("op", [
    lambda input: torch.take_along_dim(input, torch.argmax(input)),
    lambda input: torch.take_along_dim(
        input, torch.argsort(input, dim=1), dim=1),
    lambda input: torch.take_along_dim(
        input, torch.tensor([[2, 0, 1]], dtype=torch.int64), dim=1),
    lambda input: torch.take_along_dim(
        input, torch.tensor([[2, 0, 1, 0]], dtype=torch.int64), dim=1),
])
def test_take_along_dim(op):
    """Run each ``torch.take_along_dim`` variant through ``op_harness``."""
    torch.manual_seed(42)

    # Fixed 2x3 float32 input shared by every parametrized op.
    values = torch.tensor([[10, 30, 20], [60, 40, 50]], dtype=torch.float32)
    op_harness(op, values)


================================================
FILE: tests/test_doc_urls.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import glob
import os
import re
import requests

# Folder whose *.rst files are scanned for links; test_all_links() resolves it
# relative to the directory containing this file.
DOC_FOLDER = "../docs/user_guide"
# Matches "http:" or "https:" followed by every character up to the next
# whitespace or '>'.
URL_PATTERN = re.compile(r"\bhttps?:[^\s>]+")

# URLs which don't exist yet (e.g documentation for a future release) can be
# added to the dictionary of exceptions: each key is a URL prefix and its value
# is the prefix convert_to_internal() substitutes for it.
PRE_RELEASE_URLS = {
}


def get_all_links_from_file(rst_file_name):
    """Return every URL matched by URL_PATTERN in the file, in reading order.

    Known issue: if a link is split over multiple lines, only the first line
    (containing 'http') will be considered matched.
    """
    print(f"Reading {rst_file_name}")

    # Force as extended ASCII to avoid decoding errors:
    # assume all urls are made of 8-bit chars only
    with open(rst_file_name, "r", encoding="latin-1") as rst_file:
        return [
            link for line in rst_file for link in URL_PATTERN.findall(line)
        ]


def convert_to_internal(url):
    """Map ``url`` to its pre-release counterpart when one is registered.

    Returns:
        ``(True, rewritten_url)`` if ``url`` starts with a key of
        PRE_RELEASE_URLS (only the first occurrence of the key is replaced),
        otherwise ``(False, url)``.
    """
    forwarder = next(
        (prefix for prefix in PRE_RELEASE_URLS if url.startswith(prefix)),
        None)

    if forwarder is None:
        return False, url

    print("Will try pre-release URL:")
    return True, url.replace(forwarder, PRE_RELEASE_URLS[forwarder], 1)


def check_url_works(url, timeout=30, max_redirects=10):
    """Send a HEAD request to ``url`` and report client-side (4xx) failures.

    Args:
        url: the URL to probe.
        timeout: seconds to wait for the server; without it the request could
            block forever and the ``Timeout`` handler below would never fire.
        max_redirects: how many chained 302 responses may still be followed.

    Returns:
        ``None`` if the URL is acceptable (or could not be reached), otherwise
        a ``(url, message, code)`` tuple describing the URL that failed. When
        a 302 redirect leads to a failing page, the tuple describes the
        redirect target.

    Raises:
        requests.exceptions.TooManyRedirects: if the chain of 302 responses is
            longer than ``max_redirects`` (could be a broken url).
    """
    # Local import so this function does not rely on the file-level imports.
    from urllib.parse import urljoin  # pylint: disable=import-outside-toplevel

    print(f"Testing {url}")

    try:
        r = requests.head(url, timeout=timeout)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # Allow the test to succeed with intermittent issues.
        # (TooManyRedirects is not caught as could be a broken url.)
        return None

    code = r.status_code
    # Some servers answer with non-standard codes that requests has no name
    # for: fall back to a placeholder instead of raising KeyError.
    names = requests.status_codes._codes.get(code)  # pylint: disable=protected-access
    message = names[0] if names else "unknown"

    print(f"{message} ({code})")

    if code == 302:
        if max_redirects <= 0:
            raise requests.exceptions.TooManyRedirects(
                f"Too many redirects while checking {url}")
        # The Location header may be relative: resolve it against the URL that
        # produced it, and propagate the verdict for the redirect target.
        target = urljoin(url, r.headers["Location"])
        return check_url_works(target, timeout, max_redirects - 1)

    # Allow any non 4xx status code, as other failures could be temporary
    # and break the CI tests.
    if 400 <= code < 500:
        return url, message, code
    print()

    return None


def test_all_links():
    """Check every URL found in the user guide's .rst files.

    A URL that fails is retried through its pre-release counterpart when
    convert_to_internal() knows one; all remaining failures are reported
    together in the assertion message.
    """
    this_dir = os.path.dirname(os.path.abspath(__file__))
    user_guide_path = os.path.realpath(os.path.join(this_dir, DOC_FOLDER))
    failed_urls = []

    for rst_file in glob.glob(f"{user_guide_path}/*.rst"):
        for url in get_all_links_from_file(rst_file):
            failure = check_url_works(url)

            # If URL didn't work, check internal repos for pending release
            if failure is not None:
                is_pre_release, internal_url = convert_to_internal(url)
                if is_pre_release:
                    failure = check_url_works(internal_url)

            if failure is not None:
                failed_url, message, code = failure
                failed_urls.append(f"{failed_url}: {message} ({code})")

            print()

    assert not failed_urls, "\n".join(failed_urls)


================================================
FILE: tests/test_perf_counters.py
================================================
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
import torch
import pytest
import poptorch


class Model(torch.nn.Module):
    """Two-input module returning the matrix product of its inputs."""

    def forward(self, x, y):
        product = x.matmul(y)
        return product


def assert_perf_counter_size(perf, inputs, outputs, steps, outsteps=None):
    """Assert the shape of the performance counters in ``perf``.

    Args:
        perf: mapping with the keys 'input', 'input_complete', 'output' and
            'output_complete', each holding one sequence per tensor.
        inputs: expected number of entries under the two input keys.
        outputs: expected number of entries under the two output keys.
        steps: expected length of every input entry.
        outsteps: expected length of every output entry; defaults to
            ``steps`` when not given. An explicit ``0`` is honoured.

    Raises:
        AssertionError: if any count or length does not match.
    """
    def assert_size(counters, elems, expected_steps):
        assert len(counters) == elems
        for elem in counters:
            assert len(elem) == expected_steps

    # Compare against None: `outsteps or steps` would silently replace a
    # legitimate outsteps=0 with `steps`.
    if outsteps is None:
        outsteps = steps

    assert_size(perf['input'], inputs, steps)
    assert_size(perf['input_complete'], inputs, steps)
    assert_size(perf['output'], outputs, outsteps)
    assert_size(perf['output_complete'], outputs, outsteps)


def assert_latency_values(model):
    """Assert that each latency triple reported by ``model`` is ordered.

    The four latency getters are each expected to return a
    ``(minimum, maximum, average)`` tuple with minimum <= average <= maximum.
    """
    # Query every getter first, then validate, as separate passes.
    latencies = (
        model.getHostIpuLatency(),
        model.getComputeLatency(),
        model.getIpuHostLatency(),
        model.getLatency(),
    )

    for minimum, maximum, average in latencies:
        assert minimum <= average <= maximum


def test_simple():
    """Check the counters after a single inference run with default options."""
    lhs = torch.randn(100, 100)
    rhs = torch.randn(100, 100)
    ipu_model = poptorch.inferenceModel(Model())
    ipu_model(lhs, rhs)

    counters = ipu_model.getPerfCounters()
    assert_perf_counter_size(counters, inputs=2, outputs=1, steps=1)
    assert_latency_values(ipu_model)


def test_steps():
    """Check the counters when running with 10 device iterations."""
    iterations = 10
    lhs = torch.randn(iterations, 100, 100)
    rhs = torch.randn(iterations, 100, 100)

    opts = poptorch.Options().deviceIterations(iterations)
    ipu_model = poptorch.inferenceModel(Model(), opts)
    ipu_model(lhs, rhs)

    counters = ipu_model.getPerfCounters()
    assert_perf_counter_size(counters, inputs=2, outputs=1, steps=iterations)
    assert_latency_values(ipu_model)


@pytest.mark.ipuHardwareRequired
def test_replicas():
    """Check the counters when running with a replication factor of 4."""
    replicas = 4
    lhs = torch.randn(replicas, 100, 100)
    rhs = torch.randn(replicas, 100, 100)

    opts = poptorch.Options().replicationFactor(replicas)
    ipu_model = poptorch.inferenceModel(Model(), opts)
    ipu_model(lhs, rhs)

    counters = ipu_model.getPerfCounters()
    assert_perf_counter_size(counters, inputs=2, outputs=1, steps=replicas)
    assert_latency_values(ipu_model)


@pytest.mark.parametrize("mode, period", [(poptorch.OutputMode.Final, 1),
                                          (poptorch.OutputMode.All, 1),
                                          (poptorch.OutputMode.Sum, 1),
                                          (poptorch.OutputMode.EveryN, 2)])
@pytest.mark.parametrize("steps", [2, 4])
@pytest.mark.parametrize("replicas", [1, 2])
@pytest.mark.ipuHardwareRequired
def test_inference(mode, period, steps, replicas):
    """Check counter sizes in inference for each output mode.

    Inputs are expected once per device iteration and replica; the expected
    number of outputs depends on the output mode.
    """
    cpu_model = Model()
    opts = poptorch.Options()
    opts.outputMode(mode, period)
    opts.deviceIterations(steps)
    opts.replicationFactor(replicas)
    ipu_model = poptorch.inferenceModel(cpu_model, opts)

    torch.manual_seed(42)
    lhs = torch.randn(16, 100, 100)
    rhs = torch.randn(16, 100, 100)
    ipu_model(lhs, rhs)
    counters = ipu_model.getPerfCounters()

    total_steps = steps * replicas
    if mode in (poptorch.OutputMode.Final, poptorch.OutputMode.Sum):
        # A single output per replica.
        expected_outsteps = replicas
    elif mode is poptorch.OutputMode.EveryN:
        expected_outsteps = steps // period * replicas
    else:
        expected_outsteps = total_steps

    assert_perf_counter_size(counters, 2, 1, total_steps, expected_outsteps)
    assert_latency_values(ipu_model)


@pytest.mark.parametrize("mode, period", [(poptorch.OutputMode.Final, 1),
                                          (poptorch.OutputMode.All, 1),
                                          (poptorch.OutputMode.Sum, 1),
                                          (poptorch.OutputMode.EveryN, 2)])
@pytest.mark.parametrize("steps", [2, 4])
@pytest.mark.parametrize("accums", [1, 2])
@pytest.mark.parametrize("replicas", [1, 2])
@pytest.mark.ipuHardwareRequired
def test_training(mode, period, steps, accums, replicas):
    """Check counter sizes in training for each output mode.

    Inputs are expected once per device iteration, gradient accumulation
    step and replica; the expected number of outputs depends on the output
    mode.
    """
    torch.manual_seed(42)
    inputs = torch.randn(16, 100)
    targets = torch.randn(16, 100)

    opts = poptorch.Options()
    opts.outputMode(mode, period)
    opts.deviceIterations(steps)
    opts.Training.gradientAccumulation(accums)
    opts.replicationFactor(replicas)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(100, 100)
            self.loss = torch.nn.MSELoss()

        def forward(self, data, target):
            out = self.linear(data)
            return out, self.loss(out, target)

    training_model = poptorch.trainingModel(Model(), options=opts)

    training_model(inputs, targets)
    counters = training_model.getPerfCounters()

    total_steps = steps * accums * replicas
    if mode in (poptorch.OutputMode.Final, poptorch.OutputMode.Sum):
        # A single output per replica.
        expected_outsteps = replicas
    elif mode is poptorch.OutputMode.EveryN:
        expected_outsteps = steps // period * accums * replicas
    else:
        expected_outsteps = total_steps

    # Two outputs are tracked: the model returns (out, loss).
    assert_perf_counter_size(counters, 2, 2, total_steps, expected_outsteps)
    assert_latency_values(training_model)


def test_synthetic_data():
    """With synthetic data enabled, expect empty counters and zero latency."""
    cpu_model = Model()
    opts = poptorch.Options()
    opts.deviceIterations(16)
    opts.enableSyntheticData(True)
    ipu_model = poptorch.inferenceModel(cpu_model, opts)

    torch.manual_seed(42)
    lhs = torch.randn(16, 100, 100)
    rhs = torch.randn(16, 100, 100)
    ipu_model(lhs, rhs)

    # One entry per tensor is still expected, each with no recorded step.
    assert_perf_counter_size(ipu_model.getPerfCounters(), 2, 1, 0, 0)

    assert ipu_model.getLatency() == (0., 0., 0.)


================================================
FILE: tests/timeout_handler.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
"""
ctest --timeout uses SIGKILL to kill processes after they time out;
unfortunately this prevents Linux from generating a core dump.

So instead this script sends a SIGSEGV to the process group when it times out,
which will create a core dump.

The second part of the script is so that the test appears as "Timeout" in the
ctest results (instead of Aborted): unfortunately there is no way to mark a
test as "Timeout" in ctest, this can only be done if ctest detects the timeout
itself.

In order to achieve this we set TIMEOUT_AFTER_MATCH "1;TEST_TIMEOUT" on all the
tests in ctest: it means ctest will consider a test to have timed out
(and kill it) if it doesn't complete within 1 second of printing the string
TEST_TIMEOUT.
"""

import subprocess
import signal
import sys
import time
import os

# Assuming the ctest --timeout argument is set to the same value: we want this
# one to kick in first, so remove 60 seconds from it.
timeout = int(os.environ.get("POPTORCH_TEST_TIMEOUT", "1000")) - 60
# Run the command passed
# start_new_session is used to create a new process group so that we can send a
# signal to the entire process group when we try to kill the test.
with subprocess.Popen(sys.argv[1:], start_new_session=True) as p:
    try:
        print("Setting timeout to %d seconds" % timeout, flush=True)
        p.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        print("Timeout after %d seconds" % timeout, flush=True)
        process_group = os.getpgid(p.pid)
        # Best effort: print the native and Python backtraces of the hung
        # process before killing it. If gdb cannot be launched we must still
        # go on and signal the process group, otherwise leaving this "with"
        # block would wait forever on the hung child.
        try:
            subprocess.run([  # pylint: disable=subprocess-run-check
                "gdb", "--batch", "--quiet", "-ex", "thread apply all bt",
                "-ex", "thread apply all py-bt", "-ex", "detach", "-ex",
                "quit", "-p",
                str(process_group)
            ])
        except OSError as err:
            print("Could not run gdb to collect backtraces: %s" % err,
                  flush=True)

        # Timeout: send a segmentation fault signal to generate a core dump.
        print("Sending signal to process group %d of process %d" %
              (process_group, p.pid),
              flush=True)
        os.killpg(process_group, signal.SIGSEGV)
        print("Waiting for aborted process...", flush=True)
        # Wait for the process to exit cleanly
        p.wait()
        # Signal to ctest it was a timeout
        print("TEST_TIMEOUT", flush=True)
        # give ctest some time to process the timeout
        time.sleep(60)
        print("ERROR: Shouldn't have reached this point", flush=True)
        # Note: in theory ctest should kill this process 1 second after TEST_TIMEOUT was printed.
    # Propagate the exit status of the wrapped command.
    sys.exit(p.returncode)


================================================
FILE: tests/torch_nn_test.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import os
import sys
import torch
from torch.testing._internal.jit_metaprogramming_utils import get_all_nn_module_tests, get_nn_mod_test_name, get_nn_module_name_from_kwargs
import pytest
import helpers
import poptorch

# Importing jit_metaprogramming_utils changes the default dtype to double:
# set it back to float so the rest of this file works with float32 tensors.
torch.set_default_dtype(torch.float32)

# yapf: disable
# pylint: disable=line-too-long
# Tests which are skipped by test_pytorch_nn for both the float and the half
# runs.
# Keys are the test names (as returned by get_nn_mod_test_name), values are the
# reasons passed to pytest.skip(): either an excerpt of the error observed or
# what looks like a task reference (T-number) - see the TODO comments.
EXPECTED_FAILURES = {
    # TODO(T26651) Popart feature request
    "test_nn_BatchNorm3d_not_affine": "Weights & bias are mandatory in Popart: No input found for input 1 of Op(ai.onnx.BatchNormalization:9, inputs=[Reshape:0], outputs=[]), but input is not optional",

    # TODO(T31811): Circular padding
    "test_nn_Padding12_1dcircular": "outputs_[i]->uses().empty() INTERNAL ASSERT FAILED",
    "test_nn_Padding31_1dcircular": "outputs_[i]->uses().empty() INTERNAL ASSERT FAILED",
    "test_nn_Padding33_1dcircular": "outputs_[i]->uses().empty() INTERNAL ASSERT FAILED",
    "test_nn_Padding1221_2dcircular": "outputs_[i]->uses().empty() INTERNAL ASSERT FAILED",
    "test_nn_Padding2322_2dcircular": "outputs_[i]->uses().empty() INTERNAL ASSERT FAILED",
    "test_nn_Padding3331_2dcircular": "outputs_[i]->uses().empty() INTERNAL ASSERT FAILED",
    "test_nn_Conv1d_circular_stride2_pad2": "margin of error",
    "test_nn_Conv2d_circular_stride2_pad2": "margin of error",

    # TODO(T26652): Popart feature request
    "test_nn_LayerNorm_3d_no_elementwise_affine": "Weights & bias are mandatory in Popart: No input found for input 1 of Op(ai.graphcore.GroupNormalization:1, inputs=[Flatten:0], outputs=[]), but input is not optional",
    "test_nn_LayerNorm_1d_no_elementwise_affine": "Weights & bias are mandatory in Popart: No input found for input 1 of Op(ai.graphcore.GroupNormalization:1, inputs=[Flatten:0], outputs=[]), but input is not optional",
    "test_nn_GroupNorm_1d_no_affine_IN": "Weights & bias are mandatory in Popart: No input found for input 1 of Op(ai.graphcore.GroupNormalization:1, inputs=[Flatten:0], outputs=[]), but input is not optional",
    "test_nn_GroupNorm_1d_no_affine_LN": "Weights & bias are mandatory in Popart: No input found for input 1 of Op(ai.graphcore.GroupNormalization:1, inputs=[Flatten:0], outputs=[]), but input is not optional",
    "test_nn_GroupNorm_2d_no_affine_IN": "Weights & bias are mandatory in Popart: No input found for input 1 of Op(ai.graphcore.GroupNormalization:1, inputs=[Flatten:0], outputs=[]), but input is not optional",
    "test_nn_GroupNorm_2d_no_affine_LN": "Weights & bias are mandatory in Popart: No input found for input 1 of Op(ai.graphcore.GroupNormalization:1, inputs=[Flatten:0], outputs=[]), but input is not optional",

    "test_nn_interpolate_nearest_1d_zero_dim": "Zero-sized tensors are unsupported",
    "test_nn_interpolate_nearest_2d_zero_dim": "Zero-sized tensors are unsupported",
    "test_nn_interpolate_nearest_3d_zero_dim": "Zero-sized tensors are unsupported",

    "test_nn_CrossMapLRN2d": "Broadcasting failed",
    "test_nn_PReLU_1d_multiparam": "Broadcasting failed",
    "test_nn_PReLU_2d_multiparam": "Broadcasting failed",
    "test_nn_PReLU_3d_multiparam": "Broadcasting failed",

    "test_nn_BatchNorm1d_3d_input_not_affine": "No input found for input 1 of ai.onnx.BatchNormalization",
    "test_nn_BatchNorm1d_not_affine": "No input found for input 1 of ai.onnx.BatchNormalization",
    "test_nn_BatchNorm2d_not_affine": "No input found for input 1 of ai.onnx.BatchNormalization",

    "test_nn_GroupNorm_1d_affine": "margin of error",
    "test_nn_LayerNorm_1d_empty_elementwise_affine": "std::out_of_range exception",

    "test_nn_Conv1d_zero_batch": "StepIO did not provide input data for tensor input",
    "test_nn_Conv2d_zero_batch": "StepIO did not provide input data for tensor input",
    "test_nn_Conv3d_zero_batch": "StepIO did not provide input data for tensor input",
    "test_nn_ConvTranspose1d_dilated": "Popart exception format",
    "test_nn_ConvTranspose2d_dilated": "Popart exception format",
    "test_nn_ConvTranspose3d_dilated": "Popart exception format",
    "test_nn_MaxPool2d_3d_input": "Invalid length of strides vector",
    "test_nn_LPPool2d_norm": "Invalid length of padding vector",

    "test_nn_AdaptiveAvgPool1d": "margin of error",
    "test_nn_AdaptiveAvgPool1d_one_output": "margin of error",
    "test_nn_AdaptiveAvgPool2d_single": "margin of error",
    "test_nn_AdaptiveAvgPool2d_tuple": "margin of error",
    "test_nn_AdaptiveAvgPool2d_tuple_none": "margin of error",
    "test_nn_AdaptiveAvgPool2d_alert_nondeterministic": "margin of error",
    "test_nn_AdaptiveAvgPool3d_single": "margin of error",
    "test_nn_AdaptiveAvgPool3d_tuple": "margin of error",
    "test_nn_AdaptiveAvgPool3d_tuple_none": "margin of error",
    "test_nn_AdaptiveAvgPool3d_alert_nondeterministic": "margin of error",

    "test_nn_Conv3d_circular_stride2_pad2": "hangs ? really slow ?",
    "test_nn_Padding122112_3dcircular": "hangs ? really slow ?",
    "test_nn_Padding322112_3dcircular": "hangs ? really slow ?",
    "test_nn_Padding332122_3dcircular": "hangs ? really slow ?",

    "test_nn_softmax_spatial_dtype": "AssertionError: With rtol=0.0001 and atol=1e-05, found 64 element(s) (out of 64) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.662305723875761 (0.706894040107727 vs. 0.04458831623196602), which occurred at index (0, 1, 2, 3).",
    "test_nn_Softmin_multidim": "AssertionError: With rtol=0.0001 and atol=1e-05, found 300 element(s) (out of 300) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.528009258210659 (0.5376919507980347 vs. 0.009682692587375641), which occurred at index (0, 1, 4, 6).",

    "test_nn_GroupNorm_2d_affine": "AssertionError: With rtol=0.0001 and atol=1e-05, found 144 element(s) (out of 144) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 1.3549566268920898 (2.1383447647094727 vs. 0.7833881378173828), which occurred at index (2, 2, 1, 1).",
    "test_nn_AvgPool1d_stride_pad": "AssertionError: With rtol=0.0001 and atol=1e-05, found 12 element(s) (out of 24) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.39088454842567444 (0.39088454842567444 vs. 0.7817690968513489), which occurred at index (0, 2, 3).",
    "test_nn_AvgPool2d_stride_pad": "AssertionError: With rtol=0.0001 and atol=1e-05, found 72 element(s) (out of 96) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.7429442256689072 (0.24764807522296906 vs. 0.9905923008918762), which occurred at index (0, 1, 0, 0).",
    "test_nn_AvgPool2d_divisor": "AssertionError: With rtol=0.0001 and atol=1e-05, found 54 element(s) (out of 54) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 2.755292057991028 (3.673722743988037 vs. 0.9184306859970093), which occurred at index (1, 2, 2, 2).",
    "test_nn_AvgPool2d_divisor_stride": "AssertionError: With rtol=0.0001 and atol=1e-05, found 54 element(s) (out of 54) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 2.236291766166687 (2.981722354888916 vs. 0.745430588722229), which occurred at index (0, 1, 2, 2).",
    "test_nn_AvgPool2d_divisor_stride_pad": "AssertionError: With rtol=0.0001 and atol=1e-05, found 72 element(s) (out of 96) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 2.427279531955719 (3.236372709274292 vs. 0.809093177318573), which occurred at index (0, 0, 1, 2).",
    "test_nn_AvgPool3d_stride_pad": "AssertionError: With rtol=0.0001 and atol=1e-05, found 114 element(s) (out of 162) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.6763140857219696 (0.22543802857398987 vs. 0.9017521142959595), which occurred at index (0, 2, 2, 0, 0).",
    "test_nn_AvgPool3d_stride_pad_gpu_fixedkw_output": "AssertionError: With rtol=0.0001 and atol=1e-05, found 66 element(s) (out of 72) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.3809252828359604 (0.22855515778064728 vs. 0.6094804406166077), which occurred at index (1, 1, 0, 0, 1).",
    "test_nn_AvgPool3d_stride_pad_gpu_general_output": "AssertionError: With rtol=0.0001 and atol=1e-05, found 264 element(s) (out of 270) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.4184434115886688 (0.16373875737190247 vs. 0.5821821689605713), which occurred at index (0, 2, 2, 0, 4).",
    "test_nn_AvgPool3d_stride_pad_gpu_input_nooverlap": "AssertionError: With rtol=0.0001 and atol=1e-05, found 156 element(s) (out of 162) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.8563244119286537 (0.12233205884695053 vs. 0.9786564707756042), which occurred at index (0, 0, 2, 2, 0).",
    "test_nn_AvgPool3d_divisor": "AssertionError: With rtol=0.0001 and atol=1e-05, found 48 element(s) (out of 48) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 5.214784324169159 (5.959753513336182 vs. 0.7449691891670227), which occurred at index (0, 2, 0, 0, 0).",
    "test_nn_AvgPool3d_divisor_stride": "AssertionError: With rtol=0.0001 and atol=1e-05, found 48 element(s) (out of 48) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 4.80545711517334 (5.491950988769531 vs. 0.6864938735961914), which occurred at index (1, 1, 1, 1, 0).",
    "test_nn_AvgPool3d_divisor_stride_pad": "AssertionError: With rtol=0.0001 and atol=1e-05, found 156 element(s) (out of 162) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 4.580634713172913 (5.235011100769043 vs. 0.6543763875961304), which occurred at index (1, 2, 1, 1, 1).",
    "test_nn_AvgPool3d_divisor_stride_pad_gpu_fixedkw_output": "AssertionError: With rtol=0.0001 and atol=1e-05, found 72 element(s) (out of 72) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 33.63082355260849 (34.16464614868164 vs. 0.5338225960731506), which occurred at index (1, 1, 1, 1, 1).",
    "test_nn_AvgPool3d_divisor_stride_pad_gpu_general_output": "AssertionError: With rtol=0.0001 and atol=1e-05, found 270 element(s) (out of 270) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 32.37348711490631 (32.887351989746094 vs. 0.5138648748397827), which occurred at index (1, 1, 1, 1, 2).",
    "test_nn_AvgPool3d_divisor_stride1_pad0_gpu_input": "AssertionError: With rtol=0.0001 and atol=1e-05, found 48 element(s) (out of 48) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 15.688140630722046 (16.29153060913086 vs. 0.6033899784088135), which occurred at index (0, 2, 0, 0, 0).",
    "test_nn_AvgPool3d_divisor_stride_pad_gpu_input_nooverlap": "AssertionError: With rtol=0.0001 and atol=1e-05, found 114 element(s) (out of 162) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 5.383516907691956 (6.152590751647949 vs. 0.7690738439559937), which occurred at index (0, 2, 1, 1, 1).",
    "test_nn_GELU": "AssertionError: With rtol=0.0001 and atol=1e-05, found 7 element(s) (out of 30) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.00013697147369384766 (0.7933593392372131 vs. 0.7932223677635193), which occurred at index (0, 1, 3).",
    "test_nn_GELU_no_batch_dim": "AssertionError: Tensor-likes are not close",
    "test_nn_softmax_spatial_special": "AssertionError: With rtol=0.0001 and atol=1e-05, found 1024 element(s) (out of 1024) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.009421411203220487 (0.012496765702962875 vs. 0.0030753544997423887), which occurred at index (0, 114, 0, 0).",
    "test_nn_softmax_spatial": "AssertionError: With rtol=0.0001 and atol=1e-05, found 64 element(s) (out of 64) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.6575267650187016 (0.7020591497421265 vs. 0.04453238472342491), which occurred at index (1, 0, 1, 0).",
    "test_nn_softmax_functional_dim0": "AssertionError: With rtol=0.0001 and atol=1e-05, found 120 element(s) (out of 120) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.7021725764498115 (0.7146565914154053 vs. 0.012484014965593815), which occurred at index (0, 1, 2, 1).",
    "test_nn_log_softmax_spatial_special": "AssertionError: With rtol=0.0001 and atol=1e-05, found 1024 element(s) (out of 1024) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 1.4219598770141602 (-4.52596378326416 vs. -5.94792366027832), which occurred at index (0, 127, 1, 0).",
    "test_nn_log_softmax_spatial": "AssertionError: With rtol=0.0001 and atol=1e-05, found 64 element(s) (out of 64) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 3.2339431643486023 (-0.6880427002906799 vs. -3.9219858646392822), which occurred at index (1, 0, 2, 0).",
    "test_nn_log_softmax_dim0": "AssertionError: With rtol=0.0001 and atol=1e-05, found 120 element(s) (out of 120) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 4.541183948516846 (-0.645355224609375 vs. -5.186539173126221), which occurred at index (0, 0, 2, 0).",
    "test_nn_Softmax2d": "AssertionError: With rtol=0.0001 and atol=1e-05, found 600 element(s) (out of 600) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.5474691272247583 (0.5498999953269958 vs. 0.002430868102237582), which occurred at index (0, 1, 6, 13).",
    "test_nn_LogSoftmax_multiparam": "AssertionError: With rtol=0.0001 and atol=1e-05, found 600 element(s) (out of 600) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 5.692737579345703 (-1.065810203552246 vs. -6.758547782897949), which occurred at index (0, 2, 7, 2).",
    "test_nn_TransformerEncoderLayer_gelu_activation": "AssertionError: With rtol=0.0001 and atol=1e-05, found 2 element(s) (out of 24) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 6.428360939025879e-05 (-0.4767186641693115 vs. -0.47665438055992126), which occurred at index (1, 2, 0).",
    "test_nn_TransformerDecoderLayer_gelu_activation": "AssertionError: With rtol=0.0001 and atol=1e-05, found 3 element(s) (out of 36) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.00011313706636428833 (-0.07721851021051407 vs. -0.07710537314414978), which occurred at index (0, 0, 3).",

    "test_nn_LPPool1d_norm": "Output dimensions mismatch: assert torch.Size([1, 3, 3]) == torch.Size([1, 3, 6])",
    "test_nn_ConvTranspose1d": "Output dimensions mismatch: assert torch.Size([1, 4, 20]) == torch.Size([1, 4, 19])",
    "test_nn_ConvTranspose1d_no_bias": "Output dimensions mismatch: assert torch.Size([1, 4, 12]) == torch.Size([1, 4, 11])",
    "test_nn_ConvTranspose1d_groups": "Output dimensions mismatch: assert torch.Size([2, 6, 20]) == torch.Size([2, 6, 19])",
    "test_nn_ConvTranspose2d": "Output dimensions mismatch: assert torch.Size([1, 4, 20, 12]) == torch.Size([1, 4, 19, 11])",
    "test_nn_ConvTranspose2d_no_bias": "Output dimensions mismatch: assert torch.Size([1, 4, 12, 20]) == torch.Size([1, 4, 11, 19])",

    "test_nn_BCELoss_weights_no_reduce": "RuntimeError: expected int at position 0, but got: Tensor",
    "test_nn_Bilinear": "TypeError: bilinear(): argument 'input2' (position 2) must be Tensor, not tuple",
    "test_nn_Embedding": "RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.FloatTensor instead (while checking arguments for embedding)",

    "test_nn_BCELoss_weights_no_reduce_scalar": "IndexError: tuple index out of range",
    "test_nn_BCEWithLogitsLoss_no_reduce_scalar": "IndexError: tuple index out of range",
    "test_nn_KLDivLoss_no_reduce_scalar": "IndexError: tuple index out of range",
    "test_nn_KLDivLoss_no_reduce_scalar_log_target": "IndexError: tuple index out of range",
    "test_nn_L1Loss_no_reduce_scalar": "IndexError: tuple index out of range",
    "test_nn_MSELoss_no_reduce_scalar": "IndexError: tuple index out of range",
    "test_nn_SmoothL1Loss_no_reduce_scalar": "IndexError: tuple index out of range",
    "test_nn_MultiLabelMarginLoss_0d_no_reduce": "IndexError: tuple index out of range",
    "test_nn_BCELoss_no_reduce_scalar": "IndexError: tuple index out of range",
    "test_nn_softmax_functional_scalar": "IndexError: tuple index out of range",
    "test_nn_SELU_scalar": "IndexError: tuple index out of range",
    "test_nn_CELU_scalar": "IndexError: tuple index out of range",
    "test_nn_GELU_scalar": "IndexError: tuple index out of range",
    "test_nn_log_softmax_scalar": "IndexError: tuple index out of range",
    "test_nn_Threshold_threshold_value_scalar": "IndexError: tuple index out of range",
    "test_nn_ReLU_scalar": "IndexError: tuple index out of range",
    "test_nn_ReLU6_scalar": "IndexError: tuple index out of range",
    "test_nn_RReLU_with_up_down_scalar": "IndexError: tuple index out of range",
    "test_nn_Hardtanh_scalar": "IndexError: tuple index out of range",
    "test_nn_Sigmoid_scalar": "IndexError: tuple index out of range",
    "test_nn_Tanh_scalar": "IndexError: tuple index out of range",
    "test_nn_Softmax_scalar": "IndexError: tuple index out of range",
    "test_nn_LogSoftmax_multiparam_scalar": "IndexError: tuple index out of range",
    "test_nn_ELU_scalar": "IndexError: tuple index out of range",
    "test_nn_Hardshrink_scalar": "IndexError: tuple index out of range",
    "test_nn_LeakyReLU_with_negval_scalar": "IndexError: tuple index out of range",
    "test_nn_LogSigmoid_scalar": "IndexError: tuple index out of range",
    "test_nn_Softplus_beta_threshold_scalar": "IndexError: tuple index out of range",
    "test_nn_Softshrink_lambda_scalar": "IndexError: tuple index out of range",
    "test_nn_PReLU_scalar": "IndexError: tuple index out of range",
    "test_nn_Softsign_scalar": "IndexError: tuple index out of range",
    "test_nn_Softmin_scalar": "IndexError: tuple index out of range",
    "test_nn_Tanhshrink_scalar": "IndexError: tuple index out of range",
    "test_nn_SiLU_scalar": "IndexError: tuple index out of range",

    # input 1 for ai.graphcore.BatchNormalization
    "test_nn_BatchNorm2d_zero_batch": "ERROR in NormalizationOps.cpp:98: weight->type()->cast<c10::TensorType>() == nullptr Context: PopartCanonicalization processing %10 : Float(0:20, 5:4, 2:2, 2:1) = aten::batch_norm(%input, %4, %5, %11, %12, %13, %14, %15, %16)",
    "test_nn_BatchNorm1d_zero_batch": "RuntimeError: ERROR in NormalizationOps.cpp:98: weight->type()->cast<c10::TensorType>() == nullptr Context: PopartCanonicalization processing %10 : Float(0:45, 5:9, 9:1) = aten::batch_norm(%input, %4, %5, %11, %12, %13, %14, %15, %16)",
    "test_nn_BatchNorm3d_zero_batch": "RuntimeError: ERROR in NormalizationOps.cpp:98: weight->type()->cast<c10::TensorType>() == nullptr Context: PopartCanonicalization processing %10 : Float(0:40, 5:8, 2:4, 2:2, 2:1) = aten::batch_norm(%input, %4, %5, %11, %12, %13, %14, %15, %16)",

    # margin of error
    "test_nn_KLDivLoss_no_reduce": "Unsupported op(s): aten::kl_div aten::kl_div",

    "test_nn_FractionalMaxPool2d_alert_nondeterministic": "T30594",
    "test_nn_FractionalMaxPool2d_ratio": "T30594",
    "test_nn_FractionalMaxPool2d_ratio_no_batch_dim": "T30594",
    "test_nn_FractionalMaxPool2d_ratio_no_batch_dim_no_random_samples": "T30594",
    "test_nn_FractionalMaxPool2d_ratio_return_indices": "T30594",
    "test_nn_FractionalMaxPool2d_size": "T30594",
    "test_nn_FractionalMaxPool2d_size_no_batch_dim": "T30594",
    "test_nn_FractionalMaxPool2d_size_no_batch_dim_no_random_samples": "T30594",
    "test_nn_FractionalMaxPool3d_alert_nondeterministic": "T30594",
    "test_nn_FractionalMaxPool3d_asymsize": "T30594",
    "test_nn_FractionalMaxPool3d_ratio": "T30594",
    "test_nn_FractionalMaxPool3d_ratio_no_batch_dim": "T30594",
    "test_nn_FractionalMaxPool3d_ratio_no_batch_dim_no_random_samples": "T30594",
    "test_nn_FractionalMaxPool3d_ratio_return_indices": "T30594",
    "test_nn_FractionalMaxPool3d_size": "T30594",
    "test_nn_FractionalMaxPool3d_size_no_batch_dim": "T30594",
    "test_nn_FractionalMaxPool3d_size_no_batch_dim_no_random_samples": "T30594",

    "test_nn_BCELoss_no_reduce": "T30603",
    "test_nn_BCEWithLogitsLoss_no_reduce": "T30603",
    "test_nn_NLLLoss_no_reduce_ignore_index": "T30603",
    "test_nn_NLLLoss_no_reduce_weights": "T30603",
    "test_nn_NLLLoss_no_reduce_weights_ignore_index": "T30603",
    "test_nn_NLLLoss_no_reduce_weights_ignore_index_neg": "T30603",
    "test_nn_NLLLoss2d_no_reduce": "T30603",
    "test_nn_NLLLoss2d_no_reduce_weights": "T30603",
    "test_nn_NLLLoss2d_no_reduce_ignore_index": "T30603",
    "test_nn_NLLLossNd_no_reduce": "T30603",
    "test_nn_NLLLossNd_no_reduce_weights": "T30603",
    "test_nn_NLLLossNd_no_reduce_ignore_index": "T30603",
    "test_nn_MultiLabelMarginLoss_index_neg": "T30603",
    "test_nn_MultiLabelMarginLoss_no_reduce": "T30603",
    "test_nn_HingeEmbeddingLoss_no_reduce": "T30603",
    "test_nn_HingeEmbeddingLoss_margin_no_reduce": "T30603",
    "test_nn_MultiMarginLoss_no_reduce": "T30603",
    "test_nn_MultiMarginLoss_1d_no_reduce": "T30603",
    "test_nn_multimarginloss_1d_input_0d_target_no_reduce": "T30603",
    "test_nn_MultiMarginLoss_p_no_reduce": "T30603",
    "test_nn_MultiMarginLoss_margin_no_reduce": "T30603",
    "test_nn_MultiMarginLoss_weights_no_reduce": "T30603",
    "test_nn_MultiLabelMarginLoss_1d_no_reduce": "T30603",
    "test_nn_AdaptiveMaxPool3d_single": "T30564",

    "test_nn_MaxPool1d_return_indices": "Max pool return indices not supported.",
    "test_nn_MaxPool2d_return_indices": "Max pool return indices not supported.",
    "test_nn_MaxPool3d_return_indices": "Max pool return indices not supported.",

    # TODO(T30564): Support adaptive max pool
    "test_nn_AdaptiveMaxPool1d": "T30564",
    "test_nn_AdaptiveMaxPool1d_no_batch_dim": "T30564",
    "test_nn_AdaptiveMaxPool2d_single": "T30564",
    "test_nn_AdaptiveMaxPool2d_tuple": "T30564",
    "test_nn_AdaptiveMaxPool2d_tuple_none": "T30564",
    "test_nn_AdaptiveMaxPool2d_alert_nondeterministic": "T30564",
    "test_nn_AdaptiveMaxPool2d_no_batch_dim": "T30564",
    "test_nn_AdaptiveMaxPool3d_no_batch_dim": "T30564",
    "test_nn_AdaptiveMaxPool3d_tuple": "T30564",
    "test_nn_AdaptiveMaxPool3d_tuple_none": "T30564",
    "test_nn_AdaptiveMaxPool3d_single_nonatomic": "T30564",
    "test_nn_AdaptiveMaxPool3d_tuple_nonatomic": "T30564",

    # Input dims indivisible by output dims, output doesn't match torch
    "test_nn_AdaptiveAvgPool1d_no_batch_dim": "Output differs from torch due to implementation detail",
    "test_nn_AdaptiveAvgPool2d_no_batch_dim": "Output differs from torch due to implementation detail",
    "test_nn_AdaptiveAvgPool3d_no_batch_dim": "Output differs from torch due to implementation detail",

    # torch.complex128 not supported
    "test_nn_L1Loss_no_reduce_complex": "torch.complex128 not supported",
    "test_nn_ReflectionPad1d_complex": "torch.complex128 not supported",
    "test_nn_ReflectionPad2d_complex": "torch.complex128 not supported",
    "test_nn_ReflectionPad3d_complex": "torch.complex128 not supported",
    "test_nn_ReplicationPad1d_complex": "torch.complex128 not supported",
    "test_nn_ReplicationPad2d_complex": "torch.complex128 not supported",
    "test_nn_ConstantPad1d_complex": "torch.complex128 not supported",
    "test_nn_ConstantPad2d_complex": "torch.complex128 not supported",
    "test_nn_ConstantPad3d_complex": "torch.complex128 not supported",
    "test_nn_ReplicationPad3d_complex": "torch.complex128 not supported",
    "test_nn_ZeroPad2d_complex": "torch.complex128 not supported",

    # TODO(T42768): Support aten::_convolution_mode
    "test_nn_Conv1d_pad_valid": "T42768",
    "test_nn_Conv1d_pad_same": "T42768",
    "test_nn_Conv1d_pad_same2": "T42768",
    "test_nn_Conv1d_pad_same_dilated": "T42768",
    "test_nn_Conv2d_pad_valid": "T42768",
    "test_nn_Conv2d_pad_same": "T42768",
    "test_nn_Conv2d_pad_same_dilated": "T42768",
    "test_nn_Conv3d_pad_valid": "T42768",
    "test_nn_Conv3d_pad_same": "T42768",
    "test_nn_Conv3d_pad_same_dilated": "T42768",

    # TODO(T42770): Support torch.nn.HuberLoss
    "test_nn_HuberLoss_delta": "T42770",

    # TODO(T42771): Support torch.nn.PixelUnshuffle
    "test_nn_PixelUnshuffle": "T42771",

    # TODO(T42772): Support torch.nn.Mish
    "test_nn_Mish": "T42772",
    "test_nn_Mish_scalar": "T42772",
    "test_nn_Mish_no_batch_dim": "T42772",

    # TODO(T48781): Support torch.nn.Unflatten
    "test_nn_Unflatten_no_batch_dim": "T48781",

    # TODO(T49021): Support torch.nn.ReflectionPad3d
    "test_nn_ReflectionPad3d": "T49021",
    "test_nn_ReflectionPad3d_no_batch_dim": "T49021",

    # TODO(T49073): Match torch 1.10 implementation
    "test_nn_GroupNorm_1d_affine_large_batch": "T49073",

    # unsupported upsampling modes downstream
    "test_nn_interpolate_linear_1d": "Upsample mode not supported",
    "test_nn_interpolate_linear_tuple_1d": "Upsample mode not supported",
    "test_nn_interpolate_linear_scale_1d": "Upsample mode not supported",
    "test_nn_interpolate_linear_1d_zero_dim": "Upsample mode not supported",
    "test_nn_interpolate_linear_1d_align_corners": "Upsample mode not supported",
    "test_nn_interpolate_linear_scale_1d_align_corners": "Upsample mode not supported",
    "test_nn_interpolate_linear_1d_alert_nondeterministic": "Upsample mode not supported",
    "test_nn_interpolate_bilinear_2d_zero_dim": "Upsample mode not supported",
    "test_nn_interpolate_bilinear_scale_tuple_skewed_2d": "Upsample mode not supported",
    "test_nn_interpolate_bilinear_tuple_2d_align_corners": "Upsample mode not supported",
    "test_nn_interpolate_bilinear_scale_tuple_skewed_2d_align_corners": "Upsample mode not supported",
    "test_nn_interpolate_bilinear_2d_alert_nondeterministic": "Upsample mode not supported",
    "test_nn_interpolate_bicubic_2d": "Upsample mode not supported",
    "test_nn_interpolate_bicubic_2d_zero_dim": "Upsample mode not supported",
    "test_nn_interpolate_bicubic_tuple_2d": "Upsample mode not supported",
    "test_nn_interpolate_bicubic_scale_2d": "Upsample mode not supported",
    "test_nn_interpolate_bicubic_scale_tuple_shared_2d": "Upsample mode not supported",
    "test_nn_interpolate_bicubic_scale_tuple_skewed_2d": "Upsample mode not supported",
    "test_nn_interpolate_bicubic_tuple_2d_align_corners": "Upsample mode not supported",
    "test_nn_interpolate_bicubic_scale_tuple_skewed_2d_align_corners": "Upsample mode not supported",
    "test_nn_interpolate_bicubic_2d_alert_nondeterministic": "Upsample mode not supported",
    "test_nn_interpolate_trilinear_3d": "Upsample mode not supported",
    "test_nn_interpolate_trilinear_3d_zero_dim": "Upsample mode not supported",
    "test_nn_interpolate_trilinear_tuple_3d": "Upsample mode not supported",
    "test_nn_interpolate_trilinear_scale_3d": "Upsample mode not supported",
    "test_nn_interpolate_trilinear_tuple_3d_align_corners": "Upsample mode not supported",
    "test_nn_interpolate_trilinear_scale_3d_align_corners": "Upsample mode not supported",
    "test_nn_interpolate_trilinear_3d_alert_nondeterministic": "Upsample mode not supported",

    "test_nn_EmbeddingBag_sparse": "T27057: sparse gradient support",
    "test_nn_Embedding_sparse": "T27057: sparse gradient support",

    "test_nn_MultiheadAttention": "ai.onnx.Dropout:10 ratio value 0 is not valid",

    "test_nn_EmbeddingBag_sum_padding_idx": "padding_idx not supported",
    "test_nn_EmbeddingBag_mean_padding_idx": "padding_idx not supported",
    "test_nn_EmbeddingBag_max_padding_idx": "padding_idx not supported",
}

# Tests skipped by test_pytorch_nn only for the float run (use_half=False).
# Only consulted when the test is not already listed in EXPECTED_FAILURES.
# Maps the test name to the reason passed to pytest.skip().
FLOAT_EXPECTED_FAILURES = {
    # Tests that fail on float only, e.g. due to OOM on the small IPU model
    "test_nn_LayerNorm_3d_no_affine_large_feature": "Tile 0 receives more data than it has total memory in exchange",
}

# Tests skipped by test_pytorch_nn only for the half run (use_half=True).
# Only consulted when the test is not already listed in EXPECTED_FAILURES.
# Maps the test name to the reason passed to pytest.skip().
HALF_EXPECTED_FAILURES = {
    # T30731 - tests failing with very large error
    "test_nn_BatchNorm1d_affine_simple_average": "AssertionError: With rtol=0.05 and atol=0.0001",
    "test_nn_BatchNorm1d_not_tracking_stats": "Tensor-likes are not close",
    "test_nn_BatchNorm2d_momentum": "AssertionError: With rtol=0.05 and atol=0.0001",
    "test_nn_BatchNorm3d_momentum": "AssertionError: With rtol=0.05 and atol=0.0001",
    "test_nn_BatchNorm1d_3d_input": "AssertionError: With rtol=0.05 and atol=0.0001",
    "test_nn_BatchNorm2d_2d_simple_average": "AssertionError: With rtol=0.05 and atol=0.0001",
    "test_nn_BatchNorm1d_affine": "AssertionError: With rtol=0.05 and atol=0.0001",
    "test_nn_BatchNorm2d": "AssertionError: With rtol=0.05 and atol=0.0001",
    "test_nn_BatchNorm3d": "AssertionError: With rtol=0.05 and atol=0.0001, found 384 element(s) (out of 384) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 312.5105660557747 (0.9894339442253113 vs. 313.5), which occurred at index (1, 2, 2, 1, 1).",
    "test_nn_BatchNorm3d_3d_simple_average": "AssertionError: With rtol=0.05 and atol=0.0001, found 384 element(s) (out of 384) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 30.140776455402374 (0.9842235445976257 vs. 31.125), which occurred at index (1, 0, 2, 2, 3).",
    "test_nn_LSTMCell": "Exception: TEXCPT_INVALID_ADDR",
}

# Per-test (rtol, atol) overrides used by test_pytorch_nn for the float run.
# Tests which are not listed here are compared with (None, None).
FLOAT_PRECISION_EXCEPTIONS = {
    "test_nn_GroupNorm_2d_affine_large_feature": (1e-3, 1e-3),
    "test_nn_GroupNorm_2d_no_affine_large_feature": (1e-3, 1e-3),
}

# Per-test (rtol, atol) overrides used by test_pytorch_nn for the half run.
# Tests which are not listed here are compared with (0.05, 1e-4).
HALF_PRECISION_EXCEPTIONS = {
    "test_nn_BatchNorm2d_not_tracking_stats": (0.05, 1e-3),
    "test_nn_BatchNorm3d_not_tracking_stats": (0.05, 1e-2),
    "test_nn_Conv1d_dilated": (0.05, 1e-3),
    "test_nn_Conv1d_pad2": (0.05, 1e-3),
    "test_nn_Conv2d_depthwise_padded": (0.05, 1e-3), # TODO(T31811)?
    "test_nn_Conv2d_groups": (0.05, 1e-3),
    "test_nn_Conv2d_groups_thnn": (0.05, 1e-3),
    "test_nn_Conv2d_replicate_stride2_pad2": (0.05, 1e-3),
    "test_nn_Conv3d_dilated": (0.05, 1e-3),
    "test_nn_Conv3d_groups": (0.05, 1e-3),
    "test_nn_GroupNorm_2d_affine_large_feature": (0.05, 1e-2),
    "test_nn_GroupNorm_2d_no_affine_large_feature": (0.05, 1e-2),
    "test_nn_InstanceNorm1d": (0.05, 1e-3),
    "test_nn_InstanceNorm1d_no_batch_dim": (0.05, 1e-3),
    "test_nn_InstanceNorm3d_no_batch_dim": (0.05, 1e-3),
    "test_nn_KLDivLoss_with_target_no_reduce": (0.05, 1e-2),
    "test_nn_LayerNorm_1d_elementwise_affine": (0.05, 0.002),
    "test_nn_LayerNorm_3d_elementwise_affine": (0.05, 0.002),
    "test_nn_LayerNorm_3d_no_affine_large_feature": (0.05, 0.002),
    "test_nn_Linear_no_bias": (0.05, 1e-3),
    "test_nn_TransformerDecoderLayer_relu_activation": (0.05, 1e-2),
    "test_nn_Transformer_multilayer_coder": (0.05, 1e-2),
}

# pylint: enable=line-too-long
# yapf: enable

def _build_test_inputs(test):
    """Return the tuple of positional arguments to call the module with.

    Args:
        test: One of the test description dictionaries returned by
            get_all_nn_module_tests(). The keys read are 'input_fn', 'input',
            'input_size', 'extra_args', 'target_size' and 'target_fn'.

    Returns:
        A tuple: the module input(s), followed by the extra arguments and the
        target when the test description provides them.
    """
    if 'input_fn' in test:
        inputs = test['input_fn']()
    elif "input" in test:
        inputs = (test.get("input"), )
    else:
        inputs = (torch.rand(test['input_size'], dtype=torch.float), )

    # 'input_fn' is free to return a single object rather than a tuple: wrap
    # it *before* appending anything, otherwise the "+" below would be applied
    # to that object (e.g. a tensor) instead of concatenating tuples.
    if not isinstance(inputs, tuple):
        inputs = (inputs, )

    if 'extra_args' in test:
        inputs = inputs + tuple(test['extra_args'])
    if 'target_size' in test:
        inputs = inputs + (test['target_size'], )
    elif 'target_fn' in test:
        inputs = inputs + (test['target_fn'](), )

    return inputs


# Maps each test name to the (module in eval mode, tuple of inputs) to run.
all_tests = {}
# Inspired from torch/testing/_internal/jit_metaprogramming_utils.py
for test in get_all_nn_module_tests():
    test_name = get_nn_mod_test_name(**test)

    name = get_nn_module_name_from_kwargs(**test)
    if "constructor_args_fn" in test:
        args = test["constructor_args_fn"]()
    else:
        args = test.get("constructor_args", ())

    # Use the custom constructor if the test provides one, otherwise look up
    # the class in torch.nn by name.
    if "constructor" in test:
        module = test["constructor"](*args)
    else:
        module = getattr(torch.nn, name)(*args)

    module.eval()

    # Test names are used as pytest parameters: they must be unique.
    assert test_name not in all_tests
    all_tests[test_name] = (module, _build_test_inputs(test))


def assert_allclose(native_out, poptorch_out, rtol, atol):
    """Recursively check that the PopTorch outputs match the native ones.

    Args:
        native_out: Reference output: a tensor or a (possibly nested) tuple
            of tensors.
        poptorch_out: Output to validate, with the same structure as
            ``native_out``.
        rtol: Relative tolerance forwarded to ``helpers.assert_allclose``.
        atol: Absolute tolerance forwarded to ``helpers.assert_allclose``.

    Raises:
        AssertionError: If the structures, the sizes or the values differ.
    """
    if isinstance(native_out, tuple):
        assert isinstance(poptorch_out, tuple)
        # Without this check extra PopTorch outputs would go unnoticed and
        # missing ones would surface as an IndexError.
        assert len(native_out) == len(poptorch_out)
        for native_out_t, poptorch_out_t in zip(native_out, poptorch_out):
            assert_allclose(native_out_t, poptorch_out_t, rtol, atol)
        return

    if native_out.size() == tuple():
        # Scalar output: work on a detached copy. This is what
        # torch.tensor(tensor) does, minus the copy-construct warning.
        native_out = native_out.float().detach().clone()

    assert native_out.size() == poptorch_out.size()
    helpers.assert_allclose(expected=native_out.float(),
                            actual=poptorch_out.float(),
                            rtol=rtol,
                            atol=atol)


@pytest.mark.parametrize("test_name", all_tests.keys())
@pytest.mark.parametrize("use_half", [False, True])
def test_pytorch_nn(test_name, use_half):
    """Run one registered nn module test natively and through PopTorch.

    The reference output is always computed natively in float32. The
    PopTorch model then runs in float32, or in float16 when ``use_half`` is
    set, and its output is compared against the reference.
    """

    def convert_floating(values, convert):
        # Only floating point tensors are converted, everything else is
        # passed through untouched.
        return [
            convert(v)
            if isinstance(v, torch.Tensor) and v.is_floating_point() else v
            for v in values
        ]

    # Global expected failures first, then the per-precision ones.
    skip_reason = EXPECTED_FAILURES.get(test_name)
    if skip_reason is None:
        per_precision = (HALF_EXPECTED_FAILURES
                         if use_half else FLOAT_EXPECTED_FAILURES)
        skip_reason = per_precision.get(test_name)
    if skip_reason:
        pytest.skip(skip_reason)

    print(f"Running {test_name}", flush=True)
    model, inputs = all_tests[test_name]
    model = model.float()
    inputs = convert_floating(inputs, lambda t: t.float())

    ref = model(*inputs)

    if use_half:
        model = model.half()
        inputs = convert_floating(inputs, lambda t: t.half())
        tolerances = HALF_PRECISION_EXCEPTIONS.get(test_name, (0.05, 1e-4))
    else:
        tolerances = FLOAT_PRECISION_EXCEPTIONS.get(test_name, (None, None))
    rtol, atol = tolerances

    poptorch_out = poptorch.inferenceModel(model)(*inputs)

    assert_allclose(ref, poptorch_out, rtol, atol)


if __name__ == "__main__":
    # Stand-alone runner: executes the tests named on the command line
    # without pytest. Set HALF=1 in the environment to run them in float16.
    if len(sys.argv) < 2:
        # Explicit check rather than an assert, which "python -O" strips.
        sys.exit(f"Usage {sys.argv[0]} (test_name)+")

    # Disable expected failures (all three tables consulted by
    # test_pytorch_nn, otherwise pytest.skip() would be raised here):
    EXPECTED_FAILURES.clear()
    HALF_EXPECTED_FAILURES.clear()
    FLOAT_EXPECTED_FAILURES.clear()

    use_half = os.environ.get("HALF", "0") == "1"

    if len(sys.argv) == 2:
        # Single test: let any exception propagate with its full traceback.
        test_pytorch_nn(sys.argv[1], use_half)
        sys.exit(0)

    fails = []
    for testname in sys.argv[1:]:
        try:
            test_pytorch_nn(testname, use_half)
        except (RuntimeError, AssertionError, poptorch.Error):
            fails.append(testname)

    if fails:
        print("\nFailed Tests:")
        for fail in fails:
            print("\t" + fail)

    # The exit status is the number of failures, capped so that it cannot
    # wrap around to 0 ("success") when 256 tests fail.
    sys.exit(min(len(fails), 255))


================================================
FILE: tests/torchvision_inference_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import os  # pylint: disable=unused-import
import unittest.mock
import pytest
import torch
import torchvision.models as models
import helpers
import poptorch

# Torchvision models.
# AlexNet
# VGG-11
# VGG-13
# VGG-16
# VGG-19
# VGG-11 with batch normalization
# VGG-13 with batch normalization
# VGG-16 with batch normalization
# VGG-19 with batch normalization
# ResNet-18
# ResNet-34
# ResNet-50
# ResNet-101
# ResNet-152
# SqueezeNet 1.0
# SqueezeNet 1.1
# Densenet-121
# Densenet-169
# Densenet-201
# Densenet-161
# Inception v3
# GoogleNet
# ShuffleNet V2
# MobileNet V2
# ResNeXt-50-32x4d
# ResNeXt-101-32x8d
# Wide ResNet-50-2
# Wide ResNet-101-2
# MNASNet 1.0

# Models here are hopefully representative of their cousins (i.e. we test
# ResNet-18 without testing ResNet-34/50/101/152).
# The others will be tested in hardware benchmark tests.
# Model constructors which test_model actually runs through
# inference_harness.
tested_models = [
    models.resnet18,
    models.resnext50_32x4d,
    models.mnasnet1_0,
    models.mobilenet_v2,
    models.googlenet,
    models.inception_v3,
    # SqueezeNet v1.0 simply has more parameters and a greater computational cost
    models.squeezenet1_1,
]

# Deliberately un-tested models
# They are still part of the test_model parametrization, which skips them.
untested_models = [
    models.vgg11,  # Supported but takes a long time to compile.
    models.shufflenet_v2_x1_0,  # Supported but takes a long time to compile.
    models.densenet121,  # Supported but takes a long time to compile.
    models.wide_resnet50_2,  # Supported but doesn't fit on 1 IPU.
    # Supported on IPU_MODEL but runs into stream limit on IPU.
    models.alexnet,
]


def inference_harness(imagenet_model):
    """Compare native and PopTorch inference for a torchvision model.

    Args:
        imagenet_model: Model constructor accepting a ``pretrained``
            keyword argument (for example ``models.resnet18``).
    """

    def top5(logits):
        # Five highest softmax scores along dimension 1 and their indices.
        return torch.softmax(logits, 1).topk(5)

    torch.manual_seed(42)
    image = torch.randn([1, 3, 224, 224])

    # The input is a dummy one, so untrained weights are good enough.
    net = imagenet_model(pretrained=False)
    net.eval()

    # Reference result computed on the CPU.
    cpu_out = net(image)

    ipu_out = poptorch.inferenceModel(net)(image)

    helpers.assert_allclose(expected=cpu_out,
                            actual=ipu_out,
                            atol=1e-05,
                            rtol=0.1)

    cpu_top = top5(cpu_out)
    ipu_top = top5(ipu_out)

    helpers.assert_allequal(expected=cpu_top.indices, actual=ipu_top.indices)
    helpers.assert_allclose(expected=cpu_top.values, actual=ipu_top.values)


@unittest.mock.patch.dict("os.environ", helpers.disableSmallModel())
@pytest.mark.parametrize("model", tested_models + untested_models)
@pytest.mark.extendedTestingOnly
def test_model(model):
    """Run the inference harness on every model that isn't un-tested."""
    if model not in untested_models:
        inference_harness(model)
        return
    pytest.skip("Model not currently tested")


================================================
FILE: tests/type_support_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import numpy as np
import torch
import torch.nn as nn
import pytest
import helpers
import poptorch

# Tensor dtypes used to parametrize the tests of this file.
MANY_TYPES = (torch.float16, torch.float32, torch.float64, torch.int32,
              torch.int64)

# Output dtypes for which test_many_input_output_types does not check the
# dtype of the returned tensor.
DEMOTED_ON_IPU = (torch.float64, torch.int64)


def get_simple_adder(return_type):
    """Build an inference model adding two tensors, cast to return_type."""

    class SimpleAdder(nn.Module):
        def forward(self, x, y):
            total = x + y
            return total.type(return_type)

    adder = SimpleAdder()
    return poptorch.inferenceModel(adder)


@pytest.mark.parametrize("input_type", MANY_TYPES)
@pytest.mark.parametrize("output_type", MANY_TYPES)
def test_many_input_output_types(input_type, output_type):
    """Add two tensors of the same dtype and cast to every output dtype."""
    adder = get_simple_adder(output_type)
    lhs = torch.tensor([1.0, 25, -1.0, 83], dtype=input_type)
    rhs = torch.tensor([2.0, 35, 1.0, 32.4], dtype=input_type)

    output = adder(lhs, rhs)

    # The dtype is only checked for the types not listed in DEMOTED_ON_IPU.
    if output_type not in DEMOTED_ON_IPU:
        for idx in (0, 1):
            assert output[idx].dtype == output_type

    # The absolute tolerance absorbs the truncation of 32.4 / 115.4 by the
    # integer types.
    reference = torch.tensor([3., 60., 0., 115.4])
    helpers.assert_allclose(actual=output,
                            expected=reference,
                            atol=0.5,
                            rtol=0)


@pytest.mark.parametrize("input_1_type", MANY_TYPES)
@pytest.mark.parametrize("input_2_type", MANY_TYPES)
@pytest.mark.parametrize("output_type", MANY_TYPES)
def test_many_implicit_cast(input_1_type, input_2_type, output_type):
    """Add two tensors of possibly different dtypes, for all output dtypes."""
    adder = get_simple_adder(output_type)
    lhs = torch.tensor([1.0, 25., -1.0, 83.], dtype=input_1_type)
    rhs = torch.tensor([2.0, 35., 1.0, 32.4], dtype=input_2_type)

    result = adder(lhs, rhs)

    reference = torch.tensor([3., 60., 0., 115.4])
    helpers.assert_allclose(actual=result,
                            expected=reference,
                            atol=0.5,
                            rtol=0)


def get_unpack_clamp():
    """Build an inference model clamping the first of two unpacked items."""

    class UnpackClamp(nn.Module):
        def forward(self, x):
            # x must unpack into exactly two items; the second is ignored.
            first, _ = x
            return first.clamp(-1, 1)

    clamp_module = UnpackClamp()
    return poptorch.inferenceModel(clamp_module)


@pytest.mark.parametrize("input_type", MANY_TYPES)
def test_clamp_many_types(input_type):
    """Clamp the first row of a two-row tensor to [-1, 1] for every dtype."""
    rows = torch.tensor([[-2, -1, 0, 1, 2], [0, 0, 0, 0, 0]],
                        dtype=input_type)
    clamped = get_unpack_clamp()(rows)

    reference = np.array([-1, -1, 0, 1, 1])
    np.testing.assert_allclose(clamped.numpy(), reference)


def get_simple_add_two():
    """Build an inference model adding the Python constant 2 to its input."""

    class GetSimpleAddTwo(nn.Module):
        def forward(self, x):
            return x + 2

    add_two = GetSimpleAddTwo()
    return poptorch.inferenceModel(add_two)


@pytest.mark.parametrize("input_type", MANY_TYPES)
def test_add_two_many_types(input_type):
    """Add the Python constant 2 to a tensor of every dtype."""
    add_two = get_simple_add_two()

    values = torch.tensor([1.0, 25., -1.0, 83.], dtype=input_type)
    result = add_two(values)

    reference = torch.tensor([3.0, 27., 1, 85.])
    helpers.assert_allclose(actual=result,
                            expected=reference,
                            atol=0.5,
                            rtol=0)


def get_simple_incrementer(constant_type, return_type):
    """Build an inference model adding a tensor constant 1 to its input.

    The constant is created with dtype ``constant_type`` and the sum is
    cast to ``return_type``.
    """

    class SimpleIncrementer(nn.Module):
        def forward(self, x):
            incremented = x + torch.tensor(1, dtype=constant_type)
            return incremented.type(return_type)

    incrementer = SimpleIncrementer()
    return poptorch.inferenceModel(incrementer)


@pytest.mark.parametrize("input_type", MANY_TYPES)
@pytest.mark.parametrize("constant_type", MANY_TYPES)
@pytest.mark.parametrize("output_type", MANY_TYPES)
def test_many_constant_implicit_cast(input_type, constant_type, output_type):
    """Add a tensor constant 1 of every dtype to an input of every dtype.

    The float16 constant cases are reported as skipped: returning early
    would record them as passed although nothing was checked.
    """
    if constant_type == torch.float16:
        pytest.skip("Will not trace")

    model = get_simple_incrementer(constant_type, output_type)
    t = torch.tensor([1.0, 25., -1.0, 83.], dtype=input_type)

    helpers.assert_allclose(actual=model(t),
                            expected=torch.tensor([2.0, 26., 0, 84.]),
                            atol=0.5,
                            rtol=0)


@pytest.mark.parametrize("input_1_type", MANY_TYPES)
@pytest.mark.parametrize("input_2_type", MANY_TYPES)
def test_many_implicit_cast_greater_than(input_1_type, input_2_type):
    """Check ``x > y`` for every pair of input dtypes."""

    class GreaterThan(nn.Module):
        def forward(self, x, y):
            return x > y

    compare = poptorch.inferenceModel(GreaterThan())

    lhs = torch.tensor([1, -1, 2.0, 550.4], dtype=input_1_type)
    rhs = torch.tensor([2.4, 2, 1.0, 32.4], dtype=input_2_type)
    result = compare(lhs, rhs)

    reference = torch.tensor([False, False, True, True])
    helpers.assert_allequal(actual=result, expected=reference)


@pytest.mark.parametrize("input_type", MANY_TYPES)
def test_many_implicit_cast_greater_than_one(input_type):
    """Check ``x > 1`` (Python constant) for every input dtype."""

    class GreaterThanOne(nn.Module):
        def forward(self, x):
            return x > 1

    compare = poptorch.inferenceModel(GreaterThanOne())

    values = torch.tensor([2.5, -1, 2.0, 550.4], dtype=input_type)
    result = compare(values)

    reference = torch.tensor([True, False, True, True])
    helpers.assert_allequal(actual=result, expected=reference)


@pytest.mark.parametrize("input_1_type", MANY_TYPES)
@pytest.mark.parametrize("input_2_type", MANY_TYPES)
def test_many_implicit_cast_equals(input_1_type, input_2_type):
    """Check ``x == y`` for every pair of input dtypes."""

    class Equals(nn.Module):
        def forward(self, x, y):
            return x == y

    compare = poptorch.inferenceModel(Equals())

    lhs = torch.tensor([1, -1, 2.0, 550.4], dtype=input_1_type)
    rhs = torch.tensor([2.4, 2, 2.0, 550.4], dtype=input_2_type)

    # The last pair (550.4, 550.4) is only expected to compare equal when
    # both dtypes belong to the same group below.
    same_kind_groups = (
        (torch.float16, ),
        (torch.float32, torch.float64),
        (torch.int32, torch.int64),
    )
    depends = any(input_1_type in group and input_2_type in group
                  for group in same_kind_groups)

    reference = torch.tensor([False, False, True, depends])
    helpers.assert_allequal(actual=compare(lhs, rhs), expected=reference)


@pytest.mark.parametrize("input_type", MANY_TYPES)
def test_many_implicit_cast_equals_one(input_type):
    """Check ``x == 1`` (Python constant) for every input dtype."""

    class EqualsOne(nn.Module):
        def forward(self, x):
            return x == 1

    compare = poptorch.inferenceModel(EqualsOne())

    values = torch.tensor([2.5, 1, 2.0, 550.4], dtype=input_type)
    result = compare(values)

    reference = torch.tensor([False, True, False, False])
    helpers.assert_allequal(actual=result, expected=reference)


@pytest.mark.parametrize("input_1_type", MANY_TYPES)
@pytest.mark.parametrize("input_2_type", MANY_TYPES)
def test_many_implicit_cast_less_than(input_1_type, input_2_type):
    """Check ``x < y`` for every pair of input dtypes."""

    class LessThan(nn.Module):
        def forward(self, x, y):
            return x < y

    compare = poptorch.inferenceModel(LessThan())

    lhs = torch.tensor([1, -1, 2.0, 550.4], dtype=input_1_type)
    rhs = torch.tensor([2.4, 2, 1.0, 32.4], dtype=input_2_type)
    result = compare(lhs, rhs)

    reference = torch.tensor([True, True, False, False])
    helpers.assert_allequal(actual=result, expected=reference)


@pytest.mark.parametrize("input_type", MANY_TYPES)
def test_many_implicit_cast_less_than_one(input_type):
    """Check ``x < 1`` (Python constant) for every input dtype."""

    class LessThanOne(nn.Module):
        def forward(self, x):
            return x < 1

    compare = poptorch.inferenceModel(LessThanOne())

    values = torch.tensor([2.5, -1, 2.0, 550.4], dtype=input_type)
    result = compare(values)

    reference = torch.tensor([False, True, False, False])
    helpers.assert_allequal(actual=result, expected=reference)


@pytest.mark.parametrize("input_type", MANY_TYPES)
def test_many_implicit_cast_one_less_than(input_type):
    """Check ``1 < x`` (constant on the left) for every input dtype."""

    class OneLessThan(nn.Module):
        def forward(self, x):
            return 1 < x  # pylint: disable=misplaced-comparison-constant

    compare = poptorch.inferenceModel(OneLessThan())

    values = torch.tensor([2.5, -1, 2.0, 550.4], dtype=input_type)
    result = compare(values)

    reference = torch.tensor([True, False, True, True])
    helpers.assert_allequal(actual=result, expected=reference)


@pytest.mark.parametrize("input_type", [torch.int8, torch.uint8, torch.int16])
def test_small_int(input_type):
    """Feed a small integer tensor to a model converting it to float."""

    class Model(nn.Module):
        def forward(self, x):
            return x.float()

    # 0..99 converted to the desired input type.
    values = torch.arange(100).to(input_type)

    to_float = poptorch.inferenceModel(Model())
    result = to_float(values)

    assert result.dtype == torch.float
    helpers.assert_allequal(actual=result, expected=values.float())


@pytest.mark.parametrize("input_type", [torch.int8, torch.uint8, torch.int16])
def test_small_int_return(input_type):
    """Return a small integer input unchanged alongside a float result."""

    class Model(nn.Module):
        def forward(self, x):
            return x, x.float() + x.float()

    # 0..99 converted to the desired input/output type.
    values = torch.arange(100).to(input_type)

    passthrough = poptorch.inferenceModel(Model())

    # Only the first output (the input handed back) is checked.
    returned, _ = passthrough(values)

    assert returned.dtype == input_type
    helpers.assert_allequal(actual=returned, expected=values)


def test_tuple_and_list_constant():
    """Return constants in a tuple containing a list, without any input."""

    class Model(torch.nn.Module):
        def forward(self):
            const1 = torch.tensor([1., 2.])
            const2 = torch.tensor([3., 4.])

            return torch.tensor(1), const1 + const2, [const1, const2]

    net = Model()
    ipu_net = poptorch.inferenceModel(net)

    # The PopTorch model runs before the native one.
    ipu_result = ipu_net()
    cpu_result = net()
    helpers.assert_allclose(actual=ipu_result, expected=cpu_result)


def test_tuple_and_list_constant_double_nested():
    """Return constants in nested tuples and lists, without any input."""

    class Model(torch.nn.Module):
        def forward(self):
            const1 = torch.tensor([1., 2.])
            const2 = torch.tensor([3., 4.])

            return ([torch.tensor(1)], const1 + const2,
                    ([const1, const2], [const1, const2]), const2)

    net = Model()
    ipu_net = poptorch.inferenceModel(net)

    # The PopTorch model runs before the native one.
    ipu_result = ipu_net()
    cpu_result = net()
    helpers.assert_allclose(actual=ipu_result, expected=cpu_result)


================================================
FILE: tests/weights_writing_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import os
import types
import copy
import tempfile
import unittest.mock

import numpy as np
import pytest
import torch
import torch.optim as optim
import helpers
import poptorch

# pragma pylint: disable=unsubscriptable-object


class ModelWithLoss(torch.nn.Module):
    """10x10 linear layer with an optional dropout and a loss.

    Args:
        loss: Callable applied to ``(output, target)`` when a target is
            given to ``forward``.
        use_dropout: Apply ``torch.nn.Dropout`` to the linear output when
            True, leave it untouched otherwise.
    """

    def __init__(self, loss, use_dropout=False):
        super().__init__()
        self.linear = torch.nn.Linear(10, 10)
        self.loss = loss
        # Without dropout a plain pass-through function is used.
        self.dropout = torch.nn.Dropout() if use_dropout else (lambda x: x)

    def forward(self, data, target=None):
        """Return the output, plus the loss when a target is provided."""
        out = self.dropout(self.linear(data))

        if target is not None:
            return out, self.loss(out, target)
        return out


@pytest.mark.parametrize("use_half", [True, False])
def test_training_and_inference(use_half):
    """Train a model and check a sibling inference model sees the result.

    The training and the inference models wrap the same module: after
    training, inference must predict the label.
    """
    torch.manual_seed(42)

    # 10 Batches of 10.
    data = torch.randn(10, 10)

    # 10 batches of 1: a single random label repeated ten times.
    label = torch.randint(0, 10, [1]).expand([10])
    model = ModelWithLoss(torch.nn.CrossEntropyLoss())

    if use_half:
        model.half()
        data = data.half()

    # Run on IPU batch size 1 * 10 popart batches.
    opts = poptorch.Options().deviceIterations(10)

    training = poptorch.trainingModel(model, options=opts)
    inference = poptorch.inferenceModel(model, options=opts)

    # Run all 10 batches as batchsize 10.
    untrained_out = inference(data)

    # Sanity check we weren't already matching the label.
    assert not torch.equal(torch.argmax(untrained_out.int(), dim=1), label)

    for _ in range(1000):
        _, loss = training(data, label)

        # Each batch should NOT report its own loss. As by default training
        # model should have a "Final" output mode.
        assert loss.dim() == 0

    # Run with trained weights.
    trained_out = inference(data)

    # Check we are now equal with labels.
    predicted = torch.argmax(trained_out.int(), dim=1)
    helpers.assert_allequal(actual=predicted, expected=label)


@pytest.mark.parametrize("use_half", [True, False])
def test_training_inference_parameters(use_half):
    """Train a model, iterate over its parameters, then run inference.

    Same scenario as test_training_and_inference, except that
    model.named_parameters() is iterated between the training and the final
    inference, and that the inference model is created without options.
    """
    torch.manual_seed(42)

    # 10 Batches of 10.
    input = torch.randn(10, 10)

    # 10 batches of 1
    label = torch.randint(0, 10, [1])
    label = label.expand([10])

    model = ModelWithLoss(torch.nn.CrossEntropyLoss())

    if use_half:
        model.half()
        input = input.half()

    # Run on IPU batch size 1 * 10 popart batches.
    opts = poptorch.Options().deviceIterations(10)
    # NOTE(review): this first inference wrapper is replaced two lines below
    # without having been used. Confirm whether wrapping the model before
    # the training model is intentional or a leftover.
    inference = poptorch.inferenceModel(model, opts)
    training = poptorch.trainingModel(model, options=opts)
    inference = poptorch.inferenceModel(model)

    # Run all 10 batches as batchsize 10.
    out = inference(input)

    # Sanity check we weren't already matching the label.
    assert not torch.equal(torch.argmax(out.int(), dim=1), label)

    for _ in range(0, 1000):
        _, loss = training(input, label)

        # Each batch should NOT report its own loss. As by default training model should have a "Final" output mode.
        assert len(loss.size()) == 0

    # This will trigger copyWeightsToHost()
    for _ in model.named_parameters():
        pass

    # Run with trained weights.
    out = inference(input)

    # Check we are now equal with labels.
    helpers.assert_allequal(actual=torch.argmax(out.int(), dim=1),
                            expected=label)


@pytest.mark.parametrize("use_half", [True, False])
def test_access_parameters(use_half):
    """Check the weights seen through the wrapped model follow training.

    The printed linear weights must be unchanged by inference and must
    differ once the training model has run.
    """
    torch.manual_seed(42)

    # 10 Batches of 10.
    data = torch.randn(10, 10)

    # 10 batches of 1: a single random label repeated ten times.
    label = torch.randint(0, 10, [1]).expand([10])

    model = ModelWithLoss(torch.nn.CrossEntropyLoss())

    if use_half:
        model.half()
        data = data.half()

    # Run on IPU batch size 1 * 10 popart batches.
    opts = poptorch.Options().deviceIterations(10)
    poptorch_model = poptorch.trainingModel(model, options=opts)

    weights_before = str(model.linear.weight)
    inference = poptorch.inferenceModel(model)

    # Run all 10 batches as batchsize 10.
    untrained_out = inference(data)

    # Inference alone must not modify the weights.
    assert weights_before == str(model.linear.weight)

    # Sanity check we weren't already matching the label.
    assert not torch.equal(torch.argmax(untrained_out.int(), dim=1), label)

    for _ in range(1000):
        _, loss = poptorch_model(data, label)

        # Each batch should NOT report its own loss. As by default training
        # model should have a "Final" output mode.
        assert loss.dim() == 0

    assert weights_before != str(poptorch_model.model.linear.weight)

    # Run with trained weights.
    trained_out = inference(data)

    # Check we are now equal with labels.
    predicted = torch.argmax(trained_out.int(), dim=1)
    helpers.assert_allequal(actual=predicted, expected=label)


class DummyTrainingModel(torch.nn.Module):
    """
    Small model for training tests: a 3x3 convolution (16 to 4 channels),
    batch normalisation and log-softmax, scored with an NLL loss.
    """

    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(16, 4, (3, 3))
        self.loss = torch.nn.NLLLoss()
        self.batch_norm = torch.nn.BatchNorm2d(4)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, x, target):
        """Return the loss of the model output against ``target``."""
        log_probs = self.softmax(self.batch_norm(self.conv(x)))
        return self.loss(log_probs, target)


def test_torch_save():
    """Check torch.save() / torch.load() of a model wrapped for training."""

    def conv_weight(module):
        return module.state_dict()['conv.weight']

    torch.manual_seed(42)

    # Dummy model and its optimizer.
    net = DummyTrainingModel()
    sgd = optim.SGD(net.parameters(), lr=0.01)

    # Keep a copy of the weights from before the training.
    weights_before = copy.deepcopy(conv_weight(net))

    trainer = poptorch.trainingModel(net, optimizer=sgd)

    # One training iteration on dummy data.
    data = torch.randn(5, 16, 10, 10)
    labels = torch.empty(5, 8, 8, dtype=torch.long).random_(0, 4)
    trainer(data, labels)

    with tempfile.TemporaryDirectory(dir=".") as tmp_dir:
        model_file = os.path.join(tmp_dir, "model.save")
        # Save the model then load it back.
        torch.save(net, model_file)
        reloaded = torch.load(model_file)

    # The reloaded weights must match both the model and the trainingModel.
    assert np.allclose(conv_weight(net), conv_weight(reloaded))
    assert np.allclose(conv_weight(net), conv_weight(trainer))

    # Make sure some training happened: we must not be comparing the
    # original weights of the wrapped model.
    assert not np.allclose(conv_weight(net), weights_before)


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
@pytest.mark.ipuHardwareRequired
def test_seed_precompilation(capfd):
    """Save and load a model which was only compiled for an offline target.

    The logs are inspected to check which transfers happen when the
    checkpoint is written and when it is loaded back.
    """
    net = ModelWithLoss(torch.nn.CrossEntropyLoss(), use_dropout=True)
    sgd = poptorch.optim.SGD(net.parameters(), lr=0.01)

    options = poptorch.Options().randomSeed(42)
    options.useOfflineIpuTarget(poptorch.ipuHardwareVersion())
    trainer = poptorch.trainingModel(net, options=options, optimizer=sgd)

    # 10 Batches of 10.
    data = torch.randn(10, 10)

    # 10 batches of 1: a single random label repeated ten times.
    labels = torch.randint(0, 10, [1]).expand([10])
    trainer.compile(data, labels)

    # Clear the outputs (we only want to parse what's triggered by save()).
    helpers.LogChecker(capfd)

    with tempfile.TemporaryDirectory() as tmp_dir:
        checkpoint = os.path.join(tmp_dir, "checkpoint.pt")
        trainer.save(checkpoint)

        # Creating a checkpoint should trigger copies of the weights,
        # optimizer state, random seed and rng state but as we're using an
        # offline target nothing should happen.
        save_log = helpers.LogChecker(capfd)
        for unexpected in (
                "Reading random seed",
                "Reading RNG state",
                "Implicit copyWeightsToHost()",
                "Writing optimiser state tensors from IPU to host.",
        ):
            save_log.assert_no_matches(unexpected)

        poptorch.load(checkpoint)

        load_log = helpers.LogChecker(capfd)
        load_log.assert_matches("Writing weights from host to IPU memory")
        load_log.assert_matches("Setting random seed to")
        # We haven't run on HW so we don't have a state yet
        load_log.assert_no_matches("Setting RNG state")
        load_log.assert_no_matches(
            "Writing optimiser state tensors from host to IPU memory")


@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("DEBUG")
@pytest.mark.ipuHardwareRequired
def test_save_everything(capfd, caplog):
    """Checkpoint a trained model and compare it with its reloaded copy.

    After every step (save, load, run, detach, run again) the captured logs
    are checked for the transfers expected between the host and the IPU.
    Finally the outputs of the original and of the loaded model are
    compared pairwise.
    """
    # create a dummy model
    model = ModelWithLoss(torch.nn.CrossEntropyLoss(), use_dropout=True)

    # create optimizer
    optimizer = poptorch.optim.SGD(model.parameters(), lr=0.01)

    opts = poptorch.Options().randomSeed(42)
    training_model = poptorch.trainingModel(model,
                                            options=opts,
                                            optimizer=optimizer)
    # 10 Batches of 10.
    input = torch.randn(10, 10)

    # 10 batches of 1
    label = torch.randint(0, 10, [1])
    label = label.expand([10])
    # Reference results: every later run is expected to differ from them.
    first_out, first_loss = training_model(input, label)

    # Clear the outputs (We only want to parse what's triggered by save())
    helpers.LogChecker(capfd)

    # (output, loss) pairs of the original and of the loaded model.
    origin_out = []
    loaded_out = []
    with tempfile.TemporaryDirectory() as d:
        path = os.path.join(d, "checkpoint.pt")
        training_model.save(path)

        # Creating a checkpoint should trigger copies of the weights, optimizer state
        # random seed and rng state.
        log = helpers.LogChecker(capfd)
        log.assert_matches("Reading random seed")
        log.assert_matches("Reading RNG state")
        log.assert_matches("Writing optimiser state tensors from IPU to host.")

        # This message is looked up in caplog rather than in capfd.
        log = helpers.LogChecker(caplog)
        log.assert_matches("Implicit copyWeightsToHost()")

        origin_out.append(training_model(input, label))

        loaded = poptorch.load(path)

        # Loading the checkpoint uploads the whole state to the IPU.
        log = helpers.LogChecker(capfd)
        log.assert_matches("Writing weights from host to IPU memory")
        log.assert_matches("Setting random seed to")
        log.assert_matches("Setting RNG state")
        log.assert_matches(
            "Writing optimiser state tensors from host to IPU memory")

        loaded_out.append(loaded(input, label))
        origin_out.append(training_model(input, label))
        # Everything is loaded: there shouldn't be any transfer
        log = helpers.LogChecker(capfd)
        log.assert_no_matches("Writing weights from host to IPU memory")
        log.assert_no_matches("Implicit copyWeightsToHost()")
        log.assert_no_matches("random seed")
        log.assert_no_matches("RNG state")
        log.assert_no_matches("Writing optimiser state tensors from")

        # Detaching is expected to read the whole state back to the host.
        loaded.detachFromDevice()
        log = helpers.LogChecker(capfd)
        log.assert_matches("Writing weights from IPU to host")
        log.assert_matches("Writing optimiser state tensors from IPU to host")
        log.assert_matches("Reading random seed")
        log.assert_matches("Reading RNG state")
        log.assert_matches("Detached from device")

        # Running again after the detach uploads the state once more.
        loaded_out.append(loaded(input, label))
        log = helpers.LogChecker(capfd)
        log.assert_matches("Writing weights from host to IPU memory")
        log.assert_matches(
            "Writing optimiser state tensors from host to IPU memory")
        log.assert_matches("Setting random seed to")
        log.assert_matches("Setting RNG state")

    # The loaded model must reproduce the original model run for run, and
    # none of those runs may match the very first one.
    for (out, loss), (load_out, load_loss) in zip(origin_out, loaded_out):
        helpers.assert_allclose(expected=out, actual=load_out)
        assert loss == load_loss
        assert not torch.allclose(out, first_out, rtol=1e-02, atol=1e-02)
        assert loss != first_loss


def train_and_check_weight_sharing_ipu_cpu(model, training_model, input,
                                           target, original_parameters):
    """Train on the IPU and check the native model sees the new weights.

    ``training_model`` must expose a ``deviceToHostCounter`` attribute; it
    is compared against the number of implicit copies expected after each
    step and reset here.

    Args:
        model: Native module wrapped by ``training_model``.
        training_model: PopTorch training wrapper around ``model``.
        input: Input passed to both models.
        target: Target passed to the training model.
        original_parameters: ``str()`` of the parameter list taken before
            any training.
    """
    # Make sure the first run doesn't already pass the test.
    first_out, _ = training_model(input, target)
    assert not torch.allclose(first_out, target, rtol=1e-02, atol=1e-02)

    # Train on IPU.
    for _ in range(1000):
        out, _ = training_model(input, target)

    assert training_model.deviceToHostCounter == 0, \
            "No implicit copy needed to train the model"

    # Run natively without copying the weights explicitly and check
    # they've been automatically updated.
    cpu_out = model(input)
    helpers.assert_allclose(expected=cpu_out, actual=out)
    assert training_model.deviceToHostCounter == 1, \
            "1 implicit copy after having trained the model"
    training_model.deviceToHostCounter = 0  # reset counter

    parameters_now = str(list(model.parameters()))
    assert original_parameters != parameters_now
    assert training_model.deviceToHostCounter == 0, \
            "No implicit copy needed to access the parameters after inference"
    parameters_before = parameters_now

    # A second native run must neither copy nor change anything.
    cpu_out = model(input)
    helpers.assert_allclose(expected=cpu_out, actual=out)
    assert training_model.deviceToHostCounter == 0, \
            "No implicit copy needed after inference"

    parameters_now = str(list(model.parameters()))
    assert parameters_before == parameters_now
    assert training_model.deviceToHostCounter == 0, \
            "No implicit copy needed to access the parameters after inference"


def test_weights_sharing_ipu_cpu():
    """Count the implicit weight copies between a training model and the CPU.

    copyWeightsToHost() of the training model is wrapped to count its calls;
    the counter is then checked after training and after several ways of
    accessing the weights of the native model.
    """
    torch.manual_seed(42)

    model = ModelWithLoss(torch.nn.MSELoss())

    training_model = poptorch.trainingModel(model)

    # Instrument copyWeightsToHost() so that every call is counted.
    training_model.deviceToHostCounter = 0
    realMethod = training_model.copyWeightsToHost

    original_parameters = str(list(model.parameters()))

    def deviceToHostWrapper(model):
        # 'model' is the training model here: the wrapper is bound to it as
        # a method below.
        model.deviceToHostCounter += 1
        realMethod()

    training_model.copyWeightsToHost = types.MethodType(
        deviceToHostWrapper, training_model)

    # Same model as above, they will share weights (in 'model') which once training is finished can be copied back.
    target = torch.randn(10)
    input = torch.randn(10)

    train_and_check_weight_sharing_ipu_cpu(model, training_model, input,
                                           target, original_parameters)

    # Train on IPU.
    for _ in range(0, 50):
        out, _ = training_model(input, target)

    # Listing the parameters is expected to trigger exactly one copy.
    current_parameters = str(list(model.parameters()))
    assert training_model.deviceToHostCounter == 1, \
            "1 implicit copy after having trained the model"
    assert original_parameters != current_parameters
    training_model.deviceToHostCounter = 0  # reset counter

    for _ in range(0, 50):
        out, _ = training_model(input, target)

    # Access a parameter directly:
    print(model.linear.weight.data)

    assert training_model.deviceToHostCounter == 1, \
            "1 implicit copy after having trained the model"
    training_model.deviceToHostCounter = 0  # reset counter

    for _ in range(0, 50):
        out, _ = training_model(input, target)

    # Check state_dict works: torch.save(model.state_dict(), "/tmp/model.save")
    model.state_dict()

    assert training_model.deviceToHostCounter == 1, \
            "1 implicit copy after having trained the model"
    training_model.deviceToHostCounter = 0  # reset counter

    for _ in range(0, 50):
        out, _ = training_model(input, target)

    assert training_model.deviceToHostCounter == 0, \
            "No implicit copy needed to train the model"

    # Run without copying the weights and check they've been automatically updated.
    native_out = model(input)
    helpers.assert_allclose(expected=native_out, actual=out)
    assert training_model.deviceToHostCounter == 1, \
            "1 implicit copy after having trained the model"
    training_model.deviceToHostCounter = 0  # reset counter

    # A second native run must not trigger another copy.
    native_out = model(input)
    helpers.assert_allclose(expected=native_out, actual=out)
    assert training_model.deviceToHostCounter == 0, \
            "No implicit copy needed after inference"

    # Check we have trained the "model"
    helpers.assert_allclose(expected=native_out,
                            actual=target,
                            rtol=1e-02,
                            atol=1e-02)


def train_N_times_and_check_copying(N, inference_model, training_model, input,
                                    target):
    """Train for ``N`` steps and check when device-to-host copies happen.

    ``training_model.deviceToHostCounter`` is only read and reset here; the
    caller is expected to have set it up so that it counts weight copies.

    The expected sequence is:
      * no copy while training,
      * exactly one copy on the first inference run after training,
      * no copy on a second, back-to-back inference run.

    Each inference output is also compared with the output of the last
    training step.

    Returns the output of the last inference run.
    """
    for _step in range(N):
        out, _ = training_model(input, target)

    assert training_model.deviceToHostCounter == 0, (
        "No implicit copy needed to train the model")

    # Run inference twice without explicitly copying the weights: only the
    # first run is expected to trigger an implicit copy.
    expectations = (
        (1, "1 implicit copy after having trained the model"),
        (0, "No implicit copy needed after inference"),
    )
    for expected_copies, message in expectations:
        out_inference = inference_model(input)
        helpers.assert_allclose(expected=out, actual=out_inference)
        assert training_model.deviceToHostCounter == expected_copies, message
        if expected_copies:
            training_model.deviceToHostCounter = 0  # reset counter

    return out_inference


def test_weights_sharing_ipus():
    """Train and run inference on two PopTorch models sharing one user model.

    ``copyWeightsToHost`` of the training model is wrapped so that every call
    increments ``deviceToHostCounter``; the counter is then checked by
    ``train_N_times_and_check_copying``.
    """
    torch.manual_seed(42)

    model = ModelWithLoss(torch.nn.MSELoss())
    training_model = poptorch.trainingModel(model)

    # Instrument the training model: count the calls and forward them to the
    # original bound method.
    training_model.deviceToHostCounter = 0
    original_copy = training_model.copyWeightsToHost

    def counting_copy(self):
        self.deviceToHostCounter += 1
        original_copy()

    training_model.copyWeightsToHost = types.MethodType(
        counting_copy, training_model)

    # Same model as above, they will share weights (in 'model') which once
    # training is finished can be copied back.
    inference_model = poptorch.inferenceModel(model)
    target = torch.randn(10)
    input_tensor = torch.randn(10)

    # Make sure neither the untrained inference run nor the first training
    # step already passes the test.
    untrained_out = inference_model(input_tensor)
    assert not torch.allclose(untrained_out, target, rtol=1e-02, atol=1e-02)

    first_step_out, _ = training_model(input_tensor, target)
    assert not torch.allclose(first_step_out, target, rtol=1e-02, atol=1e-02)

    # Two training rounds; only the output of the last one is compared with
    # the target.
    for steps in (1000, 1500):
        out_inference = train_N_times_and_check_copying(
            steps, inference_model, training_model, input_tensor, target)

    helpers.assert_allclose(actual=out_inference,
                            expected=target,
                            rtol=1e-02,
                            atol=1e-02)


def test_implicit_first_time_copy():
    """Check host-trained weights reach the IPU on the first run.

    A ``torch.nn.Linear`` is trained on the host, then wrapped with
    ``poptorch.inferenceModel``. The first IPU run must match both the host
    output and the training target without any explicit weight copy.
    """
    torch.manual_seed(42)

    # Train on host.
    model = torch.nn.Linear(10, 10)
    target = torch.randn(10)
    input = torch.randn(10)

    loss_function = torch.nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    model.eval()
    # Make sure the first run doesn't already pass the test.
    native = model(input)
    assert not torch.allclose(native, target, rtol=1e-02, atol=1e-02)

    model.train()
    for _ in range(2500):
        optimizer.zero_grad()

        # Run model.
        outputs = model(input)

        # Back prop loss. The loss takes (input, target): the prediction goes
        # first. MSE is symmetric so the value and gradients are unchanged.
        loss = loss_function(outputs, target)
        loss.backward()
        optimizer.step()

    # Check the model is now trained
    model.eval()
    native = model(input)
    helpers.assert_allclose(actual=native,
                            expected=target,
                            rtol=1e-02,
                            atol=1e-02)

    # Run on IPU.
    ipuModel = poptorch.inferenceModel(model)
    poptorch_out = ipuModel(input)

    # Check IPU returns same value as native without the weights explicitly
    # being copied.
    helpers.assert_allclose(expected=native, actual=poptorch_out)
    helpers.assert_allclose(actual=poptorch_out,
                            expected=target,
                            rtol=1e-02,
                            atol=1e-02)


def test_implicit_first_time_copy_negative():
    """Check host training after the first IPU run is not seen by the IPU.

    The model is run on the IPU while still untrained, then trained on the
    host. A second IPU run, without recompilation or an explicit weight copy,
    must match neither the trained host output nor the target.
    """
    torch.manual_seed(42)

    # Train on host.
    model = torch.nn.Linear(10, 10)
    target = torch.randn(10)
    input = torch.randn(10)

    loss_function = torch.nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    model.eval()
    # Make sure the first run doesn't already pass the test.
    native = model(input)
    assert not torch.allclose(native, target, rtol=1e-02, atol=1e-02)

    # Run on IPU.
    poptorch_model = poptorch.inferenceModel(model)
    poptorch_out = poptorch_model(input)

    # Weights should be copied so check we are matching host but NOT the
    # target. The second check is made on the IPU output: the host output has
    # already been checked against the target above.
    helpers.assert_allclose(expected=native, actual=poptorch_out)
    assert not torch.allclose(poptorch_out, target, rtol=1e-02, atol=1e-02)

    model.train()
    for _ in range(2500):
        optimizer.zero_grad()

        # Run model.
        outputs = model(input)

        # Back prop loss. The loss takes (input, target): the prediction goes
        # first. MSE is symmetric so the value and gradients are unchanged.
        loss = loss_function(outputs, target)
        loss.backward()
        optimizer.step()

    # Check the model is now trained
    model.eval()
    native = model(input)
    helpers.assert_allclose(actual=native,
                            expected=target,
                            rtol=1e-02,
                            atol=1e-02)

    # Without recompilation or copying the weights check we are matching
    # neither host nor the target.
    poptorch_out = poptorch_model(input)

    # Check IPU *does not* return the same value as native
    assert not torch.allclose(poptorch_out, native)
    assert not torch.allclose(poptorch_out, target, rtol=1e-02, atol=1e-02)


def test_weight_overwrite_trained_weight():
    """Check ``copyWeightsToDevice`` replaces the weights trained on the IPU.

    The model is trained on the IPU until its output matches the target, then
    the host weights are pushed back to the device. With a zero learning rate
    the next step must no longer match the target and must agree with the
    output of the host model.
    """
    torch.manual_seed(42)

    model = ModelWithLoss(torch.nn.MSELoss())
    poptorch_model = poptorch.trainingModel(model)

    target = torch.randn(10)
    input_tensor = torch.randn(10)

    # Make sure the first run doesn't already pass the test.
    first_out, _ = poptorch_model(input_tensor, target)
    assert not torch.allclose(first_out, target, rtol=1e-02, atol=1e-02)

    # Train on IPU.
    for _step in range(2500):
        trained_out, trained_loss = poptorch_model(input_tensor, target)

    # Check we have trained the "model"
    helpers.assert_allclose(actual=trained_out,
                            expected=target,
                            rtol=1e-02,
                            atol=1e-02)

    # Overwrite the trained weights with weights from host.
    poptorch_model.copyWeightsToDevice()

    # Don't train them: a zero learning rate is used for the next step.
    frozen_optimizer = optim.SGD(model.parameters(), lr=0.0)
    poptorch_model.setOptimizer(frozen_optimizer)

    out, loss = poptorch_model(input_tensor, target)
    host_out = model(input_tensor)

    # Check we are no longer trained.
    assert not torch.allclose(out, target, rtol=1e-02, atol=1e-02)
    assert not torch.allclose(loss, trained_loss)

    # The device output must agree with the host model.
    helpers.assert_allclose(expected=host_out, actual=out)


@pytest.mark.parametrize("use_half", [True, False])
def test_access_scalar_parameter(use_half):
    """Check a 0-dimensional parameter can be read back after training.

    The string form of the bias must change after a few training steps and
    must stay the same after an explicit ``copyWeightsToHost``. When
    ``use_half`` is set the model and its inputs are converted to half
    precision first.
    """

    class ExampleModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # Scalar parameter: created from a tensor with an empty shape.
            self.bias = torch.nn.Parameter(torch.zeros(()))

        def forward(self, x):
            incremented = x + 1

            # It is important to make sure the result of the print is used.
            printed = poptorch.ipu_print_tensor(incremented)

            return printed + self.bias

    def custom_loss(output, target):
        # Mean squared error with a scale
        diff = output - target
        scaled = diff * diff * 5
        return poptorch.identity_loss(scaled, reduction="mean")

    class ExampleModelWithCustomLoss(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.model = ExampleModel()

        def forward(self, input, target=None):
            out = self.model(input)
            if target is None:
                return out
            return out, custom_loss(out, target)

    model = ExampleModelWithCustomLoss()
    input_tensor = torch.tensor([1.0, 2.0, 3.0])
    target = torch.tensor([30.0, 40.0, 50.0])
    if use_half:
        model.half()
        input_tensor = input_tensor.half()
        target = target.half()
    poptorch_model = poptorch.trainingModel(model)

    def bias_as_str():
        # Read the bias through the PopTorch wrapper every time.
        return str(poptorch_model.model.model.bias)

    original_bias = bias_as_str()

    for _step in range(10):
        poptorch_model(input=input_tensor, target=target)

    updated_bias = bias_as_str()
    assert original_bias != updated_bias

    poptorch_model.copyWeightsToHost()
    # Bias should already be up to date
    assert updated_bias == bias_as_str()


@pytest.mark.parametrize("reverse_equal_call", [True, False])
def test_copy_on_torch_equal(reverse_equal_call):
    """Check ``torch.equal`` sees the trained weight in either argument order.

    A snapshot of the linear weight is taken before training; after 100
    training steps the model's weight must differ from it, whichever side of
    ``torch.equal`` it is passed on.
    """
    torch.manual_seed(42)

    model = ModelWithLoss(torch.nn.MSELoss())

    sgd = torch.optim.SGD(model.parameters(), lr=0.01)
    poptorch_model = poptorch.trainingModel(model, optimizer=sgd)

    target = torch.ones(10)
    input_tensor = torch.randn(10)

    # Snapshot of the weight before any training step.
    weight_at_start = model.linear.weight.clone().data

    for _step in range(100):
        poptorch_model(input_tensor, target)

    if reverse_equal_call:
        unchanged = torch.equal(model.linear.weight, weight_at_start)
    else:
        unchanged = torch.equal(weight_at_start, model.linear.weight)
    assert not unchanged


def test_copy_after_compile():
    """Check a model compiled with ``compile()`` can then be executed."""
    torch.manual_seed(42)

    model = ModelWithLoss(torch.nn.MSELoss())

    sgd = torch.optim.SGD(model.parameters(), lr=0.01)
    poptorch_model = poptorch.trainingModel(model, optimizer=sgd)

    target = torch.ones(10)
    input_tensor = torch.randn(10)

    # Compile ahead of time, without running the model.
    poptorch_model.compile(input_tensor, target)

    # If we haven't copied the weights, Popart will fire an exception
    # when trying to execute the model.
    poptorch_model(input_tensor, target)


def test_torch_save_unwrapped():
    """Check a saved state dict loads without restoring a PopTorch wrapper.

    The state dict of a model wrapped by both a training and an inference
    model is saved with ``torch.save``; loading it back must never call
    ``poptorch._impl._restoreWrapperIfNecessary``.
    """
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = torch.nn.Conv2d(2, 2, 1, padding=0)
            buffer_init = torch.zeros([2], dtype=torch.float32)
            self.register_buffer("test_buffer", buffer_init)
            param_init = torch.nn.Parameter(torch.empty(10))
            self.register_parameter("test_param", param_init)
            self.loss = torch.nn.L1Loss()

        # NOTE(review): forward() is not called anywhere in this test and
        # the loss is invoked with a single argument - confirm this is
        # intentional.
        def forward(self, inp):
            out = self.conv(inp)
            loss = self.loss(out)
            return out, loss

    model = Model()
    # Only training models instrument the model so we can't use
    # poptorch.inferenceModel on its own. The returned wrappers are
    # deliberately not kept.
    poptorch.trainingModel(model)

    # An inference model sharing its user model with a training model will
    # be instrumented though.
    poptorch.inferenceModel(model)

    with tempfile.TemporaryDirectory() as tmp:
        torch_file = os.path.join(tmp, "torch_saved.pt")
        torch.save(model.state_dict(), torch_file)

        # Ensure the state dictionaries returned by the training and
        # inference models don't contain any PopTorch wrapper: spy on the
        # restore hook while still forwarding to the real implementation.
        impl = poptorch._impl  # pylint: disable=protected-access
        real_restore = impl._restoreWrapperIfNecessary  # pylint: disable=protected-access
        with unittest.mock.patch.object(impl,
                                        "_restoreWrapperIfNecessary",
                                        wraps=real_restore) as restore_fn:
            torch.load(torch_file)
            restore_fn.assert_not_called()


================================================
FILE: version.json
================================================
{"major": "3", "minor": "4", "point": "0"}